def __init__(self, value_index: NodeValueIndex, fixed_value_offset):
    """Create a graph analyzer and keep the value index and value offset."""
    self.analyzer = TypeScriptGraphAnalyzer()
    self.value_index, self.fixed_value_offset = (
        value_index,
        fixed_value_offset,
    )
def __init__(self, expr_gen: ExpressionGenerator):
    """Set up the rule name, expression generator, bookkeeping and analyzer."""
    self.expr_gen = expr_gen
    self.name, self.null_id = "AddMethodCallArguments", "<null>"
    # Starts empty; filled as changes are applied.
    self.applied_positions = dict()
    self.analyzer = TypeScriptGraphAnalyzer()
def __init__(self, expr_gen: ExpressionGenerator):
    """Set up the rule name, expression generator, bookkeeping and analyzer."""
    self.expr_gen = expr_gen
    self.name, self.null_id = "AddExpressionStatement", "<null>"
    # We need to remove modifications in the reverse order they were added.
    # Given a code, the new statement is added either before or after the
    # existing one. E.g.:
    #   stmt1 -> stmt1 expr1
    # or
    #   stmt1 -> expr1 stmt1
    # where stmt1 is the original statement and expr1 is newly generated.
    self.applied_modifications = list()
    self.applied_positions = set()
    self.analyzer = TypeScriptGraphAnalyzer()
def __init__(self, expr_gen: ExpressionGenerator):
    """Set up the rule name, expression generator, bookkeeping and analyzer."""
    self.expr_gen = expr_gen
    self.name, self.null_id = "AddObjExpressionStatement", "<null>"
    # Both containers start empty.
    self.applied_modifications = list()
    self.applied_positions = set()
    self.analyzer = TypeScriptGraphAnalyzer()
def process_ast(tree, idx=0, analyze=False, max_size=3000):
    """
    Parse a JSON tree and flatten it into per-node feature lists.

    Args:
        tree: JSON representation handed to `AstTree.fromJson`.
        idx: value stored under the "id" key of the result.
        analyze: when true, the tree is parsed with a
            `TypeScriptGraphAnalyzer` and the edges it computes are added
            to the result as "<edge_type>_src" / "<edge_type>_tgt" lists.
        max_size: when positive, trees with more nodes than this are
            rejected.

    Returns:
        `(data, tree)` where `data` is the feature dictionary and `tree`
        the parsed `AstTree`, or `(None, None)` for a rejected tree.
    """
    missing = "<null>"
    excluded_targets = [missing]

    graph_analyzer = TypeScriptGraphAnalyzer() if analyze else None
    tree = AstTree.fromJson(
        tree,
        analyzer=graph_analyzer,
        field_names=["target", "gold"],
    )

    if 0 < max_size < len(tree.nodes):
        return None, None

    node_types = [n.type for n in tree.nodes]
    node_values = [str(n.value) if n.value else missing for n in tree.nodes]
    targets = [n.fields.get("target", missing) for n in tree.nodes]
    valid_mask = [0 if label in excluded_targets else 1 for label in targets]
    gold_mask = [1 if "gold" in n.fields else 0 for n in tree.nodes]
    gold_types = [n.fields.get("gold", missing) for n in tree.nodes]
    depths = [n.depth() for n in tree.nodes]
    # The position inside the parent is capped at 16.
    positions = [min(16, n.pos_in_parent()) for n in tree.nodes]

    data = {
        "id": idx,
        "ast_values": node_values,
        "ast_types": node_types,
        "target_full": targets,
        "mask_valid_full": valid_mask,
        "mask_gold": gold_mask,
        "gold_type": gold_types,
        "pos": positions,  # debugging
        "depth": depths,
        # used by type inference
        "dependencies": [],
    }

    if analyze:
        for edge_type, edges in tree.compute_all_edges().items():
            data[edge_type + "_src"] = [edge[0] for edge in edges]
            data[edge_type + "_tgt"] = [edge[1] for edge in edges]

    return data, tree
def tree_to_data(tree: AstTree, sample):
    """
    Convert a (possibly modified) tree back into a feature dictionary.

    Per-node values are taken from `node.fields` when present. Otherwise
    the node's own type/value is used, and target / mask_valid fall back
    to the entries of `sample` for nodes with a non-negative id, or to
    "<null>" / 0 for the remaining nodes.

    The node list of `tree` is rebuilt, the nodes are renumbered and the
    edges are computed with the tree's analyzer (one is created if the
    tree has none).
    """
    if tree.analyzer is None:
        tree.analyzer = TypeScriptGraphAnalyzer()

    # Recompute the nodes since the tree structure was changed by
    # modifications.
    tree.nodes = list(tree.root.forEachNode())

    types, values, depth, target, mask_valid = [], [], [], [], []
    for node in tree.nodes:
        in_sample = node.id >= 0
        default_target = sample.target[node.id] if in_sample else "<null>"
        default_mask = sample.mask_valid[node.id] if in_sample else 0

        types.append(node.fields.get("types", node.type))
        values.append(node.fields.get("values", node.value))
        depth.append(node.depth())
        target.append(node.fields.get("target", default_target))
        mask_valid.append(node.fields.get("mask_valid", default_mask))

    data = {
        "id": "{}_mod".format(sample.id),
        "values": values,
        "types": types,
        "target": target,
        "mask_valid": mask_valid,
        "depth": depth,
        "order": sample.order,
    }

    # Renumbering happens only after the id-based lookups above.
    tree.number_nodes()
    for edge_type, edges in tree.compute_all_edges().items():
        data[edge_type + "_src"] = [edge[0] for edge in edges]
        data[edge_type + "_tgt"] = [edge[1] for edge in edges]
    return data
class AdversarialNodeReplacement:
    """
    Computes `NodeRenameRule`s for a tree: replacement of constants and
    consistent renaming of variables, property declarations and property
    assignments.
    """

    def __init__(self, value_index: NodeValueIndex, fixed_value_offset):
        self.analyzer = TypeScriptGraphAnalyzer()
        self.value_index = value_index
        self.fixed_value_offset = fixed_value_offset

    @staticmethod
    def property_declaration(node):
        """Match the name in a property declaration: `private x = ...`."""
        return (node.type == "Identifier" and node.pos_in_parent() == 1
                and node.has_up()
                and node.up().type == "PropertyDeclaration")

    @staticmethod
    def property_assignment_left(node):
        """Match the left side of a property assignment: `{x: _}`."""
        return (node.has_up() and node.up().type == "PropertyAssignment"
                and node.is_first_child())

    @staticmethod
    def property_access(node):
        """Match the accessed name of a property access: `_.x`."""
        return (node.pos_in_parent() == 2
                and node.left().type == "DotToken" and node.has_up()
                and node.up().type == "PropertyAccessExpression")

    # Node types treated as constants by `is_constant`.
    CONSTANTS = {
        "StringLiteral",
        "TemplateExpression",
        "FirstTemplateToken",  # string
        "TrueKeyword",
        "FalseKeyword",  # boolean
        "FirstLiteralToken",  # numbers
    }

    @staticmethod
    def is_constant(node):
        """Return whether the node type is one of `CONSTANTS`."""
        return node.type in AdversarialNodeReplacement.CONSTANTS

    def compute_property_assignment_renaming(self, tree_id, tree: AstTree):
        """
        Build rename rules for property assignment names (`{x: _}`).

        Besides the assignment itself, a property access `_.x` inside the
        scope of the assignment is renamed when the type of the accessed
        object is an object literal type (`{...}`) that mentions the name.
        """
        rename_nodes = {}
        declarations = [
            node for node in tree.nodes
            if AdversarialNodeReplacement.property_assignment_left(node)
        ]
        decl_scopes = {
            decl.id: self.analyzer.get_scope(decl)
            for decl in declarations
        }
        sample = self.value_index.dataset.get_sample_by_id(tree_id)

        for declaration in declarations:
            scope: AstNode = decl_scopes[declaration.id]
            rename_nodes[declaration.id] = declaration
            for node in scope.forEachNode():
                if node.value != declaration.value:
                    continue
                if not AdversarialNodeReplacement.property_access(node):
                    continue

                target_type = sample.target[node.left().left().id]
                # startswith/endswith instead of indexing so that an empty
                # type string is skipped rather than raising IndexError.
                if not (target_type.startswith("{")
                        and target_type.endswith("}")
                        and declaration.value in target_type):
                    continue

                rename_nodes[node.id] = declaration

        return self.__process_decl(decl_scopes, rename_nodes, tree_id, tree)

    def compute_constant_replacement(self, tree_id, tree: AstTree):
        """
        Build one replacement rule per constant node.

        Constants on the left side of a property assignment and the quoted
        names "number", "string", "boolean" and "function" are skipped.
        """
        blacklist = ["number", "string", "boolean", "function"]
        blacklist = (['"' + v + '"' for v in blacklist]
                     + ["'" + v + "'" for v in blacklist])
        constants = [
            node for node in tree.nodes
            if AdversarialNodeReplacement.is_constant(node)
            and not AdversarialNodeReplacement.property_assignment_left(node)
            and node.value not in blacklist
        ]

        res = []
        for node in constants:
            pos = NodeRenameRule(
                tree_id,
                node.id,
                [node.id],
                [],
                value="{:3d} {}".format(
                    self.value_index.dataset.VALUES.vocab.stoi[node.value],
                    node.value),
                candidate_values=self.value_index.values_for_type(node.type),
                fixed_value_offset=self.fixed_value_offset,
            )
            res.append(pos)
        return res

    def compute_prop_declaration_renaming(self, tree_id, tree: AstTree):
        """
        Build rename rules for property declarations (`private x = ...`).

        Property accesses anywhere in the file are renamed together with
        the declaration when the type of the accessed object is the type of
        the declaring class/interface or "any".
        """
        rename_nodes = {}
        declarations = [
            node for node in tree.nodes
            if AdversarialNodeReplacement.property_declaration(node)
        ]
        decl_scopes = {
            decl.id: self.analyzer.get_scope(decl)
            for decl in declarations
        }
        sample = self.value_index.dataset.get_sample_by_id(tree_id)
        assert sample is not None

        def get_scope_type(node: AstNode):
            """Return the type names under which the scope can be referenced."""
            if node.type == "ClassExpression":
                raw_type = sample.target[node.id]
            else:
                # Walk the children until the class/interface keyword; the
                # last visited child is used when no keyword is found.
                node = node.down_first()
                while node.type not in ["ClassKeyword", "InterfaceKeyword"]:
                    if not node.has_right():
                        break
                    node = node.right()
                raw_type = sample.target[node.id]

            if "typeof" in raw_type:
                # converts "'typeof Foo'" to ["'typeof Foo'", "Foo"]
                return [raw_type, raw_type.split(" ")[-1]]
            return [raw_type]

        for declaration in declarations:
            scope: AstNode = decl_scopes[declaration.id]
            assert scope.type in [
                "ClassDeclaration",
                "InterfaceDeclaration",
                "ClassExpression",
            ], (str(scope) + "\n" + tree.dumpAsString())
            allowed_types = get_scope_type(scope) + ["any"]

            # replace all occurrences that match the class type in the
            # whole file
            scope = tree.root
            rename_nodes[declaration.id] = declaration
            for node in scope.forEachNode():
                if node.value != declaration.value:
                    continue
                if not AdversarialNodeReplacement.property_access(node):
                    continue
                if sample.target[node.left().left().id] not in allowed_types:
                    continue

                rename_nodes[node.id] = declaration

        return self.__process_decl(decl_scopes, rename_nodes, tree_id, tree)

    def compute_variable_renaming(self, tree_id, tree: AstTree):
        """
        Build rename rules for declarations found by the analyzer.

        Every node in the scope of a declaration that carries the same
        value is renamed with it, except property accesses, property
        declarations, property assignment names, method declaration names
        and keywords.
        """
        rename_nodes = {}
        declarations = [
            node for node in tree.nodes
            if self.analyzer.is_declaration(node)
            and not node.type.endswith("Keyword")
        ]
        decl_scopes = {
            decl.id: self.analyzer.get_scope(decl)
            for decl in declarations
        }

        for declaration in declarations:
            scope: AstNode = decl_scopes[declaration.id]
            for node in scope.forEachNode():
                if node.value != declaration.value:
                    continue
                if AdversarialNodeReplacement.property_access(node):
                    continue
                if AdversarialNodeReplacement.property_declaration(node):
                    continue
                # All property assignments are strings, even if they have
                # the same name, e.g. {foo: 5} is equivalent to {'foo': 5}
                # regardless of whether a variable named foo exists in the
                # scope.
                if AdversarialNodeReplacement.property_assignment_left(node):
                    continue
                if self.analyzer.method_decl_name(node):
                    continue
                if node.type.endswith("Keyword"):
                    continue

                # A later declaration overwrites an earlier one here, which
                # is how shadowing ends up with the last processed scope.
                rename_nodes[node.id] = declaration

        return self.__process_decl(decl_scopes, rename_nodes, tree_id, tree)

    def __process_decl(self, decl_scopes, rename_nodes, tree_id,
                       tree: AstTree):
        """
        Turn declarations and their usages into a list of NodeRenameRule.

        Args:
            decl_scopes: declaration id -> scope node of the declaration.
            rename_nodes: node id -> declaration node it is renamed with.
            tree_id: id used to look up the sample of the tree.
            tree: the tree the ids refer to.

        Each renaming should preserve the invariant that variable names in
        the same scope do not conflict with each other. Note that it is
        possible that the same variable name is declared multiple times,
        e.g.:
            var x = ...
            ...
            var x = ...
        In this case, we merge the rules into a single rule that always
        renames both variables into the same value. This is a safe option
        (compared to removing the conflicting constraints) that keeps the
        program semantics unchanged.
        """
        # declaration id -> id of the same-named declaration it merges into
        merged_declarations = {}
        # scope id -> ids of the (unmerged) declarations in that scope
        per_scope_decls = collections.defaultdict(set)

        def try_merge_declaration(idx, other_decl_ids):
            for other_decl_id in other_decl_ids:
                if tree.nodes[idx].value == tree.nodes[other_decl_id].value:
                    merged_declarations[idx] = other_decl_id
                    return True
            return False

        for decl_id, scope in decl_scopes.items():
            if not try_merge_declaration(decl_id, per_scope_decls[scope.id]):
                per_scope_decls[scope.id].add(decl_id)

        # declaration id -> other declarations of the same scope
        per_decl_conflicts = collections.defaultdict(set)
        for scope_id, decl_ids in per_scope_decls.items():
            for decl_id in decl_ids:
                per_decl_conflicts[decl_id].update(decl_ids - {decl_id})

        # Note that the declarations can shadow each other, in which case
        # rename_nodes can be reassigned to match the innermost scope. As a
        # result, the per_decl_usages are computed only at the end.
        # per_decl_usages: map from a declaration site to all usage
        # positions.
        per_decl_usages = collections.defaultdict(set)
        for node_id, declaration in rename_nodes.items():
            merged_declaration_id = merged_declarations.get(
                declaration.id, declaration.id)
            per_decl_usages[merged_declaration_id].add(node_id)

        def check_consistency():
            """Assert the invariants of the computed conflicts and usages."""
            for decl_id, conflicts in per_decl_conflicts.items():
                assert all(
                    tree.nodes[idx].value != tree.nodes[decl_id].value
                    for idx in conflicts
                ), "conflicts not satisfiable {}: {}".format(
                    decl_id,
                    ",".join("{}: {}".format(idx, tree.nodes[idx].value)
                             for idx in conflicts),
                )

            assert all(decl_id in usage_ids
                       for decl_id, usage_ids in per_decl_usages.items())

            for decl_id, usage_ids in per_decl_usages.items():
                decl = tree.nodes[decl_id]
                assert all(
                    tree.nodes[uid].value == decl.value
                    and tree.nodes[uid].type == decl.type
                    for uid in usage_ids
                ), "{} vs {}".format(
                    decl,
                    " ".join(str(tree.nodes[uid]) for uid in usage_ids))

        dataset = self.value_index.dataset
        sample = dataset.id_to_sample[tree_id]
        for decl_id, node_ids in per_decl_usages.items():
            if (dataset.TARGET.vocab.stoi[sample.target[decl_id]] ==
                    dataset.unk_token_id):
                continue
            # Currently, the cases that fail the following consistency
            # check are either due to:
            #  - type refinement
            #  - assignment of different type. This is often done by
            #    shadowing a variable from the same scope, e.g.:
            #      def foo(x: int):
            #        x = 'test'
            # The branch body is `pass`, so a failed check currently has no
            # effect; the lookups are kept as they were.
            if not all(
                    sample.target[node_id] == sample.target[decl_id]
                    or sample.target[node_id] == "never"
                    # type refinement in switch statement
                    for node_id in node_ids):
                pass

        check_consistency()

        res = []
        for decl_id, usage_ids in per_decl_usages.items():
            decl = tree.nodes[decl_id]
            candidate_values = self.value_index.values_for_type(decl.type)
            value = "{:3d} {}".format(
                self.value_index.dataset.VALUES.vocab.stoi[decl.value],
                decl.value)
            pos = NodeRenameRule(
                tree_id,
                decl_id,
                usage_ids,
                per_decl_conflicts[decl_id],
                value=value,
                candidate_values=candidate_values,
                decl_id=decl_id,
                fixed_value_offset=self.fixed_value_offset,
            )
            res.append(pos)
        return res

    def make_rules(self, dataset: Dataset, trees: Dict[int, AstTree],
                   trees_num: Dict[int, AstTree]):
        """
        Compute all rename rules for every tree and collect them in an index.

        Prints how many of the valid positions (per `sample.mask_valid`)
        are covered by a rule. If computing the rules of a tree fails, the
        tree is dumped and the exception is re-raised.

        Returns:
            The `RenameRulesIndex` holding one `RenameRulesForTree` per
            tree.
        """
        num_valid = 0
        num_pos = 0
        index = RenameRulesIndex()
        for tree_id, tree in trees.items():
            sample = dataset.id_to_sample[tree_id]
            rules = RenameRulesForTree(tree_id, trees_num[tree_id])

            try:
                rules.update(self.compute_constant_replacement(tree_id, tree))
                rules.update(self.compute_variable_renaming(tree_id, tree))
                rules.update(
                    self.compute_prop_declaration_renaming(tree_id, tree))
                rules.update(
                    self.compute_property_assignment_renaming(tree_id, tree))
                index.add(rules)
            except Exception:
                print(
                    tree.root.dumpAsString(label=lambda node: "{:<20s}".format(
                        trim(sample.target[node.id], 20))))
                # Bare raise keeps the original traceback untouched.
                raise

            num_valid += sum(sample.mask_valid)
            for valid, node in zip(sample.mask_valid, tree.nodes):
                if valid and node.id in rules.nodes_to_rule:
                    num_pos += 1

        print("{:7d}/{:7d} ({:.2f}%)".format(num_pos, num_valid,
                                             acc(num_pos, num_valid)))
        return index
def __init__(self, expr_gen: ExpressionGenerator):
    """Keep the expression generator and create a fresh graph analyzer."""
    self.expr_gen = expr_gen
    self.analyzer = TypeScriptGraphAnalyzer()
def __init__(self):
    """Set up the rule name, the null marker, bookkeeping and analyzer."""
    self.name, self.null_id = "AddFunctionArgument", "<null>"
    # Starts empty; filled as changes are applied.
    self.applied_positions = dict()
    self.analyzer = TypeScriptGraphAnalyzer()
class AddFunctionArgumentRule:
    """
    Rule that appends between one and three extra, unused parameters to a
    function definition:

        function foo(x) { ... } -> function foo(x, param0, param1) { ... }
    """

    def __init__(self):
        self.name, self.null_id = "AddFunctionArgument", "<null>"
        # (tree_id, id of the original parameter list)
        #   -> (original parameter list, replacement parameter list)
        self.applied_positions = dict()
        self.analyzer = TypeScriptGraphAnalyzer()

    def revert_all_changes(self):
        """Swap every original parameter list back in and forget the changes."""
        for original, replacement in self.applied_positions.values():
            replacement.swapNodes(original)
        self.applied_positions.clear()

    def matches(self, node: AstNode):
        """Return whether the analyzer treats `node` as a function definition."""
        return self.analyzer.is_function_def(node)

    @staticmethod
    def _parameter_list(function_def: AstNode):
        """Return the SyntaxList that follows the opening parenthesis."""
        cursor = function_def.down_first()
        assert cursor is not None
        while cursor.type != "OpenParenToken":
            cursor = cursor.right()
        assert cursor.has_right()
        parameter_list = cursor.right()
        assert parameter_list.type == "SyntaxList"
        return parameter_list

    def apply(self, tree_id, node: AstNode):
        """
        Replace the parameter list of `node` by a copy with extra parameters.

        The original and the replacement list are recorded under
        `(tree_id, params.id)` so that `revert_all_changes` can undo the
        swap. A position can only be modified once until it is reverted.
        """
        assert self.matches(node)

        params = self._parameter_list(node)
        new_params = params.deepCopy(parent=params.parent)

        extra_count = random.randint(1, 3)
        for position in range(extra_count):
            parameter = AstNode(idx=-1, type="Parameter", value=self.null_id)
            # TODO: enable adversarial modifications of the added identifier
            identifier = AstNode(idx=-1,
                                 type="Identifier",
                                 value="param{}".format(position))
            parameter.add_child(identifier)

            # A separating comma is needed unless the list is still empty.
            if new_params.children:
                new_params.add_child(
                    AstNode(idx=-1, type="CommaToken", value=","))
            new_params.add_child(parameter)

        assert params.id != -1
        key = (tree_id, params.id)
        assert key not in self.applied_positions
        # remember where the change was applied such that it can be
        # reverted later
        self.applied_positions[key] = (params, new_params)
        params.swapNodes(new_params)
class AddMethodCallArgumentsRule:
    """
    Adds additional unused method call arguments:

        console.log("hello world!") -> console.log("hello world!", expr1, expr2)

    where expr1, expr2 are randomly generated expressions.

    In JavaScript, additional arguments are ignored at runtime. Note however
    this change is not semantic preserving since the method could declare
    additional parameters in which case the generated expressions will be
    used.
    """

    # Example call expressions (node id, type, ...) the rule operates on;
    # the argument list is the SyntaxList that follows the OpenParenToken:
    #
    # =================== 146 CallExpression
    # ==================== 147 PropertyAccessExpr..
    # ===================== 148 Identifier div any div
    # ===================== 149 DotToken . .
    # ===================== 150 Identifier getText getText
    # ==================== 151 OpenParenToken ( (
    # ==================== 152 SyntaxList
    # ==================== 153 CloseParenToken ) )
    #
    # =================== 122 CallExpression
    # ==================== 123 PropertyAccessExpr..
    # ===================== 124 Identifier by any <unk>
    # ===================== 125 DotToken . .
    # ===================== 126 Identifier css css
    # ==================== 127 OpenParenToken ( (
    # ==================== 128 SyntaxList
    # ===================== 129 StringLiteral 'input' string 'input'
    # ==================== 130 CloseParenToken ) )

    def __init__(self, expr_gen: ExpressionGenerator):
        self.name = "AddMethodCallArguments"
        # (tree_id, id of the original argument list)
        #   -> (original argument list, replacement argument list)
        self.applied_positions = {}
        self.expr_gen = expr_gen
        self.null_id = "<null>"
        self.analyzer = TypeScriptGraphAnalyzer()

    def revert_all_changes(self):
        """Swap every original argument list back in and forget the changes."""
        for original_node, swapped_node in self.applied_positions.values():
            swapped_node.swapNodes(original_node)
        self.applied_positions.clear()

    def matches(self, node: AstNode):
        """
        Return whether `node` is a call expression.

        The rule previously matched function definitions (the same check as
        the function-argument rule), which would have inserted generated
        expressions into parameter lists instead of call arguments.
        NOTE(review): confirm no caller relied on the old matching.
        """
        return node.type == "CallExpression"

    def apply(self, tree_id, node: AstNode):
        """
        Replace the argument list of the call by a copy with one or two
        additional generated expressions.

        The original and the replacement list are recorded under
        `(tree_id, args.id)` so that `revert_all_changes` can undo the
        swap. A position can only be modified once until it is reverted.
        """
        assert self.matches(node)

        def find_args(call_expr: AstNode):
            """Return the SyntaxList that follows the opening parenthesis."""
            assert call_expr.down_first() is not None
            c = call_expr.down_first()
            while c.type != "OpenParenToken":
                c = c.right()
            assert c.has_right() and c.right().type == "SyntaxList"
            return c.right()

        args = find_args(node)
        new_args = args.deepCopy(parent=args.parent)
        for _ in range(random.randint(1, 2)):
            # A separating comma is needed unless the list is still empty.
            if new_args.children:
                comma_token = AstNode(idx=-1, type="CommaToken", value=",")
                new_args.add_child(comma_token)

            expr = self.expr_gen.gen_bin_expr(depth=random.randint(0, 2))
            new_args.add_child(expr)

        assert args.id != -1
        key = (tree_id, args.id)
        assert key not in self.applied_positions
        # remember where the change was applied such that it can be
        # reverted later
        self.applied_positions[key] = (args, new_args)
        args.swapNodes(new_args)
def main():
    """
    Example: read samples from a gzip-compressed file with one JSON object
    per line, print their content, rebuild each sample as an AstTree and
    print the edges computed by the TypeScriptGraphAnalyzer.
    """
    # NOTE(review): the nesting below was reconstructed; it is assumed that
    # every sample (not only the last one) is parsed and printed - confirm.
    with gzip.open(
            os.path.expanduser("../../../data/deeptyperast_1k/train.json.gz"),
            "rb") as f:
        for line in f:
            data = json.loads(line)
            # Print the content
            for key, values in data.items():
                print(key, values)
            # The bare string below is a no-op expression that documents the
            # sample format.
            """ Each json samples corresponds to a single source file. The json format is as follows (printed by running the above): # path of the source file, including the project 'id': 'SharePoint/sp-dev-fx-webparts/ICustomBusinessObjectsPnPJsState.ts' # ast values 'ast_values': ['<null>', '<null>', '<null>', 'import', '<null>', '<null>', '{', '<null>', '<null>', 'MyDocument', '}', 'from', '"../model/MyDocument"', ';', '<null>', '<null>', 'export', 'interface', 'ICustomBusinessObjectsPnPJsState', '{', '<null>', '<null>', 'myDocuments', ';', '<null>', 'errors', ';', '}'] # ast types: 'ast_types': ['SourceFile', 'SyntaxList', 'ImportDeclaration', 'ImportKeyword', 'ImportClause', 'NamedImports', 'FirstPunctuation', 'SyntaxList', 'ImportSpecifier', 'Identifier', 'CloseBraceToken', 'FromKeyword', 'StringLiteral', 'SemicolonToken', 'InterfaceDeclaration', 'SyntaxList', 'ExportKeyword', 'InterfaceKeyword', 'Identifier', 'FirstPunctuation', 'SyntaxList', 'PropertySignature', 'Identifier', 'SemicolonToken', 'PropertySignature', 'Identifier', 'SemicolonToken', 'CloseBraceToken'] # inferred type when running typescript analyzer on the full project target_full ['<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', 'typeof MyDocument', 'typeof MyDocument', '<null>', '<null>', 'string', '<null>', 'ICustomBusinessObjectsPnPJsState', '<null>', '<null>', '<null>', 'any', '<null>', '<null>', 'MyDocument[]', 'MyDocument[]', '<null>', 'string[]', 'string[]', '<null>', '<null>'] # mask containing 1 for locations with inferred types mask_valid_full [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0] # mask containing 1 for locations manually annotated by users mask_gold [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 1, 0, 0] # user annotations, note that it does not necessarily matches the inferred type gold_type ['<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', '<null>', 'MyDocument[]', '<null>', '<null>', 'string[]', '<null>', '<null>'] # position in the parent. 0 denotes the first child, 1 the second child, etc. pos [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 2, 3, 4, 1, 0, 0, 1, 2, 3, 4, 0, 0, 1, 1, 0, 1, 5] # depth in the tree. 0 denotes the root depth [0, 1, 2, 3, 3, 4, 5, 5, 6, 7, 5, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 4, 5, 5, 4, 5, 5, 3] # dependencies used by type inference to obtain ground truth files. # useful when the type inference file is run as part of the evaluation/training to make it faster dependencies ['Repos/SharePoint/sp-dev-fx-webparts/samples/react-sp-pnp-js-property-decorators/src/webparts/customBusinessObjectsPnPJs/components/ICustomBusinessObjectsPnPJsState.ts', 'SharePoint/sp-dev-fx-webparts/samples/react-sp-pnp-js-property-decorators/src/webparts/customBusinessObjectsPnPJs/model/MyDocument.ts', 'typescript/node_modules/typescript/lib/lib.es5.d.ts'] # various edge types supported by the current analyzer child_edges_src [0, 1, 1, 2, 2, 2, 2, 2, 4, 5, 5, 5, 7, 8, 14, 14, 14, 14, 14, 14, 15, 20, 20, 21, 21, 24, 24] child_edges_tgt [1, 2, 14, 3, 4, 11, 12, 13, 5, 6, 7, 10, 8, 9, 15, 17, 18, 19, 20, 27, 16, 21, 24, 22, 23, 25, 26] next_token_edges_src [3, 6, 9, 10, 11, 12, 13, 16, 17, 18, 19, 22, 23, 25, 26] next_token_edges_tgt [6, 9, 10, 11, 12, 13, 16, 17, 18, 19, 22, 23, 25, 26, 27] last_lexical_usage_edges_src [] last_lexical_usage_edges_tgt [] computed_from_edges_src [] computed_from_edges_tgt [] returns_to_edges_src [] returns_to_edges_tgt [] guard_by_edges_src [] guard_by_edges_tgt [] guard_by_negation_edges_src [] guard_by_negation_edges_tgt [] last_write_edges_src [] 
last_write_edges_tgt [] last_read_edges_src [] last_read_edges_tgt [] """
            # parse that sample as AST Tree
            tree = AstTree.fromTensor(
                data["ast_types"],
                data["ast_values"],
                data["depth"],
                {"target": data["target_full"]},
            )  # 'gold': sample.gold,
            # The analyzer is attached after construction and is what
            # compute_all_edges relies on below.
            tree.analyzer = TypeScriptGraphAnalyzer()

            # use analyzer to compute edges
            tree.number_nodes()
            per_type_edges = tree.compute_all_edges()
            # Each edge is indexed as (source, target).
            for edge_type, values in per_type_edges.items():
                print("edge_type", edge_type)
                print("\t source nodes:", [v[0] for v in values])
                print("\t target nodes:", [v[1] for v in values])