def _get_function_signature(function_dictionary):
    """
    Build the signature triple of a function definition.

    @param {dict} function_dictionary: record with at least the keys 'type'
        (AST node type) and 'id' (node id) of the function definition
    @return {list}: [signature, parameter, index] for 'FunctionExpression'
        and 'FunctionDeclaration' nodes; for any other type the record is
        printed and ['', '', ''] is returned

    NOTE(review): `tx` is not a parameter of this function; it is looked up
    in the enclosing/module scope -- confirm that such a name exists there.
    """
    node_type = function_dictionary['type']

    # anything that is not a function definition is reported and skipped
    if node_type not in ('FunctionExpression', 'FunctionDeclaration'):
        print(function_dictionary)
        return ['', '', '']

    node_ref = {'Id': function_dictionary['id'], 'Type': node_type}
    if node_type == 'FunctionExpression':
        # function expressions: only the 'params' children are fetched
        subtree = neo4jQueryUtilityModule.getChildsOf(
            tx, node_ref, relation_type='params')
        expression = neo4jQueryUtilityModule.get_code_expression(subtree)
    else:
        # function declarations: whole subtree, rendered in short form
        subtree = neo4jQueryUtilityModule.getChildsOf(tx, node_ref)
        expression = neo4jQueryUtilityModule.get_code_expression(
            subtree, short_form=True)

    signature = expression[0]
    parameter = __find_url_parameter(function_dictionary)
    index = __get_parameter_index_position(signature, parameter)
    return [signature, parameter, index]
def get_function_name(tx, function_node):
    """
    Look up the name under which a function definition is known.

    @param {pointer} tx: neo4j transaction pointer
    @param {dict} function_node: function node with 'Type', 'Location', 'Id'
    @return {list}: [name, location, type]; the name is taken from the first
        record returned by the query, and is 'Anonymous' when the query
        returns no record
    """
    node_type = function_node['Type']
    location = function_node['Location']
    node_id = function_node['Id']

    if node_type == 'FunctionDeclaration':
        # the name hangs off the declaration itself via the 'id' relation
        query = """ MATCH (n {Id: '%s'})-[:AST_parentOf {RelationType: 'id'}]->(name) WHERE name.Type = 'Identifier' OR name.Type = 'MemberExpression' RETURN name """ % (node_id)
    else:
        # handle all cases in one go: object expr, assignment expr, var declarator
        query = """ MATCH (n {Id: '%s'})<-[:AST_parentOf]-(parent)-[:AST_parentOf]->(name) WHERE name.Type = 'Identifier' OR name.Type = 'MemberExpression' RETURN name """ % (node_id)

    # only the first record is ever consumed
    for record in tx.run(query):
        name_node = record['name']
        if name_node['Type'] != 'Identifier':
            # member expression: render the whole expression as the name
            subtree = neo4jQueryUtilityModule.getChildsOf(tx, name_node)
            rendered = neo4jQueryUtilityModule.get_code_expression(subtree)[0]
            return [rendered, location, node_type]
        return [name_node['Code'], location, node_type]

    return ['Anonymous', location, node_type]
def get_value_of(tx, varname, context_node, calls=None):
    """
    Collect the program slices that `varname` depends on at `context_node`,
    following PDG_parentOf edges whose `Arguments` property equals `varname`.

    @param {pointer} tx: neo4j transaction pointer
    @param {string} varname: name of the variable to resolve
    @param {dict} context_node: node (with an 'Id') where `varname` is used
    @param {list} calls: internal list of already visited
        '<varname>__<node id>' keys; leave unset for a top-level call
    @return {list}: entries of the form [code_expr, literals, idents]

    Changes compared to the previous implementation:
    - `calls` used to default to a shared mutable list, so visited keys
      leaked between unrelated top-level calls and, once more than 100 had
      accumulated, every call returned an empty list.
    - the visited key used to be appended by the caller right before the
      recursive call, so the callee always found its own key in `calls` and
      returned immediately; the recursion never contributed anything. The
      key is now recorded by the callee after the guards.
    """
    if calls is None:
        calls = []

    out_values = []
    node_id = context_node['Id']
    arg = str(varname) + '__' + str(node_id)
    if arg in calls:  # same (varname, node) pair already analysed
        return out_values
    if len(calls) > 100:  # hard cap on the number of visited pairs
        return out_values
    calls.append(arg)

    if DEBUG:
        print("[+] get_value_of(%s, %s)" % (varname, node_id))

    query = """ MATCH (n_s { Id: '%s' })<-[:PDG_parentOf { Arguments: '%s' }]-(n_t) RETURN collect(distinct n_t) AS resultset """ % (node_id, varname)
    results = tx.run(query)
    for item in results:
        current_nodes = item['resultset']
        for iterator_node in current_nodes:
            tree = neo4jQueryUtilityModule.getChildsOf(tx, iterator_node)
            contextNode = tree['node']
            if contextNode['Id'] == constantsModule.PROGRAM_NODE_INDEX:
                continue

            ex = neo4jQueryUtilityModule.get_code_expression(tree)
            [code_expr, literals, idents] = ex
            out_values.append([code_expr, literals, idents])
            new_varnames = utilityModule.get_unique_list(list(idents))

            # main recursion flow: resolve every identifier of the slice
            for new_varname in new_varnames:
                if new_varname == varname or new_varname in constantsModule.JS_DEFINED_VARS:
                    continue
                v = get_value_of(tx, new_varname, contextNode, calls)
                out_values.extend(v)

    return out_values
def _get_varname_value_from_context(tx, varname, context_node, PDG_on_variable_declarations_only=False, context_scope=''):
    """
    Description:
    -------------
    function for the data flow analysis: collects the program slices that
    `varname` depends on at `context_node`, following PDG_parentOf edges and,
    for function parameters, the values passed at the call sites.

    @param tx {pointer} neo4j transaction pointer
    @param {string} varname
    @param {dict} context_node: node specifying the CFG-level statement where varname is defined
    @param {bool} PDG_on_variable_declarations_only: internal val to keep state in recursions;
        when true only PDG parents of type 'VariableDeclaration' are followed
    @param {string} context_scope: internal val to keep context scope in recursions;
        when non-empty it is prepended to every emitted slice
    @return {list}: a 2d list where each entry is of the following format
        [program_slice, literals, identifiers, location]

    NOTE(review): unlike `get_value_of`, this function keeps no visited set,
    so nothing here bounds the recursion depth.
    """

    ## ------------------------------------------------------------------------------- ##
    ## Globals and utility functions
    ## ------------------------------------------------------------------------------- ##

    # output
    out_values = []
    # stores a map: funcDef id -->> get_function_call_values_of_function_definitions(funcDef)
    knowledge_database = {}
    # context node identifier
    node_id = context_node['Id']

    def _get_all_call_values_of(varname, func_def_node):
        """Map each call-site key of `func_def_node` to the argument bound to `varname`."""
        key = func_def_node['Id']
        if key in knowledge_database:
            knowledge = knowledge_database[key]
        else:
            knowledge = get_function_call_values_of_function_definitions(
                tx, func_def_node)
            knowledge_database[key] = knowledge

        ret = {}
        for nid, values in knowledge.items():
            if varname in values:
                ret[nid] = values[varname]
        return ret

    def _scoped(text):
        """Prefix `text` with the current context scope tag, if there is one."""
        if context_scope == '':
            return text
        return '%s %s' % (context_scope, text)

    def _append_this_pointer_resolutions(this_expression_node_id):
        """Append the objects a `this` expression may point to (and their slices) to out_values."""
        pointer_resolutions = get_this_pointer_resolution(
            tx, {'Id': this_expression_node_id})

        for method in pointer_resolutions['methods']:
            owner_item = method['owner']
            owner_top = method['top']
            tree_owner = QU.getChildsOf(tx, owner_item)
            tree_owner_exp = QU.get_code_expression(tree_owner)[0]
            location_line = owner_item['Location']
            out_line = '%s this --(points-to)--> %s [this-nid: %s]' % (
                context_scope, tree_owner_exp, this_expression_node_id)
            # NOTE(review): `tree_owner_exp[0]` keeps only the first element
            # of the rendered expression; `tree_owner_exp` may have been
            # intended -- left unchanged.
            out_values.append(
                [out_line.lstrip(), [], [tree_owner_exp[0]], location_line])

            # def-use analysis over resolved `this` pointer
            # NOTE(review): `owner_item` has already been subscripted above,
            # so the ''/None checks below come too late to guard anything.
            if owner_item != '' and owner_item is not None and owner_item != constantsModule.WINDOW_GLOBAL_OBJECT and owner_item['Type'] == 'Identifier':
                recurse_values = _get_varname_value_from_context(
                    tx, tree_owner_exp, owner_top,
                    PDG_on_variable_declarations_only=True)
                out_values.extend(recurse_values)

        # handle `this` that resolves to DOM elements in events
        for element in pointer_resolutions['events']:
            if 'relation' in element:
                # fetched via analysis
                target_node_id = element['relation']['Arguments'].split('___')[1]
                if target_node_id == 'xx':
                    continue
            else:
                # fetched from DB
                target_node_id = element['owner']['Id']

            # fix: the transaction pointer was missing from this call
            tree_owner = QU.getChildsOf(tx, {'Id': target_node_id})
            # NOTE(review): no `[0]` here, unlike the 'methods' loop, so the
            # complete return value of get_code_expression is formatted below;
            # `tree_owner['Location']` also reads from the wrapper rather than
            # from `tree_owner['node']` -- confirm both against QU.
            tree_owner_exp = QU.get_code_expression(tree_owner)
            location_line = tree_owner['Location']
            out_line = '%s this --(points-to)--> %s [this-nid: %s]' % (
                context_scope, tree_owner_exp, this_expression_node_id)
            out_values.append(
                [out_line.lstrip(), [], [tree_owner_exp], location_line])

    ## ------------------------------------------------------------------------------- ##
    ## Main logic
    ## ------------------------------------------------------------------------------- ##

    if PDG_on_variable_declarations_only:
        # for VariableDeclaration PDG relations
        query = """ MATCH (n_s { Id: '%s' })<-[:PDG_parentOf { Arguments: '%s' }]-(n_t {Type: 'VariableDeclaration'}) RETURN collect(distinct n_t) AS resultset """ % (node_id, varname)
    else:
        # for all PDG relations
        query = """ MATCH (n_s { Id: '%s' })<-[:PDG_parentOf { Arguments: '%s' }]-(n_t) RETURN collect(distinct n_t) AS resultset """ % (node_id, varname)

    results = tx.run(query)
    for record in results:
        currentNodes = record['resultset']
        for iteratorNode in currentNodes:

            if iteratorNode['Type'] == 'BlockStatement':
                # the parameter 'varname' is a function argument
                func_def_node = get_function_def_of_block_stmt(tx, iteratorNode)

                # check if func def has a varname parameter
                if func_def_node['Type'] == 'FunctionExpression' or func_def_node['Type'] == 'FunctionDeclaration':
                    match_signature = check_if_function_has_param(
                        tx, varname, func_def_node)
                    if match_signature:
                        out_values.append([
                            _scoped('%s = %s' % (varname, constantsModule.LOCAL_ARGUMENT_TAG_FOR_FUNC)),
                            [], [varname], iteratorNode['Location']
                        ])

                        varname_values_within_call_expressions = _get_all_call_values_of(
                            varname, func_def_node)
                        for nid in varname_values_within_call_expressions:
                            each_argument = varname_values_within_call_expressions[nid]
                            location_line = _get_location_part(nid)
                            argument_type = each_argument['Type']

                            if argument_type == 'Literal':
                                out_values.append([
                                    _scoped('%s <--(invocation-value)-- \"%s\"' % (varname, each_argument['Value'])),
                                    [each_argument['Value']], [varname], location_line
                                ])

                            elif argument_type in ('Identifier', 'MemberExpression', 'ObjectExpression'):
                                call_expr_id = _get_node_id_part(nid)
                                # use this as an id to mark variables in this scope when doing def-use analysis
                                context_id_of_call_scope = '[scope-id=%s]' % call_expr_id
                                out_values.append([
                                    _scoped('%s <--(invocation-value)-- [def-scope-id=%s] %s' % (varname, call_expr_id, each_argument['Value'])),
                                    [], [varname, each_argument['Value']], location_line
                                ])

                                # names to continue the analysis with, inside the scope of the call
                                if argument_type == 'Identifier':
                                    names_to_resolve = [each_argument['Value']]
                                elif argument_type == 'MemberExpression':
                                    # PDG on member expressions-> do PDG on the top most parent of it!
                                    names_to_resolve = [each_argument['Value'].split('.')[0]]
                                else:
                                    names_to_resolve = each_argument['ResolveIdentifiers']
                                    if names_to_resolve is None:
                                        names_to_resolve = []

                                for name_to_resolve in names_to_resolve:
                                    top_level_of_call_expr = get_non_anonymous_call_expr_top_node(
                                        tx, {'Id': call_expr_id})
                                    recurse = _get_varname_value_from_context(
                                        tx, name_to_resolve, top_level_of_call_expr,
                                        context_scope=context_id_of_call_scope)
                                    out_values.extend(recurse)

                            else:
                                # expression statements, call expressions (window.location.replace(), etc)
                                out_values.append([
                                    _scoped('%s <--(invocation-value)-- %s' % (varname, each_argument['Value'])),
                                    [], [varname, each_argument['Value']], location_line
                                ])

                            ## ThisExpression Pointer Analysis
                            ## NOTE: this code block must be executed for ALL branches, so it is placed outside of all conditional branches
                            additional_identifiers = each_argument['ResolveIdentifiers']
                            if additional_identifiers is not None and 'ThisExpression' in additional_identifiers:
                                _append_this_pointer_resolutions(
                                    additional_identifiers['ThisExpression'])

                # block statements are fully handled above
                continue

            tree = QU.getChildsOf(tx, iteratorNode)
            contextNode = tree['node']
            if contextNode['Id'] == constantsModule.PROGRAM_NODE_INDEX:
                continue

            ex = QU.get_code_expression(tree)
            loc = iteratorNode['Location']
            [code_expr, literals, idents] = ex
            if context_scope != '':
                code_expr = context_scope + ' ' + code_expr
            out_values.append([code_expr, literals, idents, loc])

            # unique vars; insertion order is kept so the output order is
            # deterministic (the previous list(set(...)) order was arbitrary)
            new_varnames = list(dict.fromkeys(idents))

            # handle `this` expressions
            if 'ThisExpression' in new_varnames:
                _append_this_pointer_resolutions(idents['ThisExpression'])

            # main recursion flow
            for new_varname in new_varnames:
                if new_varname == varname or new_varname in constantsModule.JS_DEFINED_VARS:
                    continue

                # check if new_varname is a function call
                # i.e., it has a `callee` relation to a parent of type `CallExpression`
                new_varname_id = idents[new_varname]
                check_function_call_query = """ MATCH (n { Id: '%s' })<-[:AST_parentOf {RelationType: 'callee'}]-(fn_call {Type: 'CallExpression'})-[:CG_parentOf]->(call_definition) RETURN call_definition """ % (new_varname_id)
                call_definition_result = tx.run(check_function_call_query)

                is_func_call = False
                for definition in call_definition_result:
                    call_definition = definition['call_definition']
                    if call_definition is not None:
                        is_func_call = True
                        wrapper_node_function_definition = QU.getChildsOf(
                            tx, call_definition)
                        ce_function_definition = QU.get_code_expression(
                            wrapper_node_function_definition)
                        location_function_definition = call_definition['Location']
                        body = ce_function_definition[0]
                        body = jsbeautifier.beautify(body)
                        out_line = """%s %s\n\t\t\t %s""" % (
                            context_scope,
                            constantsModule.FUNCTION_CALL_DEFINITION_BODY, body)
                        out = [
                            out_line.strip(), [], [],
                            location_function_definition
                        ]
                        if out not in out_values:  # avoid returning/printing twice
                            out_values.append(out)

                # callees are reported through their definition body only
                if is_func_call:
                    continue

                v = _get_varname_value_from_context(
                    tx, new_varname, contextNode, context_scope=context_scope)
                out_values.extend(v)

    return out_values
def get_function_call_values_of_function_definitions(tx, function_def_node):
    """
    Description:
    -------------
    walks the call graph edges of a function definition and pairs every
    'function definition param' with the 'function-call argument' found at
    the same position of each call expression

    @param {pointer} tx: neo4j transaction pointer
    @param {node} function_def_node: a 'FunctionExpression' or 'FunctionDeclaration' node of esprima AST
    @return {dictionary} keyed by '<call node id>__Loc=<call location>'; each
        value maps a parameter to
        {'Value': ..., 'Type': ..., 'ResolveIdentifiers': ...}, where
        'ResolveIdentifiers' is None for literal and identifier arguments
    """
    query = """ MATCH (param)<-[:AST_parentOf {RelationType: 'params'}]-(functionDef { Id: '%s' })<-[:CG_parentOf]-(caller {Type: 'CallExpression'})-[:AST_parentOf {RelationType: 'arguments'}]-> (arg) RETURN collect(distinct param) as params, caller, collect(distinct arg) AS args """ % (function_def_node['Id'])

    bindings = {}
    for record in tx.run(query):
        caller_node = record['caller']
        arg_nodes = record['args']
        param_nodes = record['params']
        if len(arg_nodes) < len(param_nodes):
            # must reverse this list to match in case of call with lower number of arguments than definition
            param_nodes = param_nodes[::-1]

        location = caller_node['Location']
        caller_id = caller_node['Id']
        call_key = caller_id + '__Loc=' + location
        per_call = bindings[call_key] = {}

        # zip stops at the shorter list, which covers calls made with fewer
        # arguments than the definition declares
        for param_node, arg_node in zip(param_nodes, arg_nodes):
            [param, param_type] = get_value_of_identifer_or_literal(param_node)
            argument_type = arg_node['Type']

            if argument_type in ('Literal', 'Identifier'):
                [arg, arg_type] = get_value_of_identifer_or_literal(arg_node)
                identifiers = None
            else:
                # member/object expressions and everything else are rendered
                # from their subtree and keep their own node type
                subtree = QU.getChildsOf(tx, arg_node)
                rendered = QU.get_code_expression(subtree)
                identifiers = rendered[2]
                arg = rendered[0]
                arg_type = argument_type

            per_call[param] = {
                'Value': arg,
                'Type': arg_type,
                'ResolveIdentifiers': identifiers
            }

    return bindings
def inout_relationship(tx):
    """
    Relate the parameters of every function returned by
    `find_function_expressions` to the function's return statements and to
    its control predicates.

    @param {pointer} tx: neo4j transaction pointer
    @return {list}: [out_dep, out_control, function_names] where
        - out_dep maps a function id to the parameters that occur in a slice
          (as computed by `get_value_of`) of an identifier used in one of
          its return statements
        - out_control maps a function id to the parameters that are used as
          identifiers in one of its control predicates
        - function_names maps a function id to
          `get_function_name(...) + [params]`, only for functions that have
          at least one entry in out_dep or out_control
    """
    out_dep = {}
    out_control = {}
    function_names = {}

    def _remember_name(function_id, function_node, function_params):
        # the name is resolved lazily, the first time a function is reported
        if function_id not in function_names:
            resolved = get_function_name(tx, function_node)
            function_names[function_id] = resolved + [function_params]

    for function_entry in find_function_expressions(tx):
        func_node = function_entry[0]
        func_params = function_entry[1]
        fn_id = func_node['Id']
        returned = out_dep[fn_id] = []
        controlling = out_control[fn_id] = []

        if DEBUG:
            print("-" * 10)
            print("[+] inout_relationship -> function_id: %s" % fn_id)
            print("[+] inout_relationship -> function_params: %s" % str(func_params))

        # ---- parameters flowing into return statements ----
        for return_statement in get_return_statements(tx, fn_id):
            subtree = neo4jQueryUtilityModule.getChildsOf(tx, return_statement)
            code_expr, _literals, idents = neo4jQueryUtilityModule.get_code_expression(subtree)
            code_expr = code_expr.strip()
            if code_expr in ('', '\"true\"', '\"false\"'):
                continue
            if DEBUG:
                print("[+] inout_relationship -> return: %s" % code_expr)

            slices_by_ident = {}  # slices are computed once per identifier
            for param in func_params:
                if param in returned:
                    continue
                for ident in idents:
                    if ident in constantsModule.JS_DEFINED_VARS or ident in ('this', 'ThisExpression'):
                        continue
                    if DEBUG:
                        print("[+] inout_relationship -> tracking: %s" % ident)

                    if ident in slices_by_ident:
                        slices = slices_by_ident[ident]
                    else:
                        slices = get_value_of(tx, ident.strip(), return_statement)
                        slices_by_ident[ident] = slices

                    # the first identifier with a matching slice settles this parameter
                    if any(param in each_slice[0] for each_slice in slices):
                        _remember_name(fn_id, func_node, func_params)
                        returned.append(param)
                        break

        # ---- parameters used in control predicates ----
        for control_statement in get_control_predicates(tx, fn_id):
            subtree = neo4jQueryUtilityModule.getChildsOf(tx, control_statement)
            code_expr, _literals, idents = neo4jQueryUtilityModule.get_code_expression(subtree)
            code_expr = code_expr.strip()
            if code_expr in ('', '\"true\"', '\"false\"'):
                continue
            if DEBUG:
                print("[+] inout_relationship -> control: %s" % code_expr)

            for param in func_params:
                if param in controlling:
                    continue
                for ident in idents:
                    if ident in constantsModule.JS_DEFINED_VARS:
                        continue
                    if param.strip() == ident.strip():
                        _remember_name(fn_id, func_node, func_params)
                        controlling.append(param)
                        break

    return [out_dep, out_control, function_names]