def extractFunctionData(application_code):
    """
    Extract brace-bodied function definitions and their meta data.

    Scans ``application_code`` for ``def name(args) ... {`` headers. The
    whole header up to the opening brace must sit on one line, because the
    pattern is compiled without re.S. Matches that ``opt.inComment`` reports
    as commented out are skipped. Functions written as ``def f(...) = expr``
    (no brace) are not captured.

    Returns a list with one entry per function, each entry being a list:
        [0] function name
        [1] list of (arg_name, arg_type, rdd_flag) tuples
        [2] function span, replaced by the value from getEffectiveSpan
        [3] result of hasRDDType on the text between ")" and "{"
        [4] number of arguments
        [5] positions of the arguments for which hasRDDType is truthy

    Arguments are split on "," and each argument on ":", so every non-empty
    argument must look like ``name: Type``; anything else raises ValueError.
    """
    closure_matched_iter = re.finditer(r"""
        def\s+
        ([\w_]+)    # function name
        \s*
        \(
        (.*)        # arguments of function
        \)
        (.*?)
        {           # start position of the function span
        """, application_code, re.X)

    closure_function_list = []
    for matched_obj in closure_matched_iter:
        if opt.inComment(matched_obj, application_code):
            continue

        function_name = matched_obj.group(1)
        arg_string = matched_obj.group(2)

        # "def f() {" yields a blank argument string; treat it as zero
        # arguments instead of one empty argument that cannot be unpacked.
        if arg_string.strip():
            # A real list (not map()) so len()/indexing work on Python 3.
            arg_array = [arg.strip() for arg in arg_string.split(",")]
        else:
            arg_array = []

        arg_with_type_array = []
        rdd_type_arg_index = []
        for i, arg in enumerate(arg_array):
            arg_name, arg_type = [part.strip() for part in arg.split(":")]
            rdd_flag = hasRDDType(arg_type)
            # append arg name with its properties
            arg_with_type_array.append((arg_name, arg_type, rdd_flag))
            if rdd_flag:
                # remember the positions of RDD-typed arguments
                rdd_type_arg_index.append(i)

        return_type_regex = matched_obj.group(3).strip()
        returnRDDFlag = hasRDDType(return_type_regex)

        # The match ends just past "{", so end - 1 is the brace itself.
        function_span = getSpanFromStartPosition(matched_obj.end() - 1,
                                                 application_code)
        num_args = len(arg_array)
        closure_function_list.append([function_name, arg_with_type_array,
                                      function_span, returnRDDFlag,
                                      num_args, rdd_type_arg_index])

    # Nothing matched: there are no spans to post-process.
    if not closure_function_list:
        return closure_function_list

    function_span_list = [func[2] for func in closure_function_list]
    effective_span_list = getEffectiveSpan(function_span_list)
    # reassign function_span to effective_span
    for i, effective_span in enumerate(effective_span_list):
        closure_function_list[i][2] = effective_span
    return closure_function_list
def getRDDsFromLoops(loop, rdd_actions, rdd_functions):
    """
    Find all RDD candidates referenced in a loop and return them as a set.

    Three kinds of reference are collected, each ignored when
    ``opt.inComment`` places the match inside a comment:
      1. ``<name>.<action>``   - the receiver of an RDD action
      2. ``<action>(<name>)``  - the sole argument of an RDD action
      3. ``<func>(a, b, ...)`` - arguments of user-defined functions, taken
         at the positions listed for that function

    loop          -- source text of the loop
    rdd_actions   -- regex fragment interpolated into a group, e.g. an
                     alternation of action names
    rdd_functions -- sequence of entries indexed like the lists built by
                     extractFunctionData: [0] function name, [4] number of
                     arguments, [5] positions of RDD-typed arguments

    All patterns are compiled with re.X, so unescaped whitespace inside
    ``rdd_actions`` or a function name is ignored by the regex engine.
    """
    comments_span_list = opt.findCommentSpans(loop)
    regex_flags = re.S | re.X | re.M
    rdd_set = set()

    # <name>.<action>
    non_arg_matched_iter = re.finditer(r'(\w+?)\.(%s)' % rdd_actions,
                                       loop, regex_flags)
    for matched_obj in non_arg_matched_iter:
        if not opt.inComment(matched_obj, loop, comments_span_list):
            rdd_set.add(matched_obj.group(1))

    # <action>(<name>)
    arg_matched_iter = re.finditer(r'(%s)\(\s*(\w+?)\s*\)' % rdd_actions,
                                   loop, regex_flags)
    for matched_obj in arg_matched_iter:
        if not opt.inComment(matched_obj, loop, comments_span_list):
            rdd_set.add(matched_obj.group(2))

    # this is to capture functions defined that are not default RDD functions
    for rdd_func in rdd_functions:
        func_name = rdd_func[0]
        num_args = rdd_func[4]
        arg_pos_array = rdd_func[5]

        # One capture group per argument, comma separated. Raw string so
        # the backslashes are not read as (invalid) string escapes.
        arg_regex_pattern = ",".join([r"\s*(\w+?)\s*"] * num_args)

        func_arg_matched_iter = re.finditer(
            r"{0}\s*\({1}\)".format(func_name, arg_regex_pattern),
            loop, regex_flags)
        for matched_obj in func_arg_matched_iter:
            if opt.inComment(matched_obj, loop, comments_span_list):
                continue
            # Group numbers are 1-based, argument positions are 0-based.
            for arg_pos in arg_pos_array:
                rdd_set.add(matched_obj.group(arg_pos + 1))
    return rdd_set
def getLoopPatternPosition(loop_patterns, application_code, func_spans):
    """
    Collect the spans of every loop-pattern occurrence in the code.

    Each regex in ``loop_patterns`` is searched in ``application_code``
    with re.S. A match is kept only when ``opt.inComment`` is falsy for it
    and ``opt.inFunctionDecl`` (given ``func_spans``) is truthy.

    Returns a list of ``(start, end)`` tuples, grouped by pattern in the
    order the patterns were given.
    """
    positions = []
    for pattern in loop_patterns:
        for hit in re.finditer(pattern, application_code, re.S):
            # Commented-out occurrences are ignored outright.
            if opt.inComment(hit, application_code):
                continue
            if not opt.inFunctionDecl(hit, application_code, func_spans):
                continue
            positions.append(hit.span())
    return positions
def findReassignedRDD(body, pattern_list, comments_span_list):
    """
    Find the RDD names that are assigned to inside a body of code.

    body               -- source text to scan
    pattern_list       -- regex fragment interpolated into a capture group,
                          e.g. an alternation of candidate names
    comments_span_list -- accepted for interface compatibility; not used

    Returns the set of captured names for every ``<name> = <word>``
    occurrence (whitespace required on both sides of ``=``) that
    ``opt.inComment`` does not place inside a comment.

    NOTE(review): comments_span_list is presumably the precomputed comment
    spans of ``body`` and could be forwarded to opt.inComment -- confirm
    against the callers before wiring it through.
    """
    reassigned_candidates = set()
    # No leading ".*": combined with re.S it swallowed the whole body, so
    # only the last assignment was ever reported.
    matched_iter = re.finditer(r'(%s)\s+=\s+\w+' % pattern_list, body, re.S)
    for matched_obj in matched_iter:
        if opt.inComment(matched_obj, body):
            continue
        reassigned_candidates.add(matched_obj.group(1))
    return reassigned_candidates
def initBeforeLoop(application_code, rdd, end_limit, func_spans, func_rdd_args):
    """
    Tell whether ``rdd`` is already available before the loop.

    Returns True when either
      * ``rdd`` equals one of ``func_rdd_args``, or
      * a ``val``/``var`` assignment to ``rdd`` is found, outside comments,
        in the search region that ``opt.extractSearchRegion`` builds from
        ``opt.spansWithEndLimit(func_spans, end_limit)``.
    Returns False otherwise.

    ``rdd`` is interpolated into the regex unescaped and the pattern is
    compiled with re.X.

    NOTE(review): the search region is presumably the part of the enclosing
    function that precedes ``end_limit`` -- confirm in opt.
    """
    # Check if the args of the function was one of the candidate
    if any(rdd_arg == rdd for rdd_arg in func_rdd_args):
        return True

    span_with_limit = opt.spansWithEndLimit(func_spans, end_limit)
    search_region = opt.extractSearchRegion(span_with_limit, application_code)
    # Computed once and handed to every inComment call, the same way
    # getRDDsFromLoops does it.
    comments_span_list = opt.findCommentSpans(search_region)

    matched_iter = re.finditer(r'(val|var)\s*(%s)\s*?=' % rdd,
                               search_region, re.S | re.X | re.M)
    # One live declaration is enough, so stop at the first hit.
    return any(
        not opt.inComment(matched_obj, search_region, comments_span_list)
        for matched_obj in matched_iter
    )