def compare_functions(r_func_name, pd_func_name):
    """Run an R function and a pandas function on the same generated
    arguments and print their execution similarity.

    Both functions are looked up in their respective source files, executed
    on 100 generated argument sets, wrapped as Function objects, and then
    compared via test_clustering.execution_similarity.
    """
    raw_r_func = r_functions.get_r_function(r_source_file, r_func_name)
    raw_pd_func = pd_functions.get_pd_functions(
        pd_source_file, as_dict=True)[pd_func_name]
    generated_args = get_generated_args(raw_r_func, 100)
    # Execute the R side first, mirroring the original pipeline order.
    r_executed = {
        "name": r_func_name,
        "body": r_functions.get_function_body(raw_r_func),
        "inputKey": "DF-KEY",
        "outputs": r_functions.execute_R_function_on_args(
            raw_r_func, generated_args),
    }
    pd_executed = {
        "name": pd_func_name,
        "body": helper.get_func_body(raw_pd_func),
        "inputKey": "DF-KEY",
        "outputs": pd_functions.execute_pd_function_on_args(
            raw_pd_func, generated_args),
    }
    wrapped_r = Function(name=r_func_name,
                         input_key=r_executed["inputKey"],
                         outputs=Outputs(r_executed["outputs"]),
                         body=test_clustering.get_body(r_executed),
                         source=r_source_file)
    wrapped_pd = Function(name=pd_func_name,
                          input_key=pd_executed["inputKey"],
                          outputs=Outputs(pd_executed["outputs"]),
                          body=test_clustering.get_body(pd_executed),
                          source=pd_source_file)
    print(test_clustering.execution_similarity(wrapped_r, wrapped_pd))
def load_functions(dataset, is_test=False, update_clone_meta=False):
    """Load java functions for *dataset* from its data store.

    Only functions whose names match ``func_`` and that have return metadata
    are kept. Functions with object returns are expanded into one clone per
    return attribute. Unless *is_test* is set, the result is filtered down to
    functions considered useful.

    :param dataset: name of the dataset whose store should be read.
    :param is_test: when True, skip the usefulness filter and return all
        loaded functions.
    :param update_clone_meta: when True, persist the attribute -> clone-name
        mapping back to the execution store.
    :return: list of Function objects.
    """
    LOGGER.info("Loading java functions for '%s' ... " % dataset)
    data_store = get_store(dataset, is_test=is_test)
    functions_arr = data_store.load_functions()
    function_pattern = re.compile(r'^func_')
    functions = []
    for func_dict in functions_arr:
        # Only consider functions following the generated "func_" naming scheme.
        if not function_pattern.match(func_dict['name']):
            continue
        function_metadata = data_store.load_metadata(func_dict)
        # Skip functions with no metadata or no recorded return information.
        if not function_metadata or not function_metadata.get("return", None):
            continue
        return_meta_data = function_metadata["return"]
        outputs = Outputs(func_dict["outputs"])
        funct = Function(name=func_dict["name"], dataset=dataset,
                         class_name=func_dict["class"],
                         package=func_dict["package"],
                         input_key=func_dict["inputKey"], outputs=outputs,
                         lines_touched=function_metadata.get(
                             "linesTouched", None),
                         span=function_metadata.get("span", None),
                         body=function_metadata["body"], source="java")
        if data_store.is_object_return(return_meta_data):
            # Object returns are split into one clone per return attribute,
            # each clone carrying only that attribute's return values.
            cloned_function_names = get_execution_store(
                dataset).load_cloned_function_names(funct.name)
            updated_cloned_function_names = {}
            for attribute, returns in data_store.get_return_vals(
                    outputs.returns).items():
                clone = funct.clone()
                clone.outputs = outputs.clone()
                clone.outputs.returns = returns[:]
                clone.return_attribute = attribute
                # Reuse a previously persisted clone name if one exists so
                # clone identities stay stable across runs.
                if cloned_function_names and attribute in cloned_function_names:
                    clone.name = cloned_function_names[attribute]
                updated_cloned_function_names[attribute] = clone.name
                functions.append(clone)
            if update_clone_meta:
                get_execution_store(dataset).save_cloned_function_names(
                    funct.name, updated_cloned_function_names)
        else:
            functions.append(funct)
    if is_test:
        return functions
    valid_functions = [func for func in functions if func.is_useful()]
    LOGGER.info("Valid Functions : %d / %d" %
                (len(valid_functions), len(functions)))
    return valid_functions
def load_py_functions(dataset, is_test=False):
    """Load python functions for *dataset* from its data store.

    Only functions whose names match ``func_`` are kept. Unless *is_test* is
    set, the result is filtered down to functions considered useful.
    """
    LOGGER.info("Loading python functions for '%s' ... " % dataset)
    store = get_store(dataset, is_test=is_test)
    name_matcher = re.compile(r'^func_')
    loaded = []
    for record in store.load_py_functions():
        # Only consider functions following the generated "func_" scheme.
        if name_matcher.match(record['name']) is None:
            continue
        meta = store.load_py_metadata(record['name'])
        loaded.append(Function(
            name=record["name"],
            dataset=dataset,
            input_key=record["inputKey"],
            outputs=Outputs(record["outputs"]),
            lines_touched=meta.get("linesTouched", None),
            span=meta.get("span", None),
            body=meta["body"],
            source="python"))
    if is_test:
        return loaded
    useful = [f for f in loaded if f.is_useful()]
    LOGGER.info("Valid Functions : %d / %d" % (len(useful), len(loaded)))
    return useful
def similarity(r_func_name, py_func_name):
    """Print the execution similarity between an R function and a python
    (pandas) function, each loaded and executed from its source file."""
    raw_r = r_functions.get_r_functions(R_FUNCTIONS_SOURCE_FILE)[r_func_name]
    raw_pd = pd_functions.get_pd_functions(
        PD_FUNCTIONS_SOURCE_FILE, as_dict=True)[py_func_name]
    # Execute the R side first, mirroring the original pipeline order.
    r_executed = process_R_function(
        R_FUNCTIONS_SOURCE_FILE, r_func_name, raw_r)
    wrapped_r = Function(name=r_func_name,
                         input_key=r_executed["inputKey"],
                         outputs=Outputs(r_executed["outputs"]),
                         body=test_clustering.get_body(r_executed),
                         source=R_FUNCTIONS_SOURCE_FILE)
    pd_executed = process_pd_function(PD_FUNCTIONS_SOURCE_FILE, raw_pd)
    wrapped_pd = Function(name=py_func_name,
                          input_key=pd_executed["inputKey"],
                          outputs=Outputs(pd_executed["outputs"]),
                          body=test_clustering.get_body(pd_executed),
                          source=PD_FUNCTIONS_SOURCE_FILE)
    print(test_clustering.execution_similarity(wrapped_r, wrapped_pd))
def format_outputs(outputs):
    """Normalize raw execution outputs into Outputs objects.

    For each key, builds an Outputs with parallel lists of formatted return
    values, error messages, and durations, and flags whether every recorded
    run returned the same value (compared via is_equal against the first
    run's return).
    """
    normalized = {}
    for key, runs in outputs.items():
        bucket = Outputs()
        bucket.is_all_same = True
        # Single-slot holder for the first formatted return value.
        first_return = []
        for run in runs:
            current = format_return(run["return"]) if "return" in run else None
            bucket.returns.append(current)
            bucket.errors.append(run.get("errorMessage", None))
            bucket.durations.append(run.get("duration", None))
            if not first_return:
                first_return.append(current)
            elif bucket.is_all_same and not is_equal(current, first_return[0]):
                bucket.is_all_same = False
        normalized[key] = bucket
    return normalized
def _test_function():
    """Smoke-test: load one known codejam function from the data store and
    print it along with whether it is considered useful."""
    dataset = "codejam"
    data_store = get_store("codejam")
    func_dict = data_store.load_function(
        "func_b1d6e0e04b4f4065870c60fcba28ff0c")
    function_metadata = data_store.load_metadata(func_dict)
    outputs = Outputs(func_dict["outputs"])
    funct = Function(name=func_dict["name"], dataset=dataset,
                     class_name=func_dict["class"],
                     package=func_dict["package"],
                     input_key=func_dict["inputKey"], outputs=outputs,
                     lines_touched=function_metadata.get("linesTouched", None),
                     span=function_metadata.get("span", None),
                     body=function_metadata["body"], source="java")
    # Fixed: the original used the Python 2 statement form `print func_dict`,
    # which is a SyntaxError on Python 3 and inconsistent with the rest of
    # this file, which calls print() as a function.
    print(func_dict)
    print(funct.is_useful())
def load_functions(functions_path, source):
    """Load pickled functions from *functions_path*, wrap each as a Function
    tagged with *source*, and return only the useful ones keyed by name."""
    pickled = cache.load_pickle(functions_path)
    all_functions = []
    for name, record in pickled.items():
        all_functions.append(Function(
            name=name,
            input_key=record["inputKey"],
            outputs=Outputs(record["outputs"]),
            body=get_body(record),
            source=source))
    valid = {f.name: f for f in all_functions if is_useful_function(f)}
    LOGGER.info("Valid Functions : %d / %d" %
                (len(valid), len(all_functions)))
    return valid