def missing_class_gen(root_class, root_test, java_src, log, pit=None, name='tmp', pit2=None):
    """Build a dict describing which compiled classes lack generated tests.

    :param root_class: root directory scanned for compiled '.class' files
    :param root_test: root directory scanned for generated 'ESTest.java' files
    :param java_src: root directory scanned for '.java' sources
    :param log: log handle/path forwarded to make_df
    :param pit: optional PIT input; when given, miss_PIT enriches the dict
    :param name: artifact name forwarded to make_df
    :param pit2: optional second PIT input; when given, miss_target_pit enriches the dict
    :return: the diff dict produced by dict_diff, after enrichment
    """
    # Full paths for the .class files, .java sources and generated tests.
    scanner_class = pt.walk(root_class, ".class")
    scanner_java = pt.walk(java_src, ".java")
    scanner_tests = pt.walk(root_test, "ESTest.java")
    print("classes size ={}".format(len(scanner_class)))
    print("tests size ={}".format(len(scanner_tests)))
    # Convert full paths to package notation; the negative offsets strip the
    # '.class' (6 chars) / 'ESTest.java' (12 chars) suffixes respectively.
    scanner_class_pak = [pt.path_to_package('org', x, -6) for x in scanner_class]
    scanner_tests_pak = [pt.path_to_package('org', y, -12) for y in scanner_tests]
    d = dict_diff(list_one=scanner_class_pak, list_two=scanner_tests_pak, path_root_test=root_test)
    look_at_test(scanner_java, scanner_tests, d)
    if pit is not None:
        miss_PIT(pit, d)
    if pit2 is not None:
        miss_target_pit(pit2, d)
    # make_df is kept for its side effects; the previously unused 'dff'
    # binding of its return value was removed.
    make_df(d, log, name)
    return d
def add_loc_helper(row, repo, out, prefix_name='org'):
    """Write a per-bug CSV with the LOC of every .java class in the faulty package.

    :param row: dataframe row with 'commit', 'issue' and 'component_path' fields
    :param repo: path to the git repository to check out
    :param out: directory where '<bug_id>_LOC.csv' is written
    :param prefix_name: package-root folder name used to derive class names
    """
    # NOTE(review): the original assigned row['parent'] and immediately
    # overwrote it with row['commit']; the dead first assignment was removed.
    # Confirm that 'commit' (and not the buggy parent commit) is intended.
    commit_buggy = row['commit']
    bug_id = row['issue']
    path_to_faulty = row['component_path']
    print(path_to_faulty)
    checkout_version(commit_buggy, repo, None)
    # component_path uses Windows separators; drop the file name, keep the dir.
    pack = '/'.join(str(path_to_faulty).split('\\')[:-1])
    klasses = pt.walk_rec('{}/{}'.format(repo, pack), [], '.java')
    records = []
    for class_i in klasses:
        class_name = pt.path_to_package(prefix_name, class_i, -5)  # strip '.java'
        size = get_LOC(class_i)
        records.append({'name': class_name, 'LOC': size, 'bug_name': bug_id})
    df = pd.DataFrame(records)
    df.to_csv('{}/{}_LOC.csv'.format(out, bug_id))
def get_miss_classes_applyer(row, out_dir, repo_path):
    """Check out the buggy commit of one bug row and list its src .java classes.

    Writes '<issue>_<index_bug>.csv' into out_dir with one record per class.

    :param row: dataframe row with 'parent', 'tag_parent', 'issue', 'index_bug'
    :param out_dir: output directory for the CSV
    :param repo_path: path to the git working copy
    """
    commit_bug = row['parent']
    bug_tag = row['tag_parent']
    issue_id = row['issue']
    index_bug = row['index_bug']
    # Check out the buggy version. (The unused 'commit_fix' local was removed.)
    git_cmd = 'git checkout {}'.format(commit_bug)
    print(ge.run_GIT_command_and_log(repo_path, git_cmd, None, None, False))
    # Collect every .java under src/ as a package-qualified class name.
    records = []
    for item_java in pt.walk_rec('{}/src'.format(repo_path), [], '.java'):
        class_name = pt.path_to_package('org', item_java, -5)  # strip '.java'
        records.append({
            'class_path': item_java,
            'name': class_name,
            'tag_bug': bug_tag,
            'commit_bug': commit_bug
        })
    df = pd.DataFrame(records)
    df.to_csv('{}/{}_{}.csv'.format(out_dir, issue_id, index_bug))
def path_to_package_name(p_name, path_input):
    """Convert a source-file path to a dotted package name, or None.

    :param p_name: project prefix; 'opennlp' selects that package root,
        anything else defaults to 'org'
    :param path_input: path to a source file (Windows or POSIX separators)
    :return: result of pt.path_to_package, or None for non-'.java' paths
        or when the conversion raises
    """
    item = str(path_input).replace('\\', '/')
    start_package = 'opennlp' if p_name == 'opennlp' else 'org'
    # Only Java sources carry a package name.
    if not item.endswith('.java'):
        return None
    try:
        return pt.path_to_package(start_package, item, -len('.java'))
    except Exception:
        # pt.path_to_package can fail (e.g. the package root marker is
        # absent from the path); treat that as "no package".
        return None
def look_at_test(classes, tests, d):
    """Annotate diff dict d with LOC info for each class and its generated test.

    For every package key in d: sets 'loc_class' / 'loc_TEST', derives the
    'no_test' / 'Empty_test_case' flags from the test's line count (a line
    count of exactly 12 presumably marks an empty generated scaffold —
    TODO confirm), and attaches 'FP' from get_FP_probability when available.

    :param classes: full paths of .java source files
    :param tests: full paths of generated ESTest.java files
    :param d: dict keyed by package name; mutated in place
    """
    d_FP = get_FP_probability()
    for entry in d.keys():
        d[entry]['loc_TEST'] = 0
        d[entry]['loc_class'] = 0
    for item in classes:
        ky = pt.path_to_package('org', item, -5)  # strip '.java'
        if str(ky).__contains__('package-info'):
            continue
        if ky not in d:
            print("the .java in not in the dict .class --> {}".format(ky))
            continue
        # Fix: attach FP only after confirming ky is in d; the original
        # assigned d[ky]['FP'] before the membership check and could KeyError.
        if ky in d_FP:
            d[ky]['FP'] = d_FP[ky]
        loc_class = get_LOC(p=item)
        d[ky]['loc_class'] = int(loc_class[0])
    for cut in tests:
        ky = pt.path_to_package('org', cut, -12)  # strip 'ESTest.java'
        if ky not in d:
            print("the .java in not in the dict .class --> {}".format(ky))
            continue
        num_line = int(get_LOC(p=cut)[0])
        d[ky]['loc_TEST'] = num_line
        if num_line == 12:
            # Test file exists but is only the empty scaffold.
            d[ky]['Empty_test_case'] = 1
            d[ky]['no_test'] = 0
        elif num_line > 12:
            d[ky]['no_test'] = 0
            d[ky]['Empty_test_case'] = 0
        else:
            d[ky]['no_test'] = 1
            d[ky]['Empty_test_case'] = 1
def get_results(dir_res='/home/ise/test/pom_3'):
    """Aggregate per-bug test/log presence info under dir_res into one CSV.

    For every 'TIKA*' bug folder: marks each test that left a log under
    'log_evo' and each generated ESTest found under 'org', then writes
    'result_info_empty.csv' into dir_res's parent directory.

    :param dir_res: root directory containing the per-bug result folders
    :raises Exception: when the same test log name appears twice for a bug
    """
    bug_dirs = pt.walk_rec(dir_res, [], 'TIKA', False, lv=-1)
    all_records = []  # the unused sibling list 'd_l' was removed
    for item in bug_dirs:
        folder_name = str(item).split('/')[-1]
        print("----{}----".format(folder_name))
        d_test = {}
        id_bug = folder_name.split('_')[1]
        bug_name = folder_name.split('_')[0]
        folder_log_evo = pt.walk_rec(item, [], 'log_evo', False)
        folder_org = pt.walk_rec(item, [], 'org', False)
        # A .txt log marks a test that produced execution output.
        for log_t in pt.walk_rec(folder_log_evo[0], [], '.txt'):
            log_name = str(log_t).split('/')[-1][:-4]  # strip '.txt'
            if log_name in d_test:
                msg = '[Error] duplication in the test log dir := {}'.format(
                    folder_log_evo)
                raise Exception(msg)
            d_test[log_name] = {
                'id': id_bug,
                'bug_name': bug_name,
                'log': 1,
                'name': log_name,
                'test': 0
            }
        if len(folder_org) > 0:
            for test_i in pt.walk_rec(folder_org[0], [], 'ESTest.java'):
                # Strip '.java' via path_to_package, then the trailing 7 chars
                # (presumably the '_ESTest' suffix — TODO confirm).
                pkg = pt.path_to_package('org', test_i, -5)[:-7]
                if pkg not in d_test:
                    d_test[pkg] = {
                        'id': id_bug,
                        'bug_name': bug_name,
                        'log': 0,
                        'name': pkg,
                        'test': 1
                    }
                else:
                    d_test[pkg]['test'] = 1
        all_records.extend(d_test.values())
    df = pd.DataFrame(all_records)
    father = '/'.join(str(dir_res).split('/')[:-1])
    df.to_csv("{}/result_info_empty.csv".format(father))
def csv_commit_db(csv_db, repo, out_dir_path, is_max=True, is_test=True, only_java=True):
    """Build the per-issue bug dataframe from a commit/component CSV.

    Reads csv_db, filters to java non-test components, keeps the max-LOC-change
    commit per issue, enriches each row with git metadata, and writes
    'tmp_1.csv', 'tmp.csv', the train/test split (via cut_df) and
    '<repo>_bug.csv' into out_dir_path.

    :param csv_db: input CSV with columns component_path/commit/issue/LOC_change
    :param repo: path to the git repository (its basename names the output)
    :param out_dir_path: directory for all written artifacts
    :param is_max: keep only the max-LOC_change commit per issue
    :param is_test: drop test components
    :param only_java: drop non-java components
    """
    df = pd.read_csv(csv_db, names=['component_path', 'commit', 'issue', 'LOC_change'])
    # Flag java vs non-java and test vs src components (paths use '\\').
    df['is_java_file'] = df['component_path'].apply(
        lambda x: str(x).split('.')[-1])
    df['is_java'] = np.where(df['is_java_file'] == 'java', 1, 0)
    # NOTE(review): 'src/tset' looks like a typo for 'src/test'; kept as-is
    # because the column name is written out to tmp_1.csv.
    df['src/tset'] = df['component_path'].apply(
        lambda x: 1 if str(x).__contains__(r'src\test') else 0)
    df['first_name'] = df['component_path'].apply(
        lambda x: str(x).split('\\')[1])
    df['comp_name'] = df['component_path'].apply(
        lambda x: str(x).split('\\')[-1].split('.')[0])
    df['suffix_test'] = df['comp_name'].apply(
        lambda x: 1 if str(x).endswith('Test') else 0)
    df['is_test'] = np.where(df['first_name'] == 'test', 1, 0)
    df.to_csv('{}/tmp_1.csv'.format(out_dir_path))
    print("df_size = {}".format(len(df)))
    if only_java:
        df = df.loc[df['is_java'] > 0]
        print("After cleaning the non java component df_size = {}".format(len(df)))
    if is_test:
        df = df[df['is_test'] == 0]
        df = df[df['src/tset'] == 0]
        print("After cleaning the test component df_size = {}".format(len(df)))
    if is_max:
        # Per issue, keep the commit with the largest LOC change.
        df = df.groupby('issue').apply(
            lambda x: x.loc[x['LOC_change'].idxmax(),
                            ['component_path', 'commit', 'LOC_change', 'is_test']]
        ).reset_index()
    df.to_csv('{}/tmp.csv'.format(out_dir_path))
    # Git metadata for each chosen commit and its parent (the pre-fix version).
    df['parent'] = df['commit'].apply(
        lambda x: get_the_previous_commit(x, repo))
    df['tag_commit'] = df['commit'].apply(
        lambda x: ge.get_Tag_name_by_commit(x, repo))
    df['date_commit'] = df['commit'].apply(
        lambda x: get_the_Date_commit(x, repo))
    df['tag_parent'] = df['parent'].apply(
        lambda x: ge.get_Tag_name_by_commit(x, repo))
    df['date_commit'] = pd.to_datetime(df['date_commit'])
    df["module"] = np.nan
    # Windows path -> dotted package name for the failing component.
    df['fail_component'] = df['component_path'].apply(
        lambda x: '.'.join(str(pt.path_to_package('org', x, -5)).split('\\')))
    df['package'] = df['fail_component'].apply(
        lambda x: '.'.join(str(x).split('.')[:-1]))
    # Sort chronologically, then assign a stable per-bug index.
    df.sort_values("date_commit", inplace=True)
    # Fix: this statement was split across two lines in SOURCE
    # ('df = ' dangling at line end) — re-joined here.
    df = df.reset_index(drop=True)
    df['index_bug'] = df.index
    repo_name = str(repo).split('/')[-1]
    # Split to train and test.
    cut_df(df, out_dir_path)
    # Write the whole df to disk.
    df.to_csv('{}/{}_bug.csv'.format(out_dir_path, repo_name))