def create_corpus(dir, name, checkstyle_dir):
    """Build a corpus folder from the Java files that raise no checkstyle error.

    ``checkstyle.check`` is run on ``dir`` with the rule file
    ``checkstyle_dir``. The rule file and every error-free ``.java`` file are
    then copied into ``./styler/{name}-corpus``: the rules as
    ``checkstyle.xml``, each Java file into its own ``data/<n>`` sub-folder.
    A ``corpus.json`` descriptor is written at the root of the corpus.

    :param dir: directory to analyse; a single trailing ``/`` is stripped.
    :param name: prefix of the corpus folder name.
    :param checkstyle_dir: path of the checkstyle rule file.
    :return: path of the corpus folder.
    """
    source_dir = dir[:-1] if dir.endswith('/') else dir
    corpus_dir = f'./styler/{name}-corpus'

    report, _ = checkstyle.check(checkstyle_file_path=checkstyle_dir, file_path=source_dir)
    clean_files = get_files_without_errors(report)
    dirty_files = get_files_with_errors(report)
    print(f'Found {len(clean_files)} files with no errors.')
    print(f'Found {len(dirty_files)} files with errors.')

    # Only Java sources are eligible for the corpus.
    java_files = (path for path in clean_files if path.endswith('.java'))

    create_dir(corpus_dir)
    shutil.copy(checkstyle_dir, os.path.join(corpus_dir, 'checkstyle.xml'))

    for number, source_path in tqdm(enumerate(java_files), desc='Copy'):
        # One numbered folder per file, so equal file names never collide.
        slot_dir = os.path.join(corpus_dir, f'data/{number}')
        destination = os.path.join(slot_dir, source_path.split('/')[-1])
        create_dir(slot_dir)
        shutil.copy(source_path, destination)

    save_json(corpus_dir, 'corpus.json', {'grammar': 'Java8', 'indent': '4'})
    return corpus_dir
def gen(self, max_time):
    """Inject one modification per batch file and record checkstyle's verdict.

    For each file of ``self.batch_files`` a modified copy and a ``diff.diff``
    are written into ``<batch_dir>/<index>/``, and the injection is recorded
    in ``self.batch_injections[index]``. Checkstyle is then run once on the
    whole batch folder; the violations of each file are attached to its
    injection record and saved as ``violations.json``.

    :param max_time: ``datetime`` deadline; once ``datetime.now()`` reaches
        it, no further file is processed (files already done are kept).
    :return: ``self.batch_information`` (also saved as ``metadata.json`` in
        the batch folder), or ``None`` when checkstyle returned no result.
    """
    create_dir(self.batch_dir)
    self.batch_injections = {}

    for index, file_dir in tqdm(enumerate(self.batch_files), total=self.batch_size):
        if datetime.now() >= max_time:
            logger.debug('Time out.')
            break

        file_name = file_dir.split('/')[-1]
        original_source = open_file(file_dir)
        try:
            modified_source, modification = modify_source(original_source, protocol=self.protocol)
            modification_folder = os.path.join(self.batch_dir, str(index))
            create_dir(modification_folder)
            # The value returned by save_file is fed to diff() as the path
            # of the modified file.
            modified_file_dir = save_file(modification_folder, file_name, modified_source)
            diff_str = diff(file_dir, modified_file_dir)
            save_file(modification_folder, 'diff.diff', diff_str)
            self.batch_injections[index] = {
                'modification': modification,
                'diff': diff_str,
                'dir': modification_folder,
                'orig': file_dir,
                'file_name': file_name
            }
        except InsertionException as err:
            # Log the raised instance (class name and message), not the
            # exception class itself, so the cause is visible in the log.
            logger.debug('%r', err)
        except Exception as err:
            # Best effort: a file that cannot be modified is skipped and the
            # batch goes on with the next one.
            logger.debug(err)

    self.checkstyle_result, _ = checkstyle.check(
        self.checkstyle_file_path,
        self.batch_dir,
        self.checkstyle_jar,
        only_targeted=True
    )
    if self.checkstyle_result is None:
        return None

    for file_dir, res in self.checkstyle_result.items():
        # The parent folder of a checked file is named after its injection index.
        index = int(file_dir.split('/')[-2])
        if index not in self.batch_injections:
            continue
        self.batch_injections[index]['violations'] = res['violations']
        save_json(self.batch_injections[index]['dir'], 'violations.json', res['violations'])

    self.batch_information = {
        'batch_id': self.batch_id,
        'injection_report': self.batch_injections
    }
    save_json(self.batch_dir, 'metadata.json', self.batch_information)
    return self.batch_information
def find_errored_files(repo, commit, use_maven=False, checkstyle_path='checkstyle.xml'):
    """Run checkstyle on a checked-out repository and save the files with errors.

    Existing ``checkstyle-result.xml`` reports found under the working
    directory are deleted first. Checkstyle is then run either through Maven
    (``mvn clean checkstyle:checkstyle``) or directly with the rule file
    ``<working_dir>/<checkstyle_path>``. Every file reported with errors is
    copied, together with an ``errors.json``, into its own numbered folder
    under the directory given by ``get_real_errors_commit_dir``.

    :param repo: repository object exposing ``working_dir``.
    :param commit: commit identifier, used to compute the target directory.
    :param use_maven: run checkstyle through Maven instead of directly.
    :param checkstyle_path: rule file path relative to the working directory
        (only used when ``use_maven`` is false).
    :return: the raw stdout of Maven when ``use_maven`` is true, otherwise the
        first value returned by ``checkstyle.check``; ``None`` when the rule
        file does not exist and checkstyle was therefore not run.
    """
    repo_dir = repo.working_dir

    # Remove stale reports so that only the ones produced by this run are read.
    for stale_report in find_all(repo_dir, 'checkstyle-result.xml'):
        os.remove(stale_report)

    # Stays None when checkstyle cannot be run (missing rule file).
    output = None
    if use_maven:
        pom = find_the_pom(repo_dir)
        # Argument list rather than a split string: a pom path containing
        # spaces must stay a single argument.
        cmd = ['mvn', '-f', str(pom), 'clean', 'checkstyle:checkstyle']
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        output = process.communicate()[0]
        checkstyle_results = [
            checkstyle.parse_res(open_file(report))
            for report in find_all(repo_dir, 'checkstyle-result.xml')
        ]
    else:
        checkstyle_dir = os.path.join(repo_dir, checkstyle_path)
        if os.path.isfile(checkstyle_dir):
            output, _ = checkstyle.check(
                checkstyle_file_path=checkstyle_dir, file_path=repo_dir)
            checkstyle_results = [output]
        else:
            checkstyle_results = []

    reports_with_errors = check_checkstyle_results(checkstyle_results)

    # "<parent folder>-<folder>" of the working directory.
    path_parts = repo_dir.split('/')
    repo_name = path_parts[-2] + "-" + path_parts[-1]
    target = get_real_errors_commit_dir(repo_name, commit)

    count = 0
    for results in reports_with_errors.values():
        for file, errors in results.items():
            print(f'{file} has {len(errors)} error(s)')
            file_name = file.split('/')[-1]
            error_dir = os.path.join(target, str(count))
            create_dir(error_dir)
            shutil.copyfile(file, os.path.join(error_dir, file_name))
            save_json(error_dir, 'errors.json', errors)
            count += 1

    print("# Files with at least one error: %s" % count)
    return output
def repair_files(dir, dir_files, model_name, only_formatting=False):
    """Generate repair proposals for errored files and keep the best valid one.

    Each error of each file under ``<dir_files>/1`` is tokenized and passed to
    the translator; every proposal is de-tokenized and written under
    ``<dir>/repair-attempt/batch_<proposal>/<id>/``. After
    ``move_parse_exception_files`` has been applied to the attempts,
    checkstyle is run on them, and for each id the repair picked by
    ``select_the_best_repair`` is copied to ``<dir>/files-repaired/<id>``.

    :param dir: working directory receiving attempts, waste and final repairs.
    :param dir_files: directory holding ``checkstyle.xml`` and the errored files.
    :param model_name: name handed to ``gen_translator``.
    :param only_formatting: forwarded to the translator and to ``de_tokenize``.
    """
    attempts_dir = os.path.join(dir, 'repair-attempt')
    repaired_dir = os.path.join(dir, 'files-repaired')
    rules_path = os.path.join(dir_files, 'checkstyle.xml')
    waste_dir = os.path.join(dir, 'waste')

    # For now only the single-error files (sub-folder "1") are handled.
    # TODO : Improve it
    inputs_dir = os.path.join(dir_files, './1')

    create_dir(attempts_dir)
    create_dir(waste_dir)

    translate = gen_translator(model_name, batch_size=5, only_formatting=only_formatting)

    file_ids = list_folders(inputs_dir)
    file_count = len(file_ids)

    for file_id in tqdm(file_ids):
        java_path = glob.glob(f'{inputs_dir}/{file_id}/*.java')[0]
        metadata = open_json(f'{inputs_dir}/{file_id}/metadata.json')
        tokenized = tokenize_errors(java_path, metadata['errors'])
        for error_id, (tokenized_errors, info) in enumerate(tokenized):
            for proposal_id, translation in enumerate(translate(tokenized_errors)):
                repaired_source = de_tokenize(
                    java_path, info, translation, only_formatting=only_formatting)
                # (file, error) pairs are folded into one id; the file is
                # recovered later with "id % file_count".
                repair_id = int(file_id) + error_id * file_count
                proposal_dir = f'{attempts_dir}/batch_{proposal_id}/{repair_id}'
                create_dir(proposal_dir)
                save_file(proposal_dir, java_path.split('/')[-1], repaired_source)

    move_parse_exception_files(attempts_dir, waste_dir)
    checkstyle_result, _ = checkstyle.check(rules_path, attempts_dir, only_targeted=True)
    batches_by_id = reverse_collection(get_batch_results(checkstyle_result))

    final_repairs = {}
    for repair_id, batches in batches_by_id.items():
        candidates = [
            glob.glob(f'{attempts_dir}/batch_{batch}/{repair_id}/*.java')[0]
            for batch in batches
        ]
        original = glob.glob(f'{inputs_dir}/{int(repair_id) % file_count}/*.java')[0]
        final_repairs[repair_id] = select_the_best_repair(candidates, original)
    json_pp(final_repairs)

    for repair_id, best_path in final_repairs.items():
        # The value returned by create_dir is used as the copy destination.
        shutil.copy(best_path, create_dir(f'{repaired_dir}/{repair_id}'))
def get_checkstyle_results(tool, dir, only_targeted=False, checkstyle_rules=None):
    """Return the checkstyle results of ``tool``, computing them only once.

    The results are stored in ``<dir>/checkstyle_results_<tool>.json``. When
    that file exists it is loaded as is; otherwise checkstyle is run on
    ``<dir>/<tool>`` and the outcome is saved under that name.

    :param tool: name of the tool, i.e. of the sub-folder of ``dir`` to check.
    :param dir: directory holding the tool folder and the stored results.
    :param only_targeted: forwarded to ``checkstyle.check``.
    :param checkstyle_rules: rule file; defaults to ``<dir>/checkstyle.xml``.
    :return: tuple ``(checkstyle_results, number_of_errors)``.
    """
    tool_dir = os.path.join(dir, tool)
    cache_name = f'checkstyle_results_{tool}.json'
    cache_path = f'{dir}/{cache_name}'

    if os.path.exists(cache_path):
        cached = open_json(cache_path)
    else:
        rules = os.path.join(dir, 'checkstyle.xml') if checkstyle_rules is None else checkstyle_rules
        results, error_count = checkstyle.check(rules, tool_dir, only_targeted=only_targeted)
        cached = {
            'checkstyle_results': results,
            'number_of_errors': error_count,
        }
        save_json(dir, cache_name, cached)

    return cached['checkstyle_results'], cached['number_of_errors']
def gen_errored(corpus, get_random_corpus_file, repo_name, goal, id, target_dir):
    """Generate one errored variant of a random corpus file with exactly one error.

    A random injection operator is applied to a random corpus file until the
    result is well formed, triggers exactly one checkstyle error and has as
    many tokens as the original. The errored file, a copy of the original
    (``<name>-orig.java``), their ``diff.diff`` and a ``metadata.json``
    describing the error are left in ``<target_dir>/<goal>/<id>``.

    The search has no overall limit: after 10 failed attempts on a file,
    another random file is drawn and the search goes on.

    :param corpus: object exposing the checkstyle rules as ``corpus.checkstyle``.
    :param get_random_corpus_file: callable taking ``goal`` and returning a
        sequence whose item 0 is the file name and item 2 its path.
    :param repo_name: unused.
    :param goal: passed to ``get_random_corpus_file``; also the sub-folder name.
    :param id: name of the output folder inside ``<target_dir>/<goal>``.
    :param target_dir: root directory of the generated files.
    """
    DEBUG = False
    max_attempts = 10
    folder = os.path.join(target_dir, f'./{goal}/{id}')

    def pick_file():
        """Draw a corpus file; return its path and its name up to the first dot."""
        picked = get_random_corpus_file(goal)
        return picked[2], picked[0].split('.')[0]

    file_dir, file_name = pick_file()
    attempts = 0
    while True:
        if attempts >= max_attempts:
            # Give up on this file so the loop cannot get stuck on a file
            # for which no injection ever qualifies.
            file_dir, file_name = pick_file()
            attempts = 0
            continue
        attempts += 1

        # Start every attempt from an empty output folder.
        if os.path.exists(folder):
            shutil.rmtree(folder)
        create_dir(folder)

        injection_operator = random.choice(
            list(injection_operator_types.keys()))
        ugly_file = os.path.join(folder, f'./{file_name}.java')
        modification = jlu.gen_ugly(
            file_dir,
            folder,
            modification_number=injection_operator_types[injection_operator])
        if DEBUG:
            print(modification)

        if not jlu.check_well_formed(ugly_file):
            if DEBUG:
                print('Not well formed')
            continue

        try:
            cs_result, number_of_errors = checkstyle.check(
                corpus.checkstyle, ugly_file)
        except Exception:
            # Only ordinary errors trigger a retry; KeyboardInterrupt and
            # SystemExit must be able to stop this unbounded loop.
            if DEBUG:
                print('Cant run checkstule')
            continue

        if number_of_errors != 1:
            if DEBUG:
                print(f'{number_of_errors} errors')
            continue

        _, tokens_original = jlu.tokenize_with_white_space(
            open_file(file_dir))
        _, tokens_errored = jlu.tokenize_with_white_space(
            open_file(ugly_file))
        if len(tokens_original) != len(tokens_errored):
            if DEBUG:
                print(
                    f'Not the same length : orig {len(tokens_original)} vs {len(tokens_errored)}'
                )
            continue

        error = list(cs_result.values())[0]['errors'][0]
        break

    original_file = os.path.join(folder, f'./{file_name}-orig.java')
    if file_dir != original_file:
        shutil.copyfile(file_dir, original_file)
    save_file(folder, 'diff.diff', run_diff(original_file, ugly_file))

    report = {}
    report['injection_operator'] = injection_operator
    report['line'] = error['line']
    if 'column' in error:
        report['column'] = error['column']
    report['message'] = error['message']
    # Last dotted segment of the source without its 5 trailing characters --
    # presumably the "Check" suffix of the checkstyle class name; TODO confirm.
    report['type'] = error['source'].split('.')[-1][:-5]
    save_json(folder, 'metadata.json', report)