Ejemplo n.º 1
0
def create_corpus(dir, name, checkstyle_dir):
    """Assemble a corpus folder out of the ``.java`` files that pass checkstyle.

    Runs ``checkstyle.check`` on ``dir`` with the rule file ``checkstyle_dir``,
    keeps the ``.java`` entries returned by ``get_files_without_errors`` and
    copies each one to ``./styler/<name>-corpus/data/<n>/``. The rule file is
    copied next to them as ``checkstyle.xml`` and a ``corpus.json`` descriptor
    is written.

    Args:
        dir: Directory to analyse; a single trailing ``/`` is dropped.
        name: Corpus name, used to build the target folder name.
        checkstyle_dir: Path of the checkstyle rule file.

    Returns:
        The path of the corpus folder.
    """
    source_dir = dir[:-1] if dir.endswith('/') else dir
    corpus_dir = f'./styler/{name}-corpus'

    # The return code of the checkstyle run is not used.
    output, _ = checkstyle.check(checkstyle_file_path=checkstyle_dir, file_path=source_dir)

    clean_files = get_files_without_errors(output)
    dirty_files = get_files_with_errors(output)

    print(f'Found {len(clean_files)} files with no errors.')
    print(f'Found {len(dirty_files)} files with errors.')

    # Only Java sources are eligible for the corpus.
    java_files = (path for path in clean_files if path.endswith('.java'))

    create_dir(corpus_dir)
    shutil.copy(checkstyle_dir, os.path.join(corpus_dir, 'checkstyle.xml'))
    for position, source_path in tqdm(enumerate(java_files), desc='Copy'):
        # Each file gets its own numbered folder under data/.
        destination_dir = os.path.join(corpus_dir, f'data/{position}')
        create_dir(destination_dir)
        shutil.copy(source_path, os.path.join(destination_dir, source_path.split('/')[-1]))

    save_json(corpus_dir, 'corpus.json', {'grammar': 'Java8', 'indent': '4'})
    return corpus_dir
Ejemplo n.º 2
0
    def gen(self, max_time):
        """Generate one batch of modified files and collect their checkstyle violations.

        For every file of ``self.batch_files`` the source is passed through
        ``modify_source``; the modified file and a ``diff.diff`` are stored in
        ``<batch_dir>/<index>/``. Checkstyle is then run once on the whole
        batch directory and its violations are attached to each injection.

        Args:
            max_time: ``datetime`` deadline; generation stops at the first
                file reached once ``datetime.now()`` has passed it.

        Returns:
            A dict with ``batch_id`` and ``injection_report`` (also written to
            ``metadata.json`` in the batch directory), or ``None`` when
            ``checkstyle.check`` yields no result.
        """
        create_dir(self.batch_dir)
        self.batch_injections = {}
        for index, file_dir in tqdm(enumerate(self.batch_files), total=self.batch_size):
            if datetime.now() >= max_time:
                logger.debug('Time out.')
                break

            file_name = file_dir.split('/')[-1]
            original_source = open_file(file_dir)
            try:
                modified_source, modification = modify_source(original_source, protocol=self.protocol)
                modification_folder = os.path.join(self.batch_dir, str(index))
                create_dir(modification_folder)
                modified_file_dir = save_file(modification_folder, file_name, modified_source)

                diff_str = diff(file_dir, modified_file_dir)
                save_file(modification_folder, 'diff.diff', diff_str)
                # Registered last: a failure above leaves no entry for this
                # index, and such indexes are skipped after the checkstyle run.
                self.batch_injections[index] = {
                    'modification': modification,
                    'diff': diff_str,
                    'dir': modification_folder,
                    'orig': file_dir,
                    'file_name': file_name
                }
            except InsertionException as err:
                # Log the raised instance (with its message), not the class.
                logger.debug(err)
            except Exception as err:
                # Best effort: a file that cannot be processed is skipped.
                logger.debug(err)

        self.checkstyle_result, _ = checkstyle.check(
            self.checkstyle_file_path,
            self.batch_dir,
            self.checkstyle_jar,
            only_targeted=True
        )
        if self.checkstyle_result is None:
            return None

        for file_dir, res in self.checkstyle_result.items():
            # The parent folder name of a reported file is its injection index.
            index = int(file_dir.split('/')[-2])
            if index not in self.batch_injections:
                continue
            self.batch_injections[index]['violations'] = res['violations']
            save_json(self.batch_injections[index]['dir'], 'violations.json', res['violations'])

        self.batch_information = {
            'batch_id': self.batch_id,
            'injection_report': self.batch_injections
        }
        save_json(self.batch_dir, 'metadata.json', self.batch_information)
        return self.batch_information
Ejemplo n.º 3
0
def find_errored_files(repo,
                       commit,
                       use_maven=False,
                       checkstyle_path='checkstyle.xml'):
    """Run checkstyle on a repository and store every file that has errors.

    Each reported file is copied, together with an ``errors.json``, into a
    numbered folder under ``get_real_errors_commit_dir(repo_name, commit)``.

    Args:
        repo: Repository object exposing ``working_dir``.
        commit: Commit identifier, used to build the target directory.
        use_maven: When true, run ``mvn checkstyle:checkstyle`` and parse the
            generated ``checkstyle-result.xml`` files; otherwise call
            ``checkstyle.check`` with the rule file of the repository.
        checkstyle_path: Rule file path relative to the working directory
            (only used when ``use_maven`` is false).

    Returns:
        The standard output of Maven (bytes) in Maven mode, the first value
        returned by ``checkstyle.check`` otherwise, or ``None`` when the rule
        file does not exist.
    """
    repo_dir = repo.working_dir

    # Remove reports left by a previous run so they are not parsed again.
    for stale_report in find_all(repo_dir, 'checkstyle-result.xml'):
        os.remove(stale_report)

    # Defined up front: no tool runs when the rule file is missing.
    output = None
    if use_maven:
        pom = find_the_pom(repo_dir)
        # Argument list instead of a split string: a pom path containing
        # spaces must stay a single argument.
        cmd = ['mvn', '-f', str(pom), 'clean', 'checkstyle:checkstyle']
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        output = process.communicate()[0]
        checkstyle_results = [
            checkstyle.parse_res(open_file(file))
            for file in find_all(repo_dir, 'checkstyle-result.xml')
        ]
    else:
        checkstyle_dir = os.path.join(repo_dir, checkstyle_path)
        if not os.path.isfile(checkstyle_dir):
            checkstyle_results = []
        else:
            output, _ = checkstyle.check(
                checkstyle_file_path=checkstyle_dir, file_path=repo_dir)
            checkstyle_results = [output]

    reports_with_errors = check_checkstyle_results(checkstyle_results)

    # Name built from the last two components of the working directory.
    path_parts = repo_dir.split('/')
    repo_name = path_parts[-2] + "-" + path_parts[-1]
    count = 0
    target = get_real_errors_commit_dir(repo_name, commit)
    for report_dir, results in reports_with_errors.items():
        for file, errors in results.items():
            print(f'{file} has {len(errors)} error(s)')
            file_name = file.split('/')[-1]
            error_dir = os.path.join(target, str(count))
            create_dir(error_dir)
            shutil.copyfile(file, os.path.join(error_dir, file_name))
            save_json(error_dir, 'errors.json', errors)
            count += 1

    print("# Files with at least one error: %s" % count)
    return output
Ejemplo n.º 4
0
def repair_files(dir, dir_files, model_name, only_formatting=False):
    """Generate repair proposals for errored files and keep the best valid one.

    For every error of every file found under ``<dir_files>/1`` the translator
    built by ``gen_translator`` produces proposals, which are written to
    ``<dir>/repair-attempt/batch_<proposal>/<id>/``. Checkstyle is then run on
    all proposals, and for each id the proposal picked by
    ``select_the_best_repair`` is copied to ``<dir>/files-repaired/<id>``.

    Args:
        dir: Working directory that receives the attempts and the results.
        dir_files: Directory holding ``checkstyle.xml`` and the errored files.
        model_name: Name handed to ``gen_translator``.
        only_formatting: Forwarded to ``gen_translator`` and ``de_tokenize``.
    """
    # Working locations.
    attempts_root = os.path.join(dir, 'repair-attempt')
    repaired_root = os.path.join(dir, 'files-repaired')
    rules_path = os.path.join(dir_files, 'checkstyle.xml')
    rejected_root = os.path.join(dir, 'waste')

    # For now only the single-error files (sub-folder "1") are handled.
    # TODO : Improve it
    dir_files = os.path.join(dir_files, './1')

    create_dir(attempts_root)
    create_dir(rejected_root)

    translate = gen_translator(model_name, batch_size=5, only_formatting=only_formatting)

    folder_ids = list_folders(dir_files)
    total_files = len(folder_ids)
    for folder_id in tqdm(folder_ids):
        java_path = glob.glob(f'{dir_files}/{folder_id}/*.java')[0]
        metadata_path = f'{dir_files}/{folder_id}/metadata.json'
        tokenized = tokenize_errors(java_path, open_json(metadata_path)['errors'])
        for error_id, (tokenized_errors, info) in enumerate(tokenized):
            for proposal_id, translation in enumerate(translate(tokenized_errors)):
                repaired_source = de_tokenize(java_path, info, translation, only_formatting=only_formatting)
                # Attempt id = folder id shifted by a multiple of the file
                # count, so the folder id is recoverable with a modulo.
                attempt_dir = f'{attempts_root}/batch_{proposal_id}/{int(folder_id) + error_id * total_files}'
                create_dir(attempt_dir)
                save_file(attempt_dir, java_path.split('/')[-1], repaired_source)

    move_parse_exception_files(attempts_root, rejected_root)
    checkstyle_result, _ = checkstyle.check(rules_path, attempts_root, only_targeted=True)
    repaired_by_id = reverse_collection(get_batch_results(checkstyle_result))

    final_repairs = {}
    for attempt_id, batches in repaired_by_id.items():
        candidates = [
            glob.glob(f'{attempts_root}/batch_{batch}/{attempt_id}/*.java')[0]
            for batch in batches
        ]
        # The modulo maps the attempt id back to the folder of the errored file.
        errored_file = glob.glob(f'{dir_files}/{int(attempt_id) % total_files}/*.java')[0]
        final_repairs[attempt_id] = select_the_best_repair(candidates, errored_file)
    json_pp(final_repairs)

    for attempt_id, best_path in final_repairs.items():
        shutil.copy(best_path, create_dir(f'{repaired_root}/{attempt_id}'))
Ejemplo n.º 5
0
def get_checkstyle_results(tool,
                           dir,
                           only_targeted=False,
                           checkstyle_rules=None):
    """
    Return the checkstyle results of ``<dir>/<tool>`` and the number of errors.

    The results are read from ``<dir>/checkstyle_results_<tool>.json`` when
    that file exists; otherwise checkstyle is run (with ``checkstyle_rules``,
    or ``<dir>/checkstyle.xml`` when no rules are given) and the outcome is
    saved to that file before being returned.
    """
    tool_dir = os.path.join(dir, tool)
    cache_name = f'checkstyle_results_{tool}.json'
    cache_path = f'{dir}/{cache_name}'

    # Reuse a previously saved run when there is one.
    if os.path.exists(cache_path):
        cached = open_json(cache_path)
        return cached['checkstyle_results'], cached['number_of_errors']

    if checkstyle_rules is None:
        checkstyle_rules = os.path.join(dir, 'checkstyle.xml')
    results, error_count = checkstyle.check(
        checkstyle_rules, tool_dir, only_targeted=only_targeted)
    fresh = {
        'checkstyle_results': results,
        'number_of_errors': error_count,
    }
    save_json(dir, cache_name, fresh)
    return fresh['checkstyle_results'], fresh['number_of_errors']
Ejemplo n.º 6
0
def gen_errored(corpus, get_random_corpus_file, repo_name, goal, id,
                target_dir):
    """Create one errored version of a corpus file that has exactly one checkstyle error.

    A random injection operator is applied to a random corpus file until the
    result is accepted: it must be well formed (``jlu.check_well_formed``),
    produce exactly one checkstyle error and keep the same number of tokens as
    the original. After ``max_attempts`` failures on the same file another
    corpus file is drawn. The loop only ends on success.

    The folder ``<target_dir>/<goal>/<id>`` finally contains the errored file,
    a ``-orig.java`` copy of the original, a ``diff.diff`` and a
    ``metadata.json`` describing the error.

    Args:
        corpus: Object exposing the checkstyle rules as ``corpus.checkstyle``.
        get_random_corpus_file: Callable taking ``goal`` and returning a
            sequence whose item 0 is the file name and item 2 its path.
        repo_name: Unused; kept for interface compatibility.
        goal: Passed to ``get_random_corpus_file``; also a folder name.
        id: Identifier of the generated sample, used as folder name.
        target_dir: Root directory of the generated samples.
    """
    DEBUG = False
    max_attempts = 10
    folder = os.path.join(target_dir, f'./{goal}/{id}')

    def debug(message):
        if DEBUG:
            print(message)

    def pick_file():
        # Returns (path of the file, file name up to its first dot).
        candidate = get_random_corpus_file(goal)
        return candidate[2], candidate[0].split('.')[0]

    file_dir, file_name = pick_file()
    attempts = 0
    while True:
        if attempts >= max_attempts:
            # This file keeps failing: draw another one so the loop cannot
            # get stuck on it.
            file_dir, file_name = pick_file()
            attempts = 0
            continue
        attempts += 1

        # Every attempt starts from an empty folder.
        if os.path.exists(folder):
            shutil.rmtree(folder)
        create_dir(folder)

        injection_operator = random.choice(
            list(injection_operator_types.keys()))
        # NOTE(review): gen_ugly presumably writes the modified file here -
        # confirm against jlu.
        ugly_file = os.path.join(folder, f'./{file_name}.java')
        modification = jlu.gen_ugly(
            file_dir,
            folder,
            modification_number=injection_operator_types[injection_operator])
        debug(modification)

        if not jlu.check_well_formed(ugly_file):
            debug('Not well formed')
            continue

        try:
            cs_result, number_of_errors = checkstyle.check(
                corpus.checkstyle, ugly_file)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must be
            # able to stop this otherwise unbounded retry loop.
            debug('Cant run checkstule')
            continue

        if number_of_errors != 1:
            debug(f'{number_of_errors} errors')
            continue

        _, tokens_original = jlu.tokenize_with_white_space(
            open_file(file_dir))
        _, tokens_errored = jlu.tokenize_with_white_space(
            open_file(ugly_file))
        if len(tokens_original) != len(tokens_errored):
            debug(
                f'Not the same length : orig {len(tokens_original)} vs {len(tokens_errored)}'
            )
            continue

        error = list(cs_result.values())[0]['errors'][0]
        break

    original_file = os.path.join(folder, f'./{file_name}-orig.java')
    if file_dir != original_file:
        shutil.copyfile(file_dir, original_file)
    save_file(folder, 'diff.diff', run_diff(original_file, ugly_file))

    report = {
        'injection_operator': injection_operator,
        'line': error['line'],
    }
    if 'column' in error:
        report['column'] = error['column']
    report['message'] = error['message']
    # Last dotted component of the source without its final five characters
    # (presumably the "Check" suffix of checkstyle class names).
    report['type'] = error['source'].split('.')[-1][:-5]

    save_json(folder, 'metadata.json', report)