Example #1
def tokenize_errored_file(file, file_orig, error):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file))
    token_started = False
    from_token = -1
    to_token = -1
    count = 0
    tokens_errored = []
    n_lines = 5  # lines of context kept on each side of the error line
    for token, space in zip(tokens, spaces):
        if not token_started and int(error['line']) == token.position[0]:
            token_started = True
            tokens_errored.append(f'<{error["type"]}>')
            from_token = count
        if token_started and int(error['line']) < token.position[0]:
            token_started = False
            tokens_errored.append(f'</{error["type"]}>')
            to_token = count
        if (int(error['line']) - n_lines <= token.position[0]
                <= int(error['line']) + n_lines):
            tokens_errored.append(get_token_value(token))
            tokens_errored.append(get_space_value(space))
        count += 1
    if from_token == -1:
        tokens_errored.append(f'<{error["type"]}>')
        tokens_errored.append(f'</{error["type"]}>')

    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_orig))
    tokens_correct = []
    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_correct.append(get_token_value(token))
        tokens_correct.append(get_space_value(space))
    return tokens_errored, tokens_correct
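A minimal usage sketch for the function above; the file paths and the error dict are hypothetical, and only the fields the code actually reads ('line' and 'type') are filled in. It assumes the jlu module and the get_token_value/get_space_value helpers used throughout these examples are importable.

error = {'line': '42', 'type': 'WhitespaceAround'}  # hypothetical checkstyle violation
tokens_errored, tokens_correct = tokenize_errored_file(
    './Foo.java',       # version containing the formatting error
    './Foo-orig.java',  # well-formatted original
    error)
# tokens_errored is the tagged token/whitespace stream around the error line,
# tokens_correct the matching token/whitespace slice of the original file.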
Example #2
def check_token_length(repo):
    synthetic_errors = load_repo(repo)
    not_good = []
    for error in tqdm(synthetic_errors, desc=dataset):  # 'dataset' is assumed to be defined at module level
        spaces_original, tokens_original = jlu.tokenize_with_white_space(
            error.get_original())
        spaces_errored, tokens_errored = jlu.tokenize_with_white_space(
            error.get_errored())
        if len(tokens_original) != len(tokens_errored):
            not_good.append({
                'type': error.type,
                'id': error.id,
                'error': error.get_metadata()['type']
            })
    return not_good
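A hedged sketch of running the check above; the repository name is made up.

mismatches = check_token_length('commons-lang')  # hypothetical repository name
for bad in mismatches:
    print(bad['error'], bad['type'], bad['id'])  # errors whose token count changed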
Example #3
def de_tokenize(errored_source,
                error_info,
                new_tokens,
                tabulations,
                only_formatting=False):
    whitespace, tokens = jlu.tokenize_with_white_space(errored_source)
    from_token = error_info['from_token']
    to_token = error_info['to_token']

    if only_formatting:
        new_white_space_tokens = new_tokens
    else:
        # the model output interleaves token values and whitespace predictions;
        # keep only the whitespace tokens (odd indices)
        new_white_space_tokens = new_tokens[1::2]
    new_white_space = [
        token_utils.whitespace_token_to_tuple(token)
        for token in new_white_space_tokens
    ]

    # overwrite the predicted whitespace inside the [from_token, to_token) window,
    # limited by the number of predictions available
    for index in range(min(to_token - from_token, len(new_white_space))):
        whitespace[from_token + index] = new_white_space[index]

    result = jlu.reformat(whitespace, tokens, tabulations=tabulations)

    if 'error' in error_info:
        line = int(error_info['error']['line'])
        return jlu.mix_sources(errored_source,
                               result,
                               line - 1,
                               to_line=line + 1)
    else:
        return result
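A sketch of the call shape for de_tokenize; every value is hypothetical, and the literal form of the predicted whitespace tokens is whatever token_utils.whitespace_token_to_tuple expects.

error_info = {'from_token': 10, 'to_token': 25}   # window computed by tokenize_file_to_repair
fixed_source = de_tokenize(errored_source,        # text of the errored .java file
                           error_info,
                           predicted_tokens,      # model output for that window
                           tabulations=False,
                           only_formatting=True)  # predicted_tokens holds whitespace tokens only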
Example #4
def tokenize_errored_file_model2(file, file_orig, error):

    tokens_errored, info = tokenize_file_to_repair(file, error)

    tokens_errored_in_tag = info['tokens_errored_in_tag']
    from_token = info['from_token']
    to_token = info['to_token']

    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_orig))
    tokens_correct = []

    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_correct.append(get_token_value(token))
        tokens_correct.append(get_space_value(space))

    if len(tokens_errored_in_tag) != len(tokens_correct):
        print('Warning: tagged errored slice and correct slice have different lengths')
    info['count_diff'] = 0
    for t_A, t_B in zip(tokens_errored_in_tag, tokens_correct):
        if t_A != t_B:
            info['count_diff'] += 1

    return tokens_errored, tokens_correct, tokens_errored_in_tag, info
Example #5
def vectorize_file(path, vectorizer):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(path))

    result = []
    for ws, t in zip(spaces, tokens):
        result.append(vectorizer(ws, t))

    return result
Example #6
def build_vocabulary(files):
    count = {}
    tokenized_files = [
        jlu.tokenize_with_white_space(jlu.open_file(path)) for path in files
    ]
    whitespace_id = set()

    threshold = 30  # minimum number of occurrences for a token to get its own literal id

    for spaces, tokens in tokenized_files:
        whitespace_id = set(spaces) | whitespace_id
        for token in tokens:
            name = get_token_value(token)
            if name not in count:
                count[name] = 0
            count[name] += 1

    # keep only tokens that occur at least `threshold` times as named literals
    literals = [key for key in count if count[key] >= threshold]
    literals = {key: index for index, key in enumerate(literals)}

    whitespace_id = {key: index for index, key in enumerate(whitespace_id)}

    len_literals = len(literals)
    len_whitespace = len(whitespace_id)
    # one slot per known literal, one for out-of-vocabulary tokens, one per whitespace class
    vec_size = len_literals + 1 + len_whitespace

    def get_vector(space, token):
        vector = np.array([0] * vec_size)
        if get_token_value(token) in literals:
            vector[literals[get_token_value(token)]] = 1
        else:
            vector[len_literals] = 1
        vector[len_literals + 1 + whitespace_id[space]] = 1
        return vector

    print(literals.keys())

    return get_vector, whitespace_id
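The get_vector closure returned here is what vectorize_file in Example #5 expects as its vectorizer, so the two functions are typically combined as below (file paths hypothetical).

get_vector, whitespace_id = build_vocabulary(['./A.java', './B.java'])
vectors = vectorize_file('./C.java', get_vector)
# one one-hot vector per (whitespace, token) pair: a slot for the literal
# (or the shared out-of-vocabulary slot) plus a slot for the whitespace class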
Example #7
def match_input_to_source(source, error_info, input):
    whitespace, tokens = jlu.tokenize_with_white_space(source)
    start = error_info['start']
    end = error_info['end']

    sub_sequence = tokens[start:end]
    ws_sub_sequence = whitespace[start:end]

    result = []
    count = 0
    ws_count = 0
    for input_token in input.split(' '):
        if token_utils.is_whitespace_token(input_token):
            result.append(
                (input_token, get_space_value(ws_sub_sequence[ws_count])))
            ws_count += 1
        elif input_token.startswith('<') and input_token.endswith('>'):
            result.append((input_token, input_token))
        else:
            result.append((input_token, sub_sequence[count].value))
            count += 1

    return result
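Hedged sketch of the alignment above; info and tokens_errored are the values returned by tokenize_file_to_repair (Example #8 below), and errored_source is the text of the errored file, all assumed to be in scope. Only the 'start' and 'end' keys of info are read.

pairs = match_input_to_source(errored_source, info, ' '.join(tokens_errored))
for abstract_token, concrete_value in pairs:
    print(abstract_token, '->', concrete_value)  # abstract model token back to its source value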
Example #8
def tokenize_file_to_repair(file_path, error):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_path))

    info = {}

    token_started = False
    token_line_start = -1
    token_line_end = -1
    count = 0

    tokens_errored = []
    n_lines = 6  # lines of context kept on each side of the error line

    start = len(tokens)
    end = 0

    from_token = 0
    to_token = 0

    for token, space in zip(tokens, spaces):
        if (int(error['line']) - n_lines <= token.position[0]
                <= int(error['line']) + n_lines):
            start = min(count, start)
            end = max(count, end)
        if not token_started and int(error['line']) == token.position[0]:
            token_started = True
            token_line_start = count
        if token_started and int(error['line']) < token.position[0]:
            token_started = False
            token_line_end = count
        count += 1
    start = max(0, start - 2)  # pad the context window by two tokens on each side
    end = min(len(tokens), end + 2)
    if token_line_end == -1:
        token_line_end = token_line_start

    # print(error)

    if 'column' in error and error['type'] != 'OneStatementPerLine':
        # the error carries a column: centre the repair window on the last token
        # found at or before that (line, column) position
        errored_token_index = -1
        around = 10
        for index, token in enumerate(tokens):
            if token.position[0] <= int(
                    error['line']) and token.position[1] <= int(
                        error['column']):
                errored_token_index = index
        from_token = max(0, errored_token_index - around)
        to_token = min(len(tokens), errored_token_index + 1 + around)
    else:
        around = 2
        around_after = 13
        errored_token_index = -1
        if token_line_start != -1:
            from_token = max(start, token_line_start - around)
            to_token = min(end, token_line_end + around_after + 1)
        else:
            errored_token_index = -1
            around = 2
            around_after = 18
            for index, token in enumerate(tokens):
                if token.position[0] < int(error['line']):
                    errored_token_index = index
            from_token = max(0, errored_token_index - around)
            to_token = min(len(tokens), errored_token_index + 1 + around_after)
    tokens_errored_in_tag = []
    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_errored_in_tag.append(get_token_value(token))
        tokens_errored_in_tag.append(get_space_value(space))

    for token, space in zip(tokens[start:from_token],
                            spaces[start:from_token]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))
    tokens_errored.append(f'<{error["type"]}>')
    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))
    tokens_errored.append(f'</{error["type"]}>')
    for token, space in zip(tokens[to_token:end], spaces[to_token:end]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))

    info['from_token'] = from_token
    info['to_token'] = to_token
    info['start'] = start
    info['end'] = end
    info['error'] = error
    info['tokens_errored_in_tag'] = tokens_errored_in_tag

    return tokens_errored, info
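The function builds two nested windows: [start, end) is the ±n_lines context that becomes tokens_errored, while [from_token, to_token) is the narrower slice wrapped in the <ErrorType> tags that the model is asked to rewrite. A hedged inspection sketch, with the path and error values made up:

tokens_errored, info = tokenize_file_to_repair(
    './Foo.java', {'line': '42', 'column': '17', 'type': 'WhitespaceAround'})
print(info['start'], info['from_token'], info['to_token'], info['end'])
print(' '.join(info['tokens_errored_in_tag']))  # the slice the model must repair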
Example #9
def gen_errored(corpus, get_random_corpus_file, repo_name, goal, id,
                target_dir):
    DEBUG = False
    folder = os.path.join(target_dir, f'./{goal}/{id}')
    file = get_random_corpus_file(goal)
    file_dir = file[2]
    file_name = file[0].split('.')[0]
    done = False
    error = None
    ugly_file = ""
    max_attempts = 10
    attempts = 0
    while not done:
        if attempts >= max_attempts:  # ugly, but it prevents the loop from getting stuck on a bad file
            file = get_random_corpus_file(goal)
            file_dir = file[2]
            file_name = file[0].split('.')[0]
            attempts = 0
            continue
        if os.path.exists(folder):
            shutil.rmtree(folder)
        create_dir(folder)
        injection_operator = random.choice(
            list(injection_operator_types.keys()))
        ugly_file = os.path.join(folder, f'./{file_name}.java')
        modification = jlu.gen_ugly(
            file_dir,
            folder,
            modification_number=injection_operator_types[injection_operator])
        if DEBUG:
            print(modification)
        if not jlu.check_well_formed(ugly_file):
            if DEBUG:
                print('Not well formed')
            attempts += 1
            continue
        try:
            cs_result, number_of_errors = checkstyle.check(
                corpus.checkstyle, ugly_file)
        except Exception:
            if DEBUG:
                print('Cannot run checkstyle')
            attempts += 1
            continue
        if number_of_errors != 1:
            if DEBUG:
                print(f'{number_of_errors} errors')
            attempts += 1
            continue
        spaces_original, tokens_original = jlu.tokenize_with_white_space(
            open_file(file_dir))
        spaces_errored, tokens_errored = jlu.tokenize_with_white_space(
            open_file(ugly_file))
        if len(tokens_original) != len(tokens_errored):
            if DEBUG:
                print(
                    f'Not the same length : orig {len(tokens_original)} vs {len(tokens_errored)}'
                )
            attempts += 1
            continue
        error = list(cs_result.values())[0]['errors'][0]
        done = True

    original_file = os.path.join(folder, f'./{file_name}-orig.java')
    if file_dir != original_file:
        shutil.copyfile(file_dir, original_file)
    save_file(folder, 'diff.diff', run_diff(original_file, ugly_file))

    report = {}
    report['injection_operator'] = injection_operator
    report['line'] = error['line']
    if 'column' in error:
        report['column'] = error['column']
    report['message'] = error['message']
    report['type'] = error['source'].split('.')[-1][:-5]  # drop the trailing "Check" from the checkstyle check name

    save_json(folder, 'metadata.json', report)
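For every generated error the target folder ends up with the errored file, a copy of the original, a diff.diff and a metadata.json holding the report built above. A hedged call sketch, with all argument values hypothetical:

gen_errored(corpus, get_random_corpus_file, 'commons-lang', 'training', 0, './dataset')
# ./dataset/training/0/ then contains <name>.java (errored), <name>-orig.java,
# diff.diff and metadata.json with injection_operator, line, optional column,
# message and the checkstyle rule type.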