Example #1
def write_commit_message_and_all_changed_functions(
        message: Message, changed_functions: Set[Tuple[FunctionInfo,
                                                       FunctionInfo]],
        max_functions_count_per_commit: int, output_file: TextIO) -> bool:
    paths_max_number: int = 200
    max_paths_per_commit: int = paths_max_number * max_functions_count_per_commit
    actual_function_count: int = len(changed_functions)
    changed_functions: List[Tuple[FunctionInfo,
                                  FunctionInfo]] = list(changed_functions)

    if actual_function_count > max_functions_count_per_commit:
        # sub-sample without replacement; plain indexing avoids the
        # itemgetter corner case of a single selected function
        ind = np.random.choice(len(changed_functions),
                               max_functions_count_per_commit,
                               replace=False)
        changed_functions = [changed_functions[i] for i in ind]

    all_functions_paths: List[str] = get_all_diff_paths(
        changed_functions, paths_max_number)
    all_functions_paths.extend(
        pad_paths_per_commit(max_paths_per_commit - len(all_functions_paths)))

    splitted_message: List[str] = split_commit_message(message)
    if splitted_message:
        output_file.write(
            dataset_line.substitute(target_message='|'.join(splitted_message),
                                    paths=' '.join(all_functions_paths)))
        return True
    return False
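A minimal call sketch for the function above. The values commit_msg and changed are placeholders for a Message and a Set[Tuple[FunctionInfo, FunctionInfo]] produced elsewhere in the pipeline, and the output file name is made up; dataset_line, get_all_diff_paths and pad_paths_per_commit are assumed to live in the same module as the function.

# hypothetical usage: write one dataset line for a single commit
with open('train_dataset.txt', 'w') as out_f:
    written = write_commit_message_and_all_changed_functions(
        message=commit_msg,                  # Message for the current commit
        changed_functions=changed,           # changed function pairs of the commit
        max_functions_count_per_commit=10,
        output_file=out_f)
    if not written:
        print('commit skipped: message is empty after splitting')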
Example #2
def messages_frequency(messages: List[Message]) -> Dict[int, Message]:
    message_vs_count: Dict[Message, int] = collections.defaultdict(int)

    for message in messages:
        message_vs_count[Message(" ".join(split_commit_message(message)))] += 1

    count_vs_msg = invert_dict(message_vs_count)
    return collections.OrderedDict(sorted(count_vs_msg.items(), reverse=True))
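A short usage sketch, assuming invert_dict maps each count to the message(s) occurring that many times; all_messages stands in for a List[Message] loaded elsewhere.

# hypothetical usage: show the ten most frequent normalised messages
freq = messages_frequency(all_messages)
for count, message in list(freq.items())[:10]:
    print(f'{count}: {message}')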
Example #3
def get_common_diff_from_two_input(split_log: Path, output_file: Path) -> None:
    with open(split_log, 'r') as f:
        data = json.load(f)
    print(data['TRAIN'][0])
    train_data = [CommitDiff.from_dict(commit) for commit in data['TRAIN']]
    test_data = [CommitDiff.from_dict(commit) for commit in data['TEST']]
    val_data = [CommitDiff.from_dict(commit) for commit in data['VAL']]

    print(f'train {len(train_data)}, test {len(test_data)}, val {len(val_data)}')

    # identical preprocessing for every split: normalise the commit message
    # once per commit, then trim the common diff of each changed file to the
    # needed amount of context and tokenize it
    for split in (train_data, test_data, val_data):
        for commit in tqdm(split):
            commit.message = " ".join(split_commit_message(commit.message))
            for i, changed_file in enumerate(commit.changed_java_files):
                diff_body = keep_only_needed_number_of_line_around_changes(
                    commit.changed_java_files[i].diff_body_common)
                commit.changed_java_files[i].diff_body_common = tokenize_common_diff(diff_body)

    with open(output_file, 'w') as f:
        f.write(json.dumps({'TRAIN': train_data, 'TEST': test_data, 'VAL': val_data},
                           default=CommitDiff.to_json,
                           indent=2))
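A sketch of a small driver for the function above; both file names are placeholders.

# hypothetical driver: rewrite the split log with trimmed, tokenized diffs
from pathlib import Path

if __name__ == '__main__':
    get_common_diff_from_two_input(Path('splitted_commits.log'),
                                   Path('processed_commits.json'))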
Example #4
def messages_vs_author(commits_log: Path) -> Dict[int, Tuple[str, Message]]:
    message_vs_author_count: Dict[Tuple[str, Message],
                                  int] = collections.defaultdict(int)

    with open(commits_log, 'r') as f:
        for line in f:
            commits_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line, separator=COMMON_SEP)
            author = commits_log_line.author
            message = Message(" ".join(
                split_commit_message(commits_log_line.message)))
            message_vs_author_count[(author, message)] += 1

    count_vs_pair = invert_dict(message_vs_author_count)
    return collections.OrderedDict(sorted(count_vs_pair.items(), reverse=True))
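As with messages_frequency, the result can be scanned for the most repetitive (author, message) pairs, assuming invert_dict keeps one pair per count as the return annotation suggests; the log path below is a placeholder.

# hypothetical usage: top repeated (author, message) pairs in a commit log
from pathlib import Path

top_pairs = messages_vs_author(Path('commits.log'))
for count, (author, message) in list(top_pairs.items())[:5]:
    print(f'{author} repeated "{message}" {count} times')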
Example #5
def filter_by_clipping_commit_message_len(commit: CommitDiff,
                                          min_message_tokens: int,
                                          max_message_tokens: int) -> bool:
    return min_message_tokens <= len(split_commit_message(
        commit.message)) <= max_message_tokens
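A filtering sketch built on the predicate above; commits stands in for a list of already parsed CommitDiff objects, and the token bounds are arbitrary example values.

# hypothetical usage: keep commits whose messages are 2-30 tokens long
kept = [commit for commit in commits
        if filter_by_clipping_commit_message_len(commit,
                                                 min_message_tokens=2,
                                                 max_message_tokens=30)]
print(f'kept {len(kept)} of {len(commits)} commits')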