Beispiel #1
0
def get_messages_from_log(commits_log: Path) -> List[Message]:
    """Read a commit log file and return the message of every line.

    Each line of *commits_log* is parsed as a ``CommitLogLine`` using the
    shared ``COMMON_SEP`` separator; only the message field is kept.
    """
    with open(commits_log, 'r') as log_file:
        return [
            CommitLogLine.parse_from_line(raw_line, separator=COMMON_SEP).message
            for raw_line in log_file
        ]
Beispiel #2
0
def messages_vs_author(commits_log: Path) -> Dict[int, Tuple[str, Message]]:
    """Count identical (author, normalized message) pairs in a commit log.

    Messages are normalized by re-joining the tokens produced by
    ``split_commit_message``. The counts dict is inverted with
    ``invert_dict`` and returned ordered from the highest count down.
    """
    pair_counts: Dict[Tuple[str, Message], int] = collections.defaultdict(int)

    with open(commits_log, 'r') as log_file:
        for raw_line in log_file:
            log_line = CommitLogLine.parse_from_line(raw_line,
                                                     separator=COMMON_SEP)
            normalized = Message(
                " ".join(split_commit_message(log_line.message)))
            pair_counts[(log_line.author, normalized)] += 1

    count_vs_pair = invert_dict(pair_counts)
    return collections.OrderedDict(
        sorted(count_vs_pair.items(), reverse=True))
def commit_msg_tokenizing_aurora(com_log: Path):
    """Print message- and word-frequency statistics for an aurora commit log.

    Skips the header line and a fixed set of auto-generated messages, counts
    each lower-cased message, then fits a ``Tokenizer`` on the raw messages
    and prints the 20 most frequent words.

    :param com_log: path to the aurora commit log file
    """
    # Auto-generated / empty messages that carry no signal. Note the
    # comparison is against the RAW message here (pre-lowering), matching
    # the original filter.
    skipped_messages = {
        "no message",
        "New version",
        "Build completed",
        "Build failed",
        "*** empty log message ***",
    }

    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []

    with open(com_log, 'r') as f:
        for line in f:
            # First line of the log is a column header, not a commit.
            if line.startswith("parent_commit_file_hash"):
                continue

            com_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            message: Message = com_line.message
            if message in skipped_messages:
                continue

            msgs.append(message)
            # Counting is case-insensitive, tokenizing keeps original case.
            lowered = Message(message.lower())
            msg_vs_counts[lowered] = msg_vs_counts.get(lowered, 0) + 1

    print(f"Unique message number {len(msg_vs_counts)}")

    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(
        sorted(counts_vs_msg.items(), reverse=True))

    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)

    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(
        sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(
        itertools.islice(counts_vs_word_sorted.items(), 0, 20))
    print(top_popular_words)
Beispiel #4
0
def get_commits_for_train_aurora(com_log: Path, empty_commits_file: Path,
                                 is_pickle: bool, commits_for_train_file: Path,
                                 new_com_log: Path):
    """Filter an aurora commit log down to commits usable for training.

    Drops the header line, commits listed in *empty_commits_file*, and
    auto-generated / empty messages. Every surviving line is copied to
    *new_com_log*; commits whose message tokenizes to at least one word are
    collected and written to *commits_for_train_file* (pickled when
    *is_pickle* is true, one commit per line otherwise).

    :param com_log: path to the original commit log
    :param empty_commits_file: pickle file with a set of commits to skip
    :param is_pickle: write the result as pickle instead of plain text
    :param commits_for_train_file: output path for the selected commits
    :param new_com_log: output path for the filtered commit log
    """
    # NOTE(review): pickle.load assumes this file is trusted input.
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)

    # Messages the "builder" bot produces, vs. noise from any author.
    builder_noise = {"new version", "build completed", "build failed"}
    generic_noise = {"no message", "*** empty log message ***"}

    needed_commits: Set[Commit] = set()

    with open(new_com_log, 'w') as new_log_file, open(com_log,
                                                      'r') as old_log_file:
        for line in old_log_file:
            # Skip the column-header line.
            if line.startswith("parent_commit_file_hash"):
                continue

            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line)
            if commit_log_line.current_commit in empty_commits:
                continue

            message = Message(commit_log_line.message.lower())
            if commit_log_line.author == "builder" and message in builder_noise:
                continue
            if message in generic_noise or \
                    message.startswith("this commit was manufactured by cvs2svn"):
                continue

            # Only commits with at least one real word are kept for training,
            # but every line that passed the filters goes into the new log.
            if text_to_word_sequence(message):
                needed_commits.add(commit_log_line.current_commit)
            new_log_file.write(line)

    print(f"Number of needed commits {len(needed_commits)}")

    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
def commit_msg_tokenizing_camel(com_log: Path, full_log: Path):
    """Print message- and word-frequency statistics for a camel commit log.

    Only commits present in *full_log* are considered. Counts each
    lower-cased message, then fits a ``Tokenizer`` on the raw messages and
    prints the 30 most frequent words.

    :param com_log: path to the camel commit log file
    :param full_log: path to the full log used to select relevant commits
    """
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []

    commits_from_full_log: Set[Commit] = get_commits_from_full_log(full_log)

    with open(com_log, 'r') as f:
        for line in f:
            # Skip the column-header line.
            if line.startswith("parent_commit_file_hash"):
                continue

            com_log_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            if com_log_line.current_commit not in commits_from_full_log:
                continue

            message: Message = com_log_line.message
            msgs.append(message)
            # Counting is case-insensitive, tokenizing keeps original case.
            lowered = Message(message.lower())
            msg_vs_counts[lowered] = msg_vs_counts.get(lowered, 0) + 1

    print(f"Unique message number {len(msg_vs_counts)}")

    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(
        sorted(counts_vs_msg.items(), reverse=True))

    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)

    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(
        sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(
        itertools.islice(counts_vs_word_sorted.items(), 0, 30))
    print(top_popular_words)
Beispiel #6
0
def get_commits_for_train_intellij(com_log: Path, empty_commits_file: Path,
                                   is_pickle: bool,
                                   commits_for_train_file: Path,
                                   new_com_log: Path):
    """Filter an intellij commit log down to commits usable for training.

    Drops the header line, commits listed in *empty_commits_file*, and the
    "(no message)" placeholder. Every surviving line is copied to
    *new_com_log*; commits whose message tokenizes to at least one word are
    collected and written to *commits_for_train_file* (pickled when
    *is_pickle* is true, one commit per line otherwise).

    :param com_log: path to the original commit log
    :param empty_commits_file: pickle file with a set of commits to skip
    :param is_pickle: write the result as pickle instead of plain text
    :param commits_for_train_file: output path for the selected commits
    :param new_com_log: output path for the filtered commit log
    """
    # NOTE(review): pickle.load assumes this file is trusted input.
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)

    needed_commits: Set[Commit] = set()

    with open(new_com_log, 'w') as new_com_log_file, open(com_log,
                                                          'r') as com_log_file:
        for line in com_log_file:
            # Skip the column-header line.
            if line.startswith("parent_commit_file_hash"):
                continue

            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line)
            if commit_log_line.current_commit in empty_commits:
                continue

            message = Message(commit_log_line.message.lower())
            if message == "(no message)":
                continue

            # Only commits with at least one real word are kept for training,
            # but every line that passed the filters goes into the new log.
            if text_to_word_sequence(message):
                needed_commits.add(commit_log_line.current_commit)
            new_com_log_file.write(line)

    print(f"Number of needed commits {len(needed_commits)}")

    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
Beispiel #7
0
def get_changed_java_files_log(git_dir: Path, output: Path, commits_log: Path):
    """Write a log of changed .java files for each commit in *commits_log*.

    For every commit line, runs ``git diff-tree`` (via
    ``run_and_parse_diff_tree``), keeps only Java files, and appends one
    formatted line per changed file to *output*. Stops once more than 7400
    commits with Java changes have been processed.

    :param git_dir: path to the git repository
    :param output: destination file for the changed-files log
    :param commits_log: source commit log (one commit per line)
    """
    # Count lines in pure Python instead of shelling out to `wc -l`; this is
    # portable and produces a real int (the subprocess version returned a
    # str despite the int annotation).
    with open(commits_log, 'r', encoding='utf-8') as commits_log_f:
        total_commit_number: int = sum(1 for _ in commits_log_f)

    commits_with_changed_java_files_number: int = 0
    commit_number: int = 0
    with open(output, 'w', encoding='utf-8') as output_f, open(
            commits_log, 'r', encoding='utf-8') as commits_log_f:
        for line in commits_log_f:
            commit_number += 1
            if commit_number % 1000 == 0:
                print(
                    f'Start to process {commit_number} commit from {total_commit_number}'
                )
            # Hard cap on output size.
            if commits_with_changed_java_files_number > 7_400:
                break
            commits_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line, separator=COMMON_SEP)
            changed_files: List[ChangedFile] = run_and_parse_diff_tree(
                commits_log_line.parent_commit,
                commits_log_line.current_commit, git_dir)
            changed_files = list(filter(is_java_file, changed_files))
            if changed_files:
                commits_with_changed_java_files_number += 1
                for changed_file in changed_files:
                    output_f.write(
                        changed_files_log_line.substitute(
                            commit_hash=commits_log_line.current_commit,
                            author=commits_log_line.author,
                            status=changed_file.status,
                            file_name=changed_file.file_name,
                            old_blob=changed_file.old_blob,
                            new_blob=changed_file.new_blob,
                            message=commits_log_line.message,
                            sep=COMMON_SEP))
    print(
        f"Number of commits with changed java files is {commits_with_changed_java_files_number}"
    )