def get_messages_from_log(commits_log: Path) -> List[Message]:
    """Read a commits log file and return the message field of every line.

    Each line of *commits_log* is parsed with ``CommitLogLine.parse_from_line``
    using the shared ``COMMON_SEP`` separator; only the message part is kept.
    """
    with open(commits_log, 'r') as log_file:
        return [
            CommitLogLine.parse_from_line(raw_line, separator=COMMON_SEP).message
            for raw_line in log_file
        ]
def messages_vs_author(commits_log: Path) -> Dict[int, Tuple[str, Message]]:
    """Count (author, normalized message) pairs in *commits_log*.

    Messages are normalized by joining the tokens from
    ``split_commit_message`` with single spaces. The per-pair counts are
    inverted with ``invert_dict`` and returned as an ``OrderedDict`` sorted
    by count in descending order.
    """
    pair_counts: Dict[Tuple[str, Message], int] = collections.defaultdict(int)
    with open(commits_log, 'r') as log_file:
        for raw_line in log_file:
            parsed: CommitLogLine = CommitLogLine.parse_from_line(
                raw_line, separator=COMMON_SEP)
            normalized = Message(" ".join(split_commit_message(parsed.message)))
            pair_counts[(parsed.author, normalized)] += 1
    by_count = invert_dict(pair_counts)
    ordered = sorted(by_count.items(), reverse=True)
    return collections.OrderedDict(ordered)
def commit_msg_tokenizing_aurora(com_log: Path):
    """Print message and word frequency statistics for the aurora commit log.

    Skips the header line and a fixed set of boilerplate messages, counts
    lower-cased messages, prints every count bucket, then tokenizes the
    (original-case) messages with ``Tokenizer`` and prints the 20 most
    frequent words.
    """
    # Boilerplate messages that carry no information for training.
    skipped_messages = {
        "no message",
        "New version",
        "Build completed",
        "Build failed",
        "*** empty log message ***",
    }
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []
    with open(com_log, 'r') as f:
        for line in f:
            # First line of the log is a column header.
            if line.startswith("parent_commit_file_hash"):
                continue
            com_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            message: Message = com_line.message
            if message in skipped_messages:
                continue
            msgs.append(message)
            # Counting is case-insensitive; tokenizing keeps original case.
            lowered: Message = Message(message.lower())
            msg_vs_counts[lowered] = msg_vs_counts.get(lowered, 0) + 1
    print(f"Unique message number {len(msg_vs_counts)}")
    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(
        sorted(counts_vs_msg.items(), reverse=True))
    # (The previous version also built an unused top-20 slice here; removed.)
    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)
    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(
        sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(
        itertools.islice(counts_vs_word_sorted.items(), 0, 20))
    print(top_popular_words)
def get_commits_for_train_aurora(com_log: Path, empty_commits_file: Path,
                                 is_pickle: bool, commits_for_train_file: Path,
                                 new_com_log: Path):
    """Filter the aurora commit log down to commits usable for training.

    Commits are dropped when they are in the pickled *empty_commits_file*
    set, carry builder/CVS boilerplate messages, or tokenize to nothing.
    Surviving log lines are copied to *new_com_log*; the surviving commit
    hashes are written to *commits_for_train_file* (pickled when
    *is_pickle* is true, otherwise one hash per text line).
    """
    # Messages skipped only when the author is the automated "builder".
    builder_noise = {"new version", "build completed", "build failed"}
    # Messages skipped regardless of author.
    generic_noise = {"no message", "*** empty log message ***"}
    needed_commits: Set[Commit] = set()
    # NOTE: pickle.load must only be used on trusted, locally produced files.
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)
    with open(new_com_log, 'w') as new_log_file, \
            open(com_log, 'r') as old_log_file:
        for line in old_log_file:
            # First line of the log is a column header.
            if line.startswith("parent_commit_file_hash"):
                continue
            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line)
            if commit_log_line.current_commit in empty_commits:
                continue
            message = Message(commit_log_line.message.lower())
            if commit_log_line.author == "builder" and message in builder_noise:
                continue
            if message in generic_noise or \
                    message.startswith("this commit was manufactured by cvs2svn"):
                continue
            # Keep only commits whose message survives tokenization.
            if text_to_word_sequence(message):
                needed_commits.add(commit_log_line.current_commit)
                new_log_file.write(f"{line}")
    print(f"Number of needed commits {len(needed_commits)}")
    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
def commit_msg_tokenizing_camel(com_log: Path, full_log: Path):
    """Print message and word frequency statistics for the camel commit log.

    Only commits also present in *full_log* (via
    ``get_commits_from_full_log``) are considered. Lower-cased messages are
    counted and every count bucket printed; then the (original-case)
    messages are tokenized with ``Tokenizer`` and the 30 most frequent
    words printed.
    """
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []
    commits_from_full_log: Set[Commit] = get_commits_from_full_log(full_log)
    with open(com_log, 'r') as f:
        for line in f:
            # First line of the log is a column header.
            if line.startswith("parent_commit_file_hash"):
                continue
            com_log_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            if com_log_line.current_commit not in commits_from_full_log:
                continue
            message: Message = com_log_line.message
            msgs.append(message)
            # Counting is case-insensitive; tokenizing keeps original case.
            lowered: Message = Message(message.lower())
            msg_vs_counts[lowered] = msg_vs_counts.get(lowered, 0) + 1
    print(f"Unique message number {len(msg_vs_counts)}")
    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(
        sorted(counts_vs_msg.items(), reverse=True))
    # (The previous version also built an unused top-20 slice here; removed.)
    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)
    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(
        sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(
        itertools.islice(counts_vs_word_sorted.items(), 0, 30))
    print(top_popular_words)
def get_commits_for_train_intellij(com_log: Path, empty_commits_file: Path,
                                   is_pickle: bool,
                                   commits_for_train_file: Path,
                                   new_com_log: Path):
    """Filter the intellij commit log down to commits usable for training.

    Commits are dropped when they are in the pickled *empty_commits_file*
    set, have the "(no message)" placeholder, or tokenize to nothing.
    Surviving log lines are copied to *new_com_log*; the surviving commit
    hashes are written to *commits_for_train_file* (pickled when
    *is_pickle* is true, otherwise one hash per text line).
    """
    needed_commits: Set[Commit] = set()
    # NOTE: pickle.load must only be used on trusted, locally produced files.
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)
    with open(new_com_log, 'w') as new_com_log_file, \
            open(com_log, 'r') as com_log_file:
        for line in com_log_file:
            # First line of the log is a column header.
            if line.startswith("parent_commit_file_hash"):
                continue
            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line)
            if commit_log_line.current_commit in empty_commits:
                continue
            message = Message(commit_log_line.message.lower())
            if message == "(no message)":
                continue
            # Keep only commits whose message survives tokenization.
            if text_to_word_sequence(message):
                needed_commits.add(commit_log_line.current_commit)
                new_com_log_file.write(f"{line}")
    print(f"Number of needed commits {len(needed_commits)}")
    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
def get_changed_java_files_log(git_dir: Path, output: Path,
                               commits_log: Path):
    """Write a changed-files log for commits that touch Java files.

    For every commit line in *commits_log*, runs ``git diff-tree`` (via
    ``run_and_parse_diff_tree``) between parent and current commit, keeps
    only Java files, and appends one formatted line per changed file to
    *output*. Stops after 7,400 commits with Java changes.
    """
    # `wc -l` prints "<count> <path>"; convert the count to an int
    # (previously it was left as a str despite the int annotation).
    # Passing an argv list also keeps paths with spaces intact.
    wc_output = subprocess.check_output(
        ['wc', '-l', str(commits_log)]).decode()
    total_commit_number: int = int(wc_output.split()[0])
    commits_with_changed_java_files_number: int = 0
    commit_number: int = 0
    with open(output, 'w', encoding='utf-8') as output_f, \
            open(commits_log, 'r', encoding='utf-8') as commits_log_f:
        for line in commits_log_f:
            commit_number += 1
            if commit_number % 1000 == 0:
                print(
                    f'Start to process {commit_number} commit from {total_commit_number}'
                )
            # Hard cap on the size of the produced dataset.
            if commits_with_changed_java_files_number > 7_400:
                break
            commits_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line, separator=COMMON_SEP)
            changed_files: List[ChangedFile] = run_and_parse_diff_tree(
                commits_log_line.parent_commit,
                commits_log_line.current_commit, git_dir)
            changed_files = list(filter(is_java_file, changed_files))
            if changed_files:
                commits_with_changed_java_files_number += 1
                for changed_file in changed_files:
                    output_f.write(
                        changed_files_log_line.substitute(
                            commit_hash=commits_log_line.current_commit,
                            author=commits_log_line.author,
                            status=changed_file.status,
                            file_name=changed_file.file_name,
                            old_blob=changed_file.old_blob,
                            new_blob=changed_file.new_blob,
                            message=commits_log_line.message,
                            sep=COMMON_SEP))
    print(
        f"Number of commits with changed java files is {commits_with_changed_java_files_number}"
    )