def write_commit_message_and_all_changed_functions(
        message: Message,
        changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]],
        max_functions_count_per_commit: int,
        output_file: TextIO) -> bool:
    """Write one dataset line (message + diff paths) for a commit.

    Samples at most ``max_functions_count_per_commit`` changed functions,
    collects their diff paths (padded to a fixed width), and writes a
    ``dataset_line`` entry to ``output_file``.

    :param message: raw commit message
    :param changed_functions: set of (before, after) function pairs
    :param max_functions_count_per_commit: cap on functions kept per commit
    :param output_file: open text handle the dataset line is appended to
    :return: True if a line was written, False if the split message was empty
    """
    paths_max_number: int = 200
    max_paths_per_commit: int = paths_max_number * max_functions_count_per_commit
    functions: List[Tuple[FunctionInfo, FunctionInfo]] = list(changed_functions)
    if len(functions) > max_functions_count_per_commit:
        ind = np.random.choice(len(functions),
                               max_functions_count_per_commit,
                               replace=False)
        # BUGFIX: the original used list(itemgetter(*ind)(functions)).
        # With exactly one sampled index, itemgetter returns the bare
        # (before, after) tuple — not a 1-tuple — so list(...) wrongly
        # unpacked the pair into two FunctionInfo objects. An index
        # comprehension is correct for any sample size.
        functions = [functions[i] for i in ind]
    all_functions_paths: List[str] = get_all_diff_paths(functions, paths_max_number)
    # Pad to a constant number of paths per commit.
    all_functions_paths.extend(
        pad_paths_per_commit(max_paths_per_commit - len(all_functions_paths)))
    splitted_message: List[str] = split_commit_message(message)
    if splitted_message:
        output_file.write(
            dataset_line.substitute(target_message='|'.join(splitted_message),
                                    paths=' '.join(all_functions_paths)))
        return True
    return False
def messages_frequency(messages: List[Message]) -> Dict[int, Message]:
    """Build a mapping from occurrence count to message, descending by count.

    Each message is normalized (split into tokens and re-joined with single
    spaces) before counting, so differently-whitespaced duplicates collapse.

    :param messages: raw commit messages
    :return: OrderedDict produced by inverting the message->count table and
             sorting by count in descending order
    """
    counts: Dict[Message, int] = {}
    for raw in messages:
        normalized = Message(" ".join(split_commit_message(raw)))
        counts[normalized] = counts.get(normalized, 0) + 1
    inverted = invert_dict(counts)
    ordered = sorted(inverted.items(), reverse=True)
    return collections.OrderedDict(ordered)
def _normalize_commit_diffs(commits: List[CommitDiff]) -> None:
    """Normalize each commit in place: trim + tokenize every file's common diff.

    NOTE(review): mirrors the original per-file behavior, including
    re-normalizing ``commit.message`` once per changed file (a commit with no
    changed files keeps its message untouched, exactly as before).
    """
    for commit in tqdm(commits):
        for i, _changed_file in enumerate(commit.changed_java_files):
            commit.message = " ".join(split_commit_message(commit.message))
            trimmed = keep_only_needed_number_of_line_around_changes(
                commit.changed_java_files[i].diff_body_common)
            commit.changed_java_files[i].diff_body_common = tokenize_common_diff(trimmed)


def get_common_diff_from_two_input(split_log: Path, output_file: Path) -> None:
    """Load a TRAIN/TEST/VAL split log, normalize all diffs, and dump as JSON.

    :param split_log: path to a JSON file with 'TRAIN', 'TEST', 'VAL' commit lists
    :param output_file: path the processed split is serialized to
    """
    with open(split_log, 'r') as f:
        data = json.load(f)
    print(data['TRAIN'][0])
    train_data = [CommitDiff.from_dict(commit) for commit in data['TRAIN']]
    test_data = [CommitDiff.from_dict(commit) for commit in data['TEST']]
    val_data = [CommitDiff.from_dict(commit) for commit in data['VAL']]
    print(f'train {len(train_data)}, test {len(test_data)}, val {len(val_data)}')
    # The original repeated this loop verbatim for each split; one helper
    # replaces the three copy-pasted blocks.
    for dataset in (train_data, test_data, val_data):
        _normalize_commit_diffs(dataset)
    with open(output_file, 'w') as f:
        f.write(json.dumps({'TRAIN': train_data, 'TEST': test_data, 'VAL': val_data},
                           default=CommitDiff.to_json,
                           indent=2))
def messages_vs_author(commits_log: Path) -> Dict[int, Tuple[str, Message]]:
    """Tally (author, normalized message) pairs from a commit log.

    Reads the log line by line, normalizes each message (tokens re-joined
    with single spaces), counts occurrences per (author, message) pair, and
    returns the inverted count->pair mapping sorted by count, descending.

    :param commits_log: path to the commit log file
    :return: OrderedDict keyed by occurrence count
    """
    pair_counts: Dict[Tuple[str, Message], int] = collections.defaultdict(int)
    with open(commits_log, 'r') as log_file:
        for raw_line in log_file:
            parsed = CommitLogLine.parse_from_line(raw_line, separator=COMMON_SEP)
            normalized = Message(" ".join(split_commit_message(parsed.message)))
            pair_counts[(parsed.author, normalized)] += 1
    inverted = invert_dict(pair_counts)
    return collections.OrderedDict(sorted(inverted.items(), reverse=True))
def filter_by_clipping_commit_message_len(commit: CommitDiff,
                                          min_message_tokens: int,
                                          max_message_tokens: int) -> bool:
    """Return True iff the commit message token count lies in the inclusive range.

    :param commit: commit whose message is checked
    :param min_message_tokens: lower bound (inclusive)
    :param max_message_tokens: upper bound (inclusive)
    """
    token_count = len(split_commit_message(commit.message))
    return min_message_tokens <= token_count <= max_message_tokens