def get_data(ref_file: Path, pred_file: Path) -> Tuple[List[Message], List[Message]]:
    all_ref: List[Message] = []
    with open(ref_file, 'r') as r:
        for line in r:
            all_ref.append(Message(line))
    all_pred: List[Message] = []
    with open(pred_file, 'r') as pr:
        for line in pr:
            all_pred.append(Message(line))
    return all_ref, all_pred
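# Illustrative usage sketch (not part of the original module): the file names below are
# hypothetical, and Message is assumed to wrap a plain string read from each line.
if __name__ == "__main__":
    refs, preds = get_data(Path("ref_messages.txt"), Path("predicted_messages.txt"))
    # Both files are read line by line, so the two lists are index-aligned.
    print(len(refs), len(preds))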
def get_diffs(changed_files_log: Path, output: Path, context_size: int, git_dir: Path):
    commits_diffs: List[CommitDiff] = []
    commit_vs_blobs: Dict[Commit, List[FullLogLine]] = get_commit_vs_blobs(changed_files_log, sep=COMMON_SEP)
    print(len(commit_vs_blobs.keys()))

    i = 0
    for commit, changed_files in tqdm(commit_vs_blobs.items()):
        i += 1
        # if i % 1000 == 0:
        #     print(f"At {i}")
        # if i > 100:
        #     break
        message = Message(changed_files[0].message)
        author = changed_files[0].author
        files_diffs = get_all_diffs_per_commit(changed_files, context_size, git_dir)
        for file_diff in files_diffs:
            file_diff.delete_useless_git_diff_output()
            file_diff.tokenize_each_line_of_diff_body()
        commits_diffs.append(CommitDiff(commit=commit, message=message, changed_java_files=files_diffs))

    with open(output, 'w', encoding='utf-8') as output_f:
        output_f.write(json.dumps(commits_diffs, default=CommitDiff.to_json, indent=2))
def messages_frequency(messages: List[Message]) -> Dict[int, List[Message]]:
    message_vs_count: Dict[Message, int] = collections.defaultdict(int)
    for message in messages:
        message_vs_count[Message(" ".join(split_commit_message(message)))] += 1
    count_vs_msg = invert_dict(message_vs_count)
    return collections.OrderedDict(sorted(count_vs_msg.items(), reverse=True))
def get_commits_for_train_aurora(com_log: Path, empty_commits_file: Path, is_pickle: bool,
                                 commits_for_train_file: Path, new_com_log: Path):
    needed_commits: Set[Commit] = set()
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)

    with open(new_com_log, 'w') as new_log_file, open(com_log, 'r') as old_log_file:
        total_commit_number: int = 0
        for line in old_log_file:
            if line.startswith("parent_commit_file_hash"):
                continue
            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            if commit_log_line.current_commit in empty_commits:
                continue
            message = Message(commit_log_line.message.lower())
            if commit_log_line.author == "builder":
                if message == "new version" or \
                        message == "build completed" or \
                        message == "build failed":
                    continue
            if message == "no message" or \
                    message == "*** empty log message ***" or \
                    message.startswith("this commit was manufactured by cvs2svn"):
                continue
            text_list = text_to_word_sequence(message)
            if text_list:
                total_commit_number += 1
                needed_commits.add(commit_log_line.current_commit)
                new_log_file.write(f"{line}")

    print(f"Number of needed commits {len(needed_commits)}")
    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
def from_dict(input_) -> 'CommitDiff':
    if 'diff_body_common' in input_['changed_java_files'][0]:
        return CommitDiff(
            commit=Commit(input_['commit']),
            message=Message(input_['message']),
            # author=input_['author'],
            changed_java_files=[
                FileDiffWithTwoInput.from_dict(file)
                for file in input_['changed_java_files']
            ],
            # is_there_dobj=input_['is_there_dobj'])
            is_there_dobj=True)
    else:
        return CommitDiff(
            commit=Commit(input_['commit']),
            message=Message(input_['message']),
            # author=input_['author'],
            changed_java_files=[
                FileDiff.from_dict(file)
                for file in input_['changed_java_files']
            ],
            is_there_dobj=input_['is_there_dobj'])
def replace_target_with_message(paths_file, common_log_file, output_dir):
    print('Start parsing paths file')
    a = time()
    blobs_positions: Dict[Blob, List[Tuple[int, int]]] = get_blobs_positions(paths_file)
    b = time()
    print('Finished parsing paths file')
    print(f'Number of blobs is {len(blobs_positions)}, Time {b - a}')

    repo_commit_vs_repo_blobs: Dict[Commit, List[FullLogLine]] = get_repo_commit_vs_repo_blobs(common_log_file,
                                                                                                sep=COMMON_SEP)
    output_file: Path = output_dir.joinpath('full_dataset.txt')
    output_log: Path = output_dir.joinpath('c2s_commits.log')
    len_changed_functions: List[int] = []

    i = 0
    with open(output_file, 'w') as out_f, open(output_log, 'w') as out_l, open(paths_file, 'rb') as paths_f:
        for commit, changed_files in tqdm(repo_commit_vs_repo_blobs.items()):
            i += 1
            if i % 1000 == 0:
                print(f"mean = {np.mean(len_changed_functions)}, "
                      f"median = {np.median(len_changed_functions)}\n"
                      f"60 percentile = {np.percentile(np.array(len_changed_functions), 60)}\n"
                      f"70 percentile = {np.percentile(np.array(len_changed_functions), 70)}\n"
                      f"80 percentile = {np.percentile(np.array(len_changed_functions), 80)}\n"
                      f"90 percentile = {np.percentile(np.array(len_changed_functions), 90)}")

            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for changed_file in changed_files:
                changed_functions |= compare_two_blobs(
                    BlobPositions(changed_file.old_blob, blobs_positions[changed_file.old_blob]),
                    BlobPositions(changed_file.new_blob, blobs_positions[changed_file.new_blob]),
                    paths_f)
                # if '0000000000000000000000000000000000000000' in changed_file.new_blob or \
                #         '0000000000000000000000000000000000000000' in changed_file.old_blob:
                #     print(f'Commit: {commit}, #changed functions {len(changed_functions)}')

            len_changed_functions.append(len(changed_functions))
            if len(changed_functions) > 0:
                message = Message(repo_commit_vs_repo_blobs[commit][0].message)
                if write_commit_message_and_all_changed_functions(message, changed_functions, 4, out_f):
                    out_l.write(f'{commit}\n')
def messages_vs_author(commits_log: Path) -> Dict[int, List[Tuple[str, Message]]]:
    message_vs_author_count: Dict[Tuple[str, Message], int] = collections.defaultdict(int)
    with open(commits_log, 'r') as f:
        for line in f:
            commits_log_line: CommitLogLine = CommitLogLine.parse_from_line(line, separator=COMMON_SEP)
            author = commits_log_line.author
            message = Message(" ".join(split_commit_message(commits_log_line.message)))
            message_vs_author_count[(author, message)] += 1
    count_vs_pair = invert_dict(message_vs_author_count)
    return collections.OrderedDict(sorted(count_vs_pair.items(), reverse=True))
def commit_msg_tokenizing_aurora(com_log: Path):
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []
    with open(com_log, 'r') as f:
        for line in f:
            if line.startswith("parent_commit_file_hash"):
                continue
            com_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            message: Message = com_line.message
            if message == "no message" or \
                    message == "New version" or \
                    message == "Build completed" or \
                    message == "Build failed" or \
                    message == "*** empty log message ***":
                continue
            msgs.append(message)
            message = Message(message.lower())
            if message in msg_vs_counts:
                msg_vs_counts[message] += 1
            else:
                msg_vs_counts[message] = 1

    print(f"Unique message number {len(msg_vs_counts)}")
    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(sorted(counts_vs_msg.items(), reverse=True))
    top_popular_msg = dict(itertools.islice(counts_vs_msg_sorted.items(), 0, 20))
    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)
    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(itertools.islice(counts_vs_word_sorted.items(), 0, 20))
    print(top_popular_words)
def commit_msg_tokenizing_camel(com_log: Path, full_log: Path):
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []
    commits_from_full_log: Set[Commit] = get_commits_from_full_log(full_log)
    with open(com_log, 'r') as f:
        for line in f:
            if line.startswith("parent_commit_file_hash"):
                continue
            com_log_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            if com_log_line.current_commit in commits_from_full_log:
                message: Message = com_log_line.message
                msgs.append(message)
                message = Message(message.lower())
                if message in msg_vs_counts:
                    msg_vs_counts[message] += 1
                else:
                    msg_vs_counts[message] = 1

    print(f"Unique message number {len(msg_vs_counts)}")
    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(sorted(counts_vs_msg.items(), reverse=True))
    top_popular_msg = dict(itertools.islice(counts_vs_msg_sorted.items(), 0, 20))
    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)
    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(itertools.islice(counts_vs_word_sorted.items(), 0, 30))
    print(top_popular_words)
def get_commits_for_train_intellij(com_log: Path, empty_commits_file: Path, is_pickle: bool,
                                   commits_for_train_file: Path, new_com_log: Path):
    needed_commits: Set[Commit] = set()
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)

    with open(new_com_log, 'w') as new_com_log_file, open(com_log, 'r') as com_log_file:
        total_commit_number: int = 0
        for line in com_log_file:
            if line.startswith("parent_commit_file_hash"):
                continue
            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            if commit_log_line.current_commit in empty_commits:
                continue
            message = Message(commit_log_line.message.lower())
            if message == "(no message)":
                continue
            text_list = text_to_word_sequence(message)
            if text_list:
                total_commit_number += 1
                needed_commits.add(commit_log_line.current_commit)
                new_com_log_file.write(f"{line}")

    print(f"Number of needed commits {len(needed_commits)}")
    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
def __init__(self, commit_message: str, paths: Optional[List[str]] = None):
    self.commit_message: Message = Message(commit_message)
    # Guard against the default None so the comprehension never iterates over None.
    self.functions_paths: List[Code2SeqPath] = [
        Code2SeqPath(path) for path in (paths or [])
    ]
def replace_target_with_message(data: Path, full_log: Path, train: Path, test: Path, val: Path,
                                splitted_dataset_file: Path, max_functions_count_per_commit: int):
    print(data)
    with open(splitted_dataset_file, 'rb') as sdf:
        splitted_dataset: Dict[DatasetPart, Set[Commit]] = pickle.load(sdf)

    print(f"train: {len(splitted_dataset[DatasetPart.TRAIN])}, "
          f"test: {len(splitted_dataset[DatasetPart.TEST])}, "
          f"val: {len(splitted_dataset[DatasetPart.VAL])}")

    blobs_positions: Dict[Blob, List[Tuple[int, int]]] = get_blobs_positions(data)
    commit_vs_blobs: Dict[Commit, List[FullLogLine]] = get_commit_vs_blobs(full_log)

    i = 0
    tr, te, v, contin, z = 0, 0, 0, 0, 0
    len_changed_functions: List[int] = []
    with open(train, 'w') as train_f, open(test, 'w') as test_f, \
            open(val, 'w') as val_f, open(data, 'rb') as data_f:
        for commit, changed_files in commit_vs_blobs.items():
            i += 1
            if i % 100 == 0:
                print(f"At {i}, mean = {np.mean(len_changed_functions)}, "
                      f"median = {np.median(len_changed_functions)}\n"
                      f"60 percentile = {np.percentile(np.array(len_changed_functions), 60)}\n"
                      f"70 percentile = {np.percentile(np.array(len_changed_functions), 70)}\n"
                      f"80 percentile = {np.percentile(np.array(len_changed_functions), 80)}\n"
                      f"90 percentile = {np.percentile(np.array(len_changed_functions), 90)}")

            if commit in splitted_dataset[DatasetPart.TRAIN]:
                file = train_f
                tr += 1
            elif commit in splitted_dataset[DatasetPart.TEST]:
                file = test_f
                te += 1
            elif commit in splitted_dataset[DatasetPart.VAL]:
                file = val_f
                v += 1
            else:
                contin += 1
                continue

            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for changed_file in changed_files:
                changed_functions |= compare_two_blobs(
                    BlobPositions(changed_file.old_blob, blobs_positions[changed_file.old_blob]),
                    BlobPositions(changed_file.new_blob, blobs_positions[changed_file.new_blob]),
                    data_f)

            if len(changed_functions) > 0:
                z += 1
                len_changed_functions.append(len(changed_functions))
                message = Message(commit_vs_blobs[commit][0].message)
                write_commit_message_and_all_changed_functions(message, changed_functions,
                                                               max_functions_count_per_commit, file)

    print(f"50 percentile: {np.percentile(np.array(len_changed_functions), 50)}")
    print(f"60 percentile: {np.percentile(np.array(len_changed_functions), 60)}")
    print(f"70 percentile: {np.percentile(np.array(len_changed_functions), 70)}")
    print(f"80 percentile: {np.percentile(np.array(len_changed_functions), 80)}")
    print(f"90 percentile: {np.percentile(np.array(len_changed_functions), 90)}")
    print(f"95 percentile: {np.percentile(np.array(len_changed_functions), 95)}")
    print(f"train={tr}, test={te}, val={v}, continue {contin}, nnz = {z}")
    print(f"number of all commits = {len(commit_vs_blobs.keys())}")
def delete_key_words_in_message(message: Message, key_word: str) -> Message:
    raw_message = ' '.join(message.split('|'))
    cleaned_message = raw_message.replace(key_word, '')
    return Message('|'.join(cleaned_message.split(' ')))
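# Illustrative sketch (assumption: messages are stored as '|'-separated token strings,
# which is what the split('|') / join('|') round-trip above implies).
if __name__ == "__main__":
    example = Message("fix|bug|in|parser")
    # Removing the hypothetical keyword "bug" leaves an empty token in its place,
    # producing "fix||in|parser".
    print(delete_key_words_in_message(example, "bug"))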
def insert_results_in_common_csv(full_log: Path, code2seq: Path, output: Path):
    processed_commits: Set[Commit] = set()
    i, k = 0, 0
    commits_vs_positions: DefaultDict[Commit, List[Tuple[int, int]]] = parse_result_file(code2seq)
    print(f"Finished parsing file {code2seq}")

    output_line_template = Template('$commit$sep$file$sep$status$sep'
                                    '$original_message$sep$function_name$sep$predicted_message$sep\n')

    with open(full_log, 'r') as full_log_file, open(output, 'w') as output_file, \
            open(code2seq, 'rb') as code2seq_file:
        for line in full_log_file:
            i += 1
            if i % 20 == 0:
                print(f"{i} line in full log")
            if line.startswith("commit_hash"):
                continue

            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            message: Message = full_log_line.message

            if message.startswith("This commit was manufactured by cvs2svn"):
                if full_log_line.commit not in processed_commits:
                    output_file.write(output_line_template.substitute(commit=full_log_line.commit,
                                                                      file=full_log_line.file,
                                                                      status=full_log_line.status,
                                                                      original_message=message,
                                                                      function_name="",
                                                                      predicted_message="",
                                                                      sep="^"))
                    processed_commits.add(full_log_line.commit)
                else:
                    output_file.write(output_line_template.substitute(commit="",
                                                                      file=full_log_line.file,
                                                                      status=full_log_line.status,
                                                                      original_message=message,
                                                                      function_name="",
                                                                      predicted_message="",
                                                                      sep="^"))
                continue

            predicted_results: List[PredictedResults] = PredictedResults.find_results_for_commit_and_blobs(
                commits_vs_positions[full_log_line.commit],
                full_log_line.old_blob,
                full_log_line.new_blob,
                code2seq_file)

            if message == "no message" or message == "*** empty log message ***":
                message = Message(" ")

            if len(predicted_results) == 0:
                if full_log_line.commit not in processed_commits:
                    output_file.write(output_line_template.substitute(commit=full_log_line.commit,
                                                                      file=full_log_line.file,
                                                                      status=full_log_line.status,
                                                                      original_message=message,
                                                                      function_name="",
                                                                      predicted_message="",
                                                                      sep="^"))
                    processed_commits.add(full_log_line.commit)
                else:
                    output_file.write(output_line_template.substitute(commit="",
                                                                      file=full_log_line.file,
                                                                      status=full_log_line.status,
                                                                      original_message=message,
                                                                      function_name="",
                                                                      predicted_message="",
                                                                      sep="^"))
            else:
                for prediction in predicted_results:
                    k += 1
                    if k % 10 == 0:
                        print(f"write {k} generated annotation")
                    if full_log_line.commit not in processed_commits:
                        output_file.write(output_line_template.substitute(commit=full_log_line.commit,
                                                                          file=full_log_line.file,
                                                                          status=full_log_line.status,
                                                                          original_message=message,
                                                                          function_name=prediction.function_name,
                                                                          predicted_message=prediction.predicted_message,
                                                                          sep="^"))
                    else:
                        output_file.write(output_line_template.substitute(commit="",
                                                                          file=full_log_line.file,
                                                                          status=full_log_line.status,
                                                                          original_message=message,
                                                                          function_name=prediction.function_name,
                                                                          predicted_message=prediction.predicted_message,
                                                                          sep="^"))
                    processed_commits.add(full_log_line.commit)
def parse_dataset_line(line: str) -> Tuple[Message, List[Code2SeqPath]]:
    message, *function_body = line.split(" ")
    function_body: List[Code2SeqPath] = [Code2SeqPath(path) for path in function_body]
    function_body[-1] = function_body[-1].strip()
    return Message(message), function_body
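# Illustrative sketch (the sample path triples are made up): a code2seq dataset line is
# expected to start with the target message token, followed by space-separated AST paths.
if __name__ == "__main__":
    sample = "fix|npe token1,path2,token3 token4,path5,token6\n"
    msg, paths = parse_dataset_line(sample)
    print(msg)    # "fix|npe"
    print(paths)  # two Code2SeqPath entries; the trailing newline is stripped from the last one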