Example #1
def get_data(ref_file: Path,
             pred_file: Path) -> Tuple[List[Message], List[Message]]:
    all_ref: List[Message] = []
    with open(ref_file, 'r') as r:
        for line in r:
            all_ref.append(Message(line))

    all_pred: List[Message] = []
    with open(pred_file, 'r') as pr:
        for line in pr:
            all_pred.append(Message(line))

    return all_ref, all_pred
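
A minimal usage sketch, assuming Message is a thin str wrapper as the other examples suggest (the file names are hypothetical):

refs, preds = get_data(Path("references.txt"), Path("predictions.txt"))
# Note: lines are not stripped, so each Message keeps its trailing '\n'.
assert len(refs) == len(preds)  # assumes one prediction per reference line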
Example #2
def get_diffs(changed_files_log: Path, output: Path, context_size: int,
              git_dir: Path):
    commits_diffs: List[CommitDiff] = []
    commit_vs_blobs: Dict[Commit, List[FullLogLine]] = get_commit_vs_blobs(
        changed_files_log, sep=COMMON_SEP)
    print(f"Number of commits: {len(commit_vs_blobs)}")
    for commit, changed_files in tqdm(commit_vs_blobs.items()):
        message = Message(changed_files[0].message)
        files_diffs = get_all_diffs_per_commit(changed_files, context_size,
                                               git_dir)
        for file_diff in files_diffs:
            file_diff.delete_useless_git_diff_output()
            file_diff.tokenize_each_line_of_diff_body()
        commits_diffs.append(
            CommitDiff(commit=commit,
                       message=message,
                       changed_java_files=files_diffs))
    with open(output, 'w', encoding='utf-8') as output_f:
        output_f.write(
            json.dumps(commits_diffs, default=CommitDiff.to_json, indent=2))
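
For context, json.dumps calls its default= hook for every object it cannot serialize natively; the hook returns a JSON-friendly substitute and serialization recurses into it. A self-contained sketch of the mechanism (Point is illustrative, not part of this project):

import json

class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

# The hook runs once per non-serializable object.
print(json.dumps([Point(1, 2)], default=lambda o: o.__dict__))
# [{"x": 1, "y": 2}]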
Example #3
def messages_frequency(messages: List[Message]) -> Dict[int, List[Message]]:
    message_vs_count: Dict[Message, int] = collections.defaultdict(int)

    for message in messages:
        message_vs_count[Message(" ".join(split_commit_message(message)))] += 1

    count_vs_msg = invert_dict(message_vs_count)
    return collections.OrderedDict(sorted(count_vs_msg.items(), reverse=True))
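
invert_dict is not shown on this page; judging by how its result is sorted and printed in later examples (each value supports len()), it likely groups keys by their counts. A plausible sketch under that assumption:

import collections
from typing import Dict, Hashable, List

def invert_dict(d: Dict[Hashable, Hashable]) -> Dict[Hashable, List[Hashable]]:
    # Group keys that share a value: {msg_a: 2, msg_b: 2} -> {2: [msg_a, msg_b]}.
    inverted = collections.defaultdict(list)
    for key, value in d.items():
        inverted[value].append(key)
    return inverted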
Example #4
def get_commits_for_train_aurora(com_log: Path, empty_commits_file: Path,
                                 is_pickle: bool, commits_for_train_file: Path,
                                 new_com_log: Path):
    needed_commits: Set[Commit] = set()
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)

    with open(new_com_log, 'w') as new_log_file, \
            open(com_log, 'r') as old_log_file:
        total_commit_number: int = 0

        for line in old_log_file:
            if line.startswith("parent_commit_file_hash"):
                continue

            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line)
            if commit_log_line.current_commit in empty_commits:
                continue

            message = Message(commit_log_line.message.lower())
            if commit_log_line.author == "builder" and message in (
                    "new version", "build completed", "build failed"):
                continue
            if message in ("no message", "*** empty log message ***") or \
                    message.startswith("this commit was manufactured by cvs2svn"):
                continue
            text_list = text_to_word_sequence(message)
            if text_list:
                total_commit_number += 1
                needed_commits.add(commit_log_line.current_commit)
            new_log_file.write(f"{line}")

    print(f"Number of needed commits {len(needed_commits)}")

    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
Example #5
@staticmethod
def from_dict(input_) -> 'CommitDiff':
    if 'diff_body_common' in input_['changed_java_files'][0]:
        return CommitDiff(
            commit=Commit(input_['commit']),
            message=Message(input_['message']),
            changed_java_files=[
                FileDiffWithTwoInput.from_dict(file)
                for file in input_['changed_java_files']
            ],
            # is_there_dobj is hard-coded; the serialized value is ignored here.
            is_there_dobj=True)
    else:
        return CommitDiff(
            commit=Commit(input_['commit']),
            message=Message(input_['message']),
            changed_java_files=[
                FileDiff.from_dict(file)
                for file in input_['changed_java_files']
            ],
            is_there_dobj=input_['is_there_dobj'])
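
A usage sketch pairing this with the JSON file written by get_diffs in Example #2 (the file name is hypothetical):

import json

with open('commits_diffs.json', 'r', encoding='utf-8') as f:
    commits_diffs = [CommitDiff.from_dict(entry) for entry in json.load(f)]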
Example #6
def replace_target_with_message(paths_file: Path, common_log_file: Path,
                                output_dir: Path):
    print('Start parsing paths file')
    a = time()
    blobs_positions: Dict[Blob, List[Tuple[int, int]]] = \
        get_blobs_positions(paths_file)
    b = time()
    print('Finished parsing paths file')
    print(f'Number of blobs is {len(blobs_positions)}, Time {b - a}')
    repo_commit_vs_repo_blobs: Dict[Commit, List[FullLogLine]] = \
        get_repo_commit_vs_repo_blobs(common_log_file, sep=COMMON_SEP)
    output_file: Path = output_dir.joinpath('full_dataset.txt')
    output_log: Path = output_dir.joinpath('c2s_commits.log')
    len_changed_functions: List[int] = []
    i = 0
    with open(output_file, 'w') as out_f, \
            open(output_log, 'w') as out_l, \
            open(paths_file, 'rb') as paths_f:
        for commit, changed_files in tqdm(repo_commit_vs_repo_blobs.items()):
            i += 1
            if i % 1000 == 0:
                print(
                    f"mean = {np.mean(len_changed_functions)}, "
                    f"median = {np.median(len_changed_functions)}\n"
                    f"60 percentile = {np.percentile(np.array(len_changed_functions), 60)}\n"
                    f"70 percentile = {np.percentile(np.array(len_changed_functions), 70)}\n"
                    f"80 percentile = {np.percentile(np.array(len_changed_functions), 80)}\n"
                    f"90 percentile = {np.percentile(np.array(len_changed_functions), 90)}"
                )
            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for changed_file in changed_files:
                changed_functions |= compare_two_blobs(
                    BlobPositions(changed_file.old_blob,
                                  blobs_positions[changed_file.old_blob]),
                    BlobPositions(changed_file.new_blob,
                                  blobs_positions[changed_file.new_blob]),
                    paths_f)

            len_changed_functions.append(len(changed_functions))

            if len(changed_functions) > 0:
                message = Message(repo_commit_vs_repo_blobs[commit][0].message)
                if write_commit_message_and_all_changed_functions(
                        message, changed_functions, 4, out_f):
                    out_l.write(f'{commit}\n')
Example #7
def messages_vs_author(
        commits_log: Path) -> Dict[int, List[Tuple[str, Message]]]:
    message_vs_author_count: Dict[Tuple[str, Message],
                                  int] = collections.defaultdict(int)

    with open(commits_log, 'r') as f:
        for line in f:
            commits_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line, separator=COMMON_SEP)
            author = commits_log_line.author
            message = Message(" ".join(
                split_commit_message(commits_log_line.message)))
            message_vs_author_count[(author, message)] += 1

    count_vs_pair = invert_dict(message_vs_author_count)
    return collections.OrderedDict(sorted(count_vs_pair.items(), reverse=True))
Example #8
def commit_msg_tokenizing_aurora(com_log: Path):
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []

    with open(com_log, 'r') as f:
        for line in f:
            if line.startswith("parent_commit_file_hash"):
                continue

            com_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            message: Message = com_line.message
            if message in ("no message", "New version", "Build completed",
                           "Build failed", "*** empty log message ***"):
                continue
            msgs.append(message)
            lowered = Message(message.lower())
            msg_vs_counts[lowered] = msg_vs_counts.get(lowered, 0) + 1

    print(f"Unique message number {len(msg_vs_counts)}")

    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(
        sorted(counts_vs_msg.items(), reverse=True))

    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)

    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(
        sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(
        itertools.islice(counts_vs_word_sorted.items(), 0, 20))
    print(top_popular_words)
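
Tokenizer is the Keras text tokenizer; after fit_on_texts, its word_counts attribute maps each word to its corpus frequency, which is what gets inverted and sorted above. A tiny demonstration:

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(["fix bug", "fix typo"])
print(tokenizer.word_counts)
# OrderedDict([('fix', 2), ('bug', 1), ('typo', 1)])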
Example #9
def commit_msg_tokenizing_camel(com_log: Path, full_log: Path):
    msg_vs_counts: Dict[Message, int] = {}
    msgs: List[Message] = []

    commits_from_full_log: Set[Commit] = get_commits_from_full_log(full_log)

    with open(com_log, 'r') as f:
        for line in f:
            if line.startswith("parent_commit_file_hash"):
                continue

            com_log_line: CommitLogLine = CommitLogLine.parse_from_line(line)
            if com_log_line.current_commit in commits_from_full_log:
                message: Message = com_log_line.message
                msgs.append(message)
                lowered = Message(message.lower())
                msg_vs_counts[lowered] = msg_vs_counts.get(lowered, 0) + 1

    print(f"Unique message number {len(msg_vs_counts)}")

    counts_vs_msg = invert_dict(msg_vs_counts)
    counts_vs_msg_sorted = OrderedDict(
        sorted(counts_vs_msg.items(), reverse=True))

    for key, value in counts_vs_msg_sorted.items():
        print("{:>3}: {} {}".format(key, len(value), value))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(msgs)

    counts_vs_word = invert_dict(tokenizer.word_counts)
    counts_vs_word_sorted = OrderedDict(
        sorted(counts_vs_word.items(), reverse=True))
    top_popular_words = dict(
        itertools.islice(counts_vs_word_sorted.items(), 0, 30))
    print(top_popular_words)
Example #10
def get_commits_for_train_intellij(com_log: Path, empty_commits_file: Path,
                                   is_pickle: bool,
                                   commits_for_train_file: Path,
                                   new_com_log: Path):
    needed_commits: Set[Commit] = set()
    with open(empty_commits_file, 'rb') as file:
        empty_commits: Set[Commit] = pickle.load(file)

    with open(new_com_log, 'w') as new_com_log_file, \
            open(com_log, 'r') as com_log_file:
        total_commit_number: int = 0
        for line in com_log_file:
            if line.startswith("parent_commit_file_hash"):
                continue

            commit_log_line: CommitLogLine = CommitLogLine.parse_from_line(
                line)

            if commit_log_line.current_commit in empty_commits:
                continue
            message = Message(commit_log_line.message.lower())
            if message == "(no message)":
                continue
            text_list = text_to_word_sequence(message)
            if text_list:
                total_commit_number += 1
                needed_commits.add(commit_log_line.current_commit)
            new_com_log_file.write(f"{line}")

    print(f"Number of needed commits {len(needed_commits)}")

    if is_pickle:
        with open(commits_for_train_file, 'wb') as file:
            pickle.dump(needed_commits, file)
    else:
        with open(commits_for_train_file, 'w') as file:
            for commit in needed_commits:
                file.write(f"{commit}\n")
Example #11
def __init__(self, commit_message: str, paths: Optional[List[str]] = None):
    self.commit_message: Message = Message(commit_message)
    # A bare None default would make the list comprehension raise TypeError.
    self.functions_paths: List[Code2SeqPath] = [
        Code2SeqPath(path) for path in (paths or [])
    ]
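
A usage sketch; the enclosing class is not shown in this snippet, so CommitSample below is a hypothetical stand-in for its name:

sample = CommitSample("fix npe in parser",
                      paths=["start,path|to,end"])  # code2seq-style path context
print(sample.commit_message)        # fix npe in parser
print(len(sample.functions_paths))  # 1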
Example #12
def replace_target_with_message(data: Path, full_log: Path, train: Path,
                                test: Path, val: Path,
                                splitted_dataset_file: Path,
                                max_functions_count_per_commit: int):
    print(data)
    with open(splitted_dataset_file, 'rb') as sdf:
        splitted_dataset: Dict[DatasetPart, Set[Commit]] = pickle.load(sdf)
    print(f"train: {len(splitted_dataset[DatasetPart.TRAIN])}, "
          f"test: {len(splitted_dataset[DatasetPart.TEST])}, "
          f"val: {len(splitted_dataset[DatasetPart.VAL])}")

    blobs_positions: Dict[Blob, List[Tuple[int, int]]] = \
        get_blobs_positions(data)

    commit_vs_blobs: Dict[Commit, List[FullLogLine]] = \
        get_commit_vs_blobs(full_log)

    i = 0
    # Counters: train/test/val writes, commits skipped, commits with changes.
    tr, te, v, contin, z = 0, 0, 0, 0, 0
    len_changed_functions: List[int] = []
    with open(train, 'w') as train_f, open(test, 'w') as test_f, \
            open(val, 'w') as val_f, open(data, 'rb') as data_f:
        for commit, changed_files in commit_vs_blobs.items():
            i += 1
            if i % 100 == 0:
                print(
                    f"At {i}, mean = {np.mean(len_changed_functions)}, "
                    f"median = {np.median(len_changed_functions)}\n"
                    f"60 percentile = {np.percentile(np.array(len_changed_functions), 60)}\n"
                    f"70 percentile = {np.percentile(np.array(len_changed_functions), 70)}\n"
                    f"80 percentile = {np.percentile(np.array(len_changed_functions), 80)}\n"
                    f"90 percentile = {np.percentile(np.array(len_changed_functions), 90)}"
                )

            if commit in splitted_dataset[DatasetPart.TRAIN]:
                file = train_f
                tr += 1
            elif commit in splitted_dataset[DatasetPart.TEST]:
                file = test_f
                te += 1
            elif commit in splitted_dataset[DatasetPart.VAL]:
                file = val_f
                v += 1
            else:
                contin += 1
                continue

            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for changed_file in changed_files:
                changed_functions |= compare_two_blobs(
                    BlobPositions(changed_file.old_blob,
                                  blobs_positions[changed_file.old_blob]),
                    BlobPositions(changed_file.new_blob,
                                  blobs_positions[changed_file.new_blob]),
                    data_f)
            if len(changed_functions) > 0:
                z += 1
                len_changed_functions.append(len(changed_functions))

                message = Message(commit_vs_blobs[commit][0].message)
                write_commit_message_and_all_changed_functions(
                    message, changed_functions, max_functions_count_per_commit,
                    file)
    for percentile in (50, 60, 70, 80, 90, 95):
        print(f"{percentile} percentile: "
              f"{np.percentile(np.array(len_changed_functions), percentile)}")
    print(f"train={tr}, test={te}, val={v}, "
          f"skipped={contin}, commits with changed functions={z}")
    print(f"number of all commits = {len(commit_vs_blobs)}")
Example #13
def delete_key_words_in_message(message: Message,
                                key_word: str) -> Message:
    raw_message = ' '.join(message.split('|'))
    cleaned_message = raw_message.replace(key_word, '')
    return Message('|'.join(cleaned_message.split(' ')))
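
A worked example; because split(' ') keeps empty strings, removing a word leaves an empty '|'-separated token behind:

msg = delete_key_words_in_message(Message("fix|the|bug"), "fix")
print(msg)  # |the|bug  (the removed word leaves an empty leading token)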
Example #14
def insert_results_in_common_csv(full_log: Path, code2seq: Path, output: Path):
    processed_commits: Set[Commit] = set()
    i, k = 0, 0

    commits_vs_positions: DefaultDict[Commit, List[Tuple[int, int]]] = \
        parse_result_file(code2seq)
    print(f"Finished parsing file {code2seq}")
    output_line_template = Template(
        '$commit$sep$file$sep$status$sep'
        '$original_message$sep$function_name$sep$predicted_message$sep\n')

    with open(full_log, 'r') as full_log_file, \
            open(output, 'w') as output_file, \
            open(code2seq, 'rb') as code2seq_file:
        for line in full_log_file:
            i += 1
            if i % 20 == 0:
                print(f"{i} line in full log")
            if line.startswith("commit_hash"):
                continue

            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            message: Message = full_log_line.message

            if message.startswith("This commit was manufactured by cvs2svn"):
                # Repeat the commit hash only on its first output row.
                commit_field = ("" if full_log_line.commit in processed_commits
                                else full_log_line.commit)
                output_file.write(
                    output_line_template.substitute(
                        commit=commit_field,
                        file=full_log_line.file,
                        status=full_log_line.status,
                        original_message=message,
                        function_name="",
                        predicted_message="",
                        sep="^"))
                processed_commits.add(full_log_line.commit)
                continue

            predicted_results: List[
                PredictedResults] = PredictedResults.find_results_for_commit_and_blobs(
                    commits_vs_positions[full_log_line.commit],
                    full_log_line.old_blob, full_log_line.new_blob,
                    code2seq_file)

            if message == "no message" or message == "*** empty log message ***":
                message = Message(" ")
            if len(predicted_results) == 0:
                commit_field = ("" if full_log_line.commit in processed_commits
                                else full_log_line.commit)
                output_file.write(
                    output_line_template.substitute(
                        commit=commit_field,
                        file=full_log_line.file,
                        status=full_log_line.status,
                        original_message=message,
                        function_name="",
                        predicted_message="",
                        sep="^"))
                processed_commits.add(full_log_line.commit)
            else:
                for prediction in predicted_results:
                    k += 1
                    if k % 10 == 0:
                        print(f"wrote {k} generated annotations")
                    commit_field = ("" if full_log_line.commit in
                                    processed_commits else full_log_line.commit)
                    output_file.write(
                        output_line_template.substitute(
                            commit=commit_field,
                            file=full_log_line.file,
                            status=full_log_line.status,
                            original_message=message,
                            function_name=prediction.function_name,
                            predicted_message=prediction.predicted_message,
                            sep="^"))
                    processed_commits.add(full_log_line.commit)
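
For reference, output_line_template is a standard string.Template; every $sep placeholder receives the same separator value, which is how the '^'-separated rows above are produced:

from string import Template

t = Template('$commit$sep$file$sep\n')
print(t.substitute(commit='abc123', file='Foo.java', sep='^'), end='')
# abc123^Foo.java^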
Example #15
def parse_dataset_line(line: str) -> Tuple[Message, List[Code2SeqPath]]:
    [message, *function_body] = line.split(" ")
    function_body: List[Code2SeqPath] = [
        Code2SeqPath(path) for path in function_body
    ]
    # strip() returns a plain str, so re-wrap the trailing path.
    function_body[-1] = Code2SeqPath(function_body[-1].strip())

    return Message(message), function_body
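
A worked example, assuming Code2SeqPath subclasses str; a dataset line holds the target message followed by space-separated path contexts:

message, paths = parse_dataset_line("fix start,path|to,end start2,path|to,end2\n")
print(message)  # fix
print(paths)    # ['start,path|to,end', 'start2,path|to,end2']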