def test_all_blobs_was_extracted():
    """Report which blobs from the common log are missing in the paths file.

    Both the old and the new blob of every changed file are looked up in the
    mapping returned by ``get_blobs_positions``. Each blob that is absent is
    written on its own line to ``new_failed_blob.log``; the number of checked
    blobs and the number of misses are printed at the end.
    """
    base_dir: Path = Path('../../../new_data/')
    changed_files_log: Path = base_dir.joinpath('common_blobs.log')
    paths_file: Path = base_dir.joinpath(
        'processed_data/c2s_paths/top1000_dataset_v2.train.raw.txt')

    commits_to_files = get_repo_commit_vs_repo_blobs(changed_files_log,
                                                     sep=COMMON_SEP)
    known_blobs: Dict[Blob, List[Tuple[int, int]]] = get_blobs_positions(
        paths_file)
    print('Finished parsing path file')

    checked: int = 0
    missing: int = 0
    report_path = base_dir.joinpath('new_failed_blob.log')
    with open(report_path, 'w') as report:
        for files_of_commit in commits_to_files.values():
            for entry in files_of_commit:
                # Old blob first, then new blob, so the log keeps its order.
                for blob in (entry.old_blob, entry.new_blob):
                    checked += 1
                    if blob in known_blobs:
                        continue
                    missing += 1
                    report.write(f'{blob}\n')
    print(f'Total count {checked} , count of fails: {missing}')
Esempio n. 2
0
def get_commits_with_exact_number_of_changed_functions(
        data: Path,
        full_log: Path,
        output: Path,
        is_filtered: bool = False,
        filtered_json: Path = None):
    """Append commits that change between one and four functions to text files.

    For every commit of the full log the changed functions of all its changed
    files are collected with ``compare_two_blobs``. A commit with ``n``
    changed functions, ``1 <= n <= 4``, is appended to ``output / f'{n}.txt'``
    and, with a function limit of 4, to ``output / '1234.txt'``.

    Args:
        data: Paths file. It is parsed by ``get_blobs_positions`` and also
            opened in binary mode and handed to ``compare_two_blobs``.
        full_log: Log file parsed by ``get_commit_vs_blobs`` with
            ``COMMON_SEP`` as separator.
        output: Directory in which the result files are created or extended.
        is_filtered: When true, only commits present in ``filtered_json`` are
            processed and their messages are taken from that file.
        filtered_json: JSON file holding a sequence of raw commits understood
            by ``CommitDiff.from_dict``. Only read when ``is_filtered`` is
            true, in which case it must be provided.
    """
    blobs_positions: Dict[Blob, List[Tuple[int,
                                           int]]] = get_blobs_positions(data)
    commit_vs_blobs: Dict[Commit, List[FullLogLine]] = get_commit_vs_blobs(
        full_log, sep=COMMON_SEP)

    # Defined unconditionally: previously this name only existed when
    # is_filtered was true, so the unfiltered mode failed with
    # UnboundLocalError as soon as a commit qualified for output.
    filtered_commits_messages: Dict[Commit, Message] = {}
    if is_filtered:
        with open(filtered_json, 'r') as f:
            filtered_commits_json = json.load(f)
        for raw_commit in filtered_commits_json:
            parsed_commit: CommitDiff = CommitDiff.from_dict(raw_commit)
            filtered_commits_messages[
                parsed_commit.commit] = parsed_commit.message
    print("finished parsing")

    processed = 0
    with open(data, 'rb') as data_f:
        for commit, changed_files in commit_vs_blobs.items():
            # The keys of the message mapping are exactly the allowed commits.
            if is_filtered and commit not in filtered_commits_messages:
                continue
            processed += 1
            if processed % 100 == 0:
                print(f"At {processed}")

            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for changed_file in changed_files:
                changed_functions |= compare_two_blobs(
                    BlobPositions(changed_file.old_blob,
                                  blobs_positions[changed_file.old_blob]),
                    BlobPositions(changed_file.new_blob,
                                  blobs_positions[changed_file.new_blob]),
                    data_f)
            changed_functions_number: int = len(changed_functions)

            if not 1 <= changed_functions_number <= 4:
                continue

            if is_filtered:
                message = filtered_commits_messages[commit]
            else:
                # NOTE(review): assumes every FullLogLine of a commit carries
                # the commit message in `.message` -- confirm.
                message = Message(changed_files[0].message)

            output_file: Path = output / f'{changed_functions_number}.txt'
            with open(output_file, 'a+') as file:
                write_commit_message_and_all_changed_functions(
                    message, changed_functions, changed_functions_number,
                    file)
            common_file: Path = output / '1234.txt'
            with open(common_file, 'a+') as file:
                write_commit_message_and_all_changed_functions(
                    message, changed_functions, 4, file)
def replace_target_with_message(paths_file, common_log_file, output_dir):
    """Write every commit's message together with its changed functions.

    The paths file is parsed into blob positions (the parsing time is
    printed), the common log is parsed into a commit -> changed files
    mapping, and for each commit the changed functions of all its files are
    gathered with ``compare_two_blobs``. Commits with at least one changed
    function are passed to ``write_commit_message_and_all_changed_functions``
    (limit 4) targeting ``full_dataset.txt``; when that call returns a truthy
    value the commit is also logged to ``c2s_commits.log``. Every 1000
    commits, statistics over the per-commit function counts are printed.
    """
    print('Start parsing paths file')
    started = time()
    blobs_positions: Dict[Blob, List[Tuple[int, int]]] = get_blobs_positions(
        paths_file)
    finished = time()
    print('Finished parsing paths file')
    print(f'Number of blobs is {len(blobs_positions)}, '
          f'Time {finished - started}')

    commits_to_files: Dict[Commit, List[FullLogLine]] = (
        get_repo_commit_vs_repo_blobs(common_log_file, sep=COMMON_SEP))
    dataset_path: Path = output_dir.joinpath('full_dataset.txt')
    commits_log_path: Path = output_dir.joinpath('c2s_commits.log')
    function_counts: List[int] = []

    with open(dataset_path, 'w') as out_f, \
            open(commits_log_path, 'w') as out_l, \
            open(paths_file, 'rb') as paths_f:
        progress = tqdm(commits_to_files.items())
        for seen, (commit, changed_files) in enumerate(progress, start=1):
            if seen % 1000 == 0:
                counts_array = np.array(function_counts)
                percentile_lines = "\n".join(
                    f"{q} percentile = {np.percentile(counts_array, q)}"
                    for q in (60, 70, 80, 90))
                print(f"mean = {np.mean(function_counts)}, "
                      f"median = {np.median(function_counts)}\n"
                      + percentile_lines)

            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for entry in changed_files:
                old_side = BlobPositions(entry.old_blob,
                                         blobs_positions[entry.old_blob])
                new_side = BlobPositions(entry.new_blob,
                                         blobs_positions[entry.new_blob])
                changed_functions |= compare_two_blobs(old_side, new_side,
                                                       paths_f)

            function_counts.append(len(changed_functions))
            if not changed_functions:
                continue

            # changed_files is the list stored under this commit, so its
            # first entry provides the message.
            message = Message(changed_files[0].message)
            was_written = write_commit_message_and_all_changed_functions(
                message, changed_functions, 4, out_f)
            if was_written:
                out_l.write(f'{commit}\n')
def remove_method_name_with_meta_info(
        data: Path, blobs_history: Mapping[Blob, List[NextBlobMetaInfo]],
        output: Path):
    """Write the function-level diff for every blob -> next-blob transition.

    Args:
        data: Paths file; parsed by ``get_blobs_positions`` and also opened
            in binary mode and handed to ``compare_two_blobs``.
        blobs_history: For each blob, its follow-up entries. Each entry
            unpacks into three items; the first is used as the commit and
            the second as the next blob, the third is ignored here.
        output: File that receives the output of
            ``write_meta_info_and_path_diff`` (opened with mode ``'w'``).
    """
    positions: DefaultDict[Blob, List[Tuple[int, int]]] = get_blobs_positions(
        data)

    with open(output, 'w') as sink:
        with open(data, 'rb') as source:
            for current_blob, successors in blobs_history.items():
                for successor in successors:
                    commit, following_blob, _ = successor
                    before = BlobPositions(current_blob,
                                           positions[current_blob])
                    after = BlobPositions(following_blob,
                                          positions[following_blob])
                    diff = compare_two_blobs(before, after, source)
                    write_meta_info_and_path_diff(commit, diff, sink)
Esempio n. 5
0
def replace_target_with_message(data: Path, full_log: Path, train: Path,
                                test: Path, val: Path,
                                splitted_dataset_file: Path,
                                max_functions_count_per_commit: int):
    """Write commit messages with their changed functions per dataset split.

    The split (dataset part -> set of commits) is unpickled from
    ``splitted_dataset_file``. Each commit of the full log that belongs to
    the train, test or validation part (checked in that order) has the
    changed functions of all its files collected with ``compare_two_blobs``;
    commits with at least one changed function are written to the file of
    their part. Commits outside every part are skipped and counted.
    Progress statistics are printed every 100 commits and a percentile
    summary plus the counters are printed at the end.

    Args:
        data: Paths file; parsed by ``get_blobs_positions`` and also opened
            in binary mode and handed to ``compare_two_blobs``.
        full_log: Log file parsed by ``get_commit_vs_blobs``.
        train: Output file for commits of ``DatasetPart.TRAIN``.
        test: Output file for commits of ``DatasetPart.TEST``.
        val: Output file for commits of ``DatasetPart.VAL``.
        splitted_dataset_file: Pickle file with the split.
        max_functions_count_per_commit: Limit forwarded to
            ``write_commit_message_and_all_changed_functions``.
    """
    print(data)
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only feed it split files produced by a trusted pipeline.
    with open(splitted_dataset_file, 'rb') as sdf:
        splitted_dataset: Dict[DatasetPart, Set[Commit]] = pickle.load(sdf)
    train_commits = splitted_dataset[DatasetPart.TRAIN]
    test_commits = splitted_dataset[DatasetPart.TEST]
    val_commits = splitted_dataset[DatasetPart.VAL]
    print(f"train: {len(train_commits)}, "
          f"test: {len(test_commits)}, "
          f"val: {len(val_commits)}")

    blobs_positions: Dict[Blob, List[Tuple[int,
                                           int]]] = get_blobs_positions(data)

    commit_vs_blobs: Dict[Commit,
                          List[FullLogLine]] = get_commit_vs_blobs(full_log)

    tr, te, v, contin, z = 0, 0, 0, 0, 0
    len_changed_functions: List[int] = []
    with open(train, 'w') as train_f, open(test, 'w') as test_f, open(
            val, 'w') as val_f, open(data, 'rb') as data_f:
        for i, (commit, changed_files) in enumerate(commit_vs_blobs.items(),
                                                    start=1):
            if i % 100 == 0:
                # The list only grows for commits that are in a split and
                # have changed functions, so it can still be empty here;
                # NumPy statistics over an empty array are undefined
                # (NaN with warnings, or an error for percentiles).
                if len_changed_functions:
                    counts = np.array(len_changed_functions)
                    percentile_lines = "\n".join(
                        f"{q} percentile = {np.percentile(counts, q)}"
                        for q in (60, 70, 80, 90))
                    print(f"At {i}, mean = {np.mean(counts)}, "
                          f"median = {np.median(counts)}\n"
                          + percentile_lines)
                else:
                    print(f"At {i}, no commits with changed functions yet")

            if commit in train_commits:
                file = train_f
                tr += 1
            elif commit in test_commits:
                file = test_f
                te += 1
            elif commit in val_commits:
                file = val_f
                v += 1
            else:
                contin += 1
                continue

            changed_functions: Set[Tuple[FunctionInfo, FunctionInfo]] = set()
            for changed_file in changed_files:
                changed_functions |= compare_two_blobs(
                    BlobPositions(changed_file.old_blob,
                                  blobs_positions[changed_file.old_blob]),
                    BlobPositions(changed_file.new_blob,
                                  blobs_positions[changed_file.new_blob]),
                    data_f)
            if changed_functions:
                z += 1
                len_changed_functions.append(len(changed_functions))

                # changed_files is commit_vs_blobs[commit]; no second lookup
                # is needed to reach the first log line.
                message = Message(changed_files[0].message)
                write_commit_message_and_all_changed_functions(
                    message, changed_functions, max_functions_count_per_commit,
                    file)

    if len_changed_functions:
        counts = np.array(len_changed_functions)
        for q in (50, 60, 70, 80, 90, 95):
            print(f"{q} percentile: {np.percentile(counts, q)}")
    else:
        print("no commits with changed functions, percentiles are undefined")
    print(f"train={tr}, test={te}, val={v}, continue {contin}, nnz = {z}")
    print(f"number of all commits = {len(commit_vs_blobs)}")