Example #1
0
def get_downloaded_blobs_stat_collisions(blobs_dir: Path,
                                         common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: DefaultDict[
        str, List[FullLogLine]] = collections.defaultdict(list)
    full_log_line_list = []
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            full_log_line_list.append(COMMON_SEP.join(full_line))
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line),
                                            COMMON_SEP))

    common_blobs = set()
    line_counts = collections.Counter(full_log_line_list)
    for line, count in line_counts.items():
        if count > 1:
            # print(f'{count} - {line}')
            fll = FullLogLine.parse_from_line(line, COMMON_SEP)
            common_blobs.add(fll.old_blob)
            common_blobs.add(fll.new_blob)

    print(f'count of common blobs is {len(common_blobs)}')

    all_blobs_hashes = set()
    blobs_hashes_collisions = set()
    repo_vs_blobs_hashes = collections.defaultdict(set)
    for repo_name in tqdm(repo_vs_full_log_lines.keys()):
        for changed_file in repo_vs_full_log_lines[repo_name]:
            if changed_file.old_blob != '0000000000000000000000000000000000000000':
                repo_vs_blobs_hashes[repo_name].add(changed_file.old_blob)
            if changed_file.new_blob != '0000000000000000000000000000000000000000':
                repo_vs_blobs_hashes[repo_name].add(changed_file.new_blob)

        # check there is no collision for this repo
        possible_collisions = all_blobs_hashes & repo_vs_blobs_hashes[repo_name]
        if len(possible_collisions) != 0:
            blobs_hashes_collisions.update(possible_collisions)
        all_blobs_hashes.update(repo_vs_blobs_hashes[repo_name])

    print(f'Collisions number {len(blobs_hashes_collisions)}')
    print(blobs_hashes_collisions - common_blobs)

    print(f'Intersection = {len(blobs_hashes_collisions & common_blobs)}')
    print(list(blobs_hashes_collisions)[:10])
    print(f'all blobs hashes {len(all_blobs_hashes)}')
    # with open(common_blobs_file.parent.joinpath('blobs_collisions.pickle'), 'wb') as f:
    #     pickle.dump(blobs_hashes_collisions, f)
    all_downloaded_files = os.listdir(blobs_dir)
    blob_hash_len_with_java = len('e6a92d4554046b88461778b99dcce88040aeb6a7.java')
    len_java = len('.java')
    for downloaded_file in all_downloaded_files:
        blob_hash = downloaded_file[-blob_hash_len_with_java:-len_java]
        if blob_hash in common_blobs:
            print(downloaded_file)
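# FullLogLine, COMMON_SEP, and the Commit/Blob/Message aliases are project-local and
# not shown in these examples. A minimal sketch of what they might look like,
# reconstructed from how the examples use them (field order and the separator are
# assumptions, not the project's actual definitions):
from dataclasses import dataclass
from typing import NewType

Commit = NewType('Commit', str)
Blob = NewType('Blob', str)
Message = NewType('Message', str)
COMMON_SEP = ';'  # assumed separator


@dataclass
class FullLogLine:
    commit: Commit
    old_blob: Blob
    new_blob: Blob
    file: str
    status: str
    message: Message

    @staticmethod
    def parse_from_line(line: str, separator: str = COMMON_SEP) -> 'FullLogLine':
        # Assumed field order; the real project may differ.
        commit, old_blob, new_blob, file, status, message = \
            line.strip().split(separator)[:6]
        return FullLogLine(Commit(commit), Blob(old_blob), Blob(new_blob),
                           file, status, Message(message))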
Example #2
0
def download_blobs_from_filtered_commits(changed_files_log: Path, lines_range: Tuple[int, int],
                                         filtered_commits_json: Path, blobs_dir: Path, git_dir: Path) -> None:
    start_line, end_line = lines_range
    print(f"start line: {start_line}, end_line: {end_line}")
    with open(filtered_commits_json, 'r') as f:
        commits = json.load(f)
    commits: List[CommitDiff] = [CommitDiff.from_dict(commit) for commit in commits]
    filtered_commits: Set[Commit] = {commit.commit for commit in commits}
    print(f'Number of filtered commits is {len(filtered_commits)}')
    with open(changed_files_log, 'r') as full_log_file:
        for _ in range(start_line):
            full_log_file.readline()
        # List the already-downloaded blobs once instead of on every log line.
        downloaded_blobs = set(os.listdir(blobs_dir))
        for i, line in enumerate(full_log_file, start=start_line + 1):
            if i > end_line:
                break
            if i % 20 == 0:
                print(f"Start to process {i} file; ({start_line}, {end_line})")
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line, COMMON_SEP)
            if full_log_line.commit not in filtered_commits:
                continue
            if full_log_line.old_blob not in downloaded_blobs:
                download_blob_content(full_log_line.old_blob, blobs_dir, git_dir)
                downloaded_blobs.add(full_log_line.old_blob)
            if full_log_line.new_blob not in downloaded_blobs:
                download_blob_content(full_log_line.new_blob, blobs_dir, git_dir)
                downloaded_blobs.add(full_log_line.new_blob)
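# download_blob_content is not shown above. A plausible sketch, assuming it dumps
# a blob from the local clone with `git cat-file -p` (the output file name is an
# assumption; other examples suggest a repo prefix and/or '.java' suffix may be added):
import subprocess
from pathlib import Path


def download_blob_content(blob_hash: str, blobs_dir: Path, git_dir: Path) -> None:
    # Read the raw blob contents out of the git object store.
    content = subprocess.check_output(['git', 'cat-file', '-p', blob_hash],
                                      cwd=git_dir)
    (blobs_dir / blob_hash).write_bytes(content)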
Example #3
0
def download_blobs_parallel(common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: Dict[
        str, List[FullLogLine]] = collections.defaultdict(list)
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line),
                                            COMMON_SEP))
    print(f'Total blobs count = {total_blobs_count}')
    print(
        f'We need blobs from {len(repo_vs_full_log_lines)} repos, but part of the data is already downloaded'
    )
    blobs_dir = Path('../../../new_data/raw_data/blobs_200')
    repos_not_to_download = []
    downloaded_blobs_files = set(os.listdir(blobs_dir))
    for repo in tqdm(repo_vs_full_log_lines.keys()):
        if not do_we_need_to_clone_repo(repo_vs_full_log_lines[repo], repo,
                                        blobs_dir, downloaded_blobs_files):
            repos_not_to_download.append(repo)

    for repo_to_skip in repos_not_to_download:
        del repo_vs_full_log_lines[repo_to_skip]

    print(len(repo_vs_full_log_lines['aosp-mirror_platform_frameworks_base']))
    print(f'So now we need to download {len(repo_vs_full_log_lines)} repos')

    repos = [(repo, downloaded_blobs_files)
             for repo in repo_vs_full_log_lines.keys()]
    # Use the pool as a context manager so worker processes are cleaned up.
    with multiprocessing.Pool() as pool:
        for _ in tqdm(pool.imap_unordered(process_one_dir, repos),
                      total=len(repos)):
            pass
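# do_we_need_to_clone_repo is project-local. A minimal sketch under the assumption
# that a repo only needs cloning while some referenced blob file is still missing,
# using the '{repo}_{hash}.java' naming seen in check_downloaded_blobs:
import os

NULL_BLOB = '0000000000000000000000000000000000000000'


def do_we_need_to_clone_repo(log_lines, repo_name, blobs_dir,
                             downloaded_blobs_files=None):
    if downloaded_blobs_files is None:
        downloaded_blobs_files = set(os.listdir(blobs_dir))
    for log_line in log_lines:
        for blob in (log_line.old_blob, log_line.new_blob):
            if blob != NULL_BLOB and \
                    f'{repo_name}_{blob}.java' not in downloaded_blobs_files:
                return True
    return False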
Example #4
0
def process_one_dir(args):
    repo_name_input, download_blobs_files = args
    blobs_dir: Path = Path('../../../new_data/raw_data/blobs_200')
    filtered_diff_dir: Path = Path(
        '../../../new_data/processed_data/two_inputs_200')
    dir_for_repos = filtered_diff_dir.parent
    common_blobs_file: Path = filtered_diff_dir.parent.joinpath(
        'common_blobs_200.log')
    full_log_lines: List[FullLogLine] = []
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            [repo_name, *full_line] = line.split(COMMON_SEP)
            if repo_name == repo_name_input:
                full_log_lines.append(
                    FullLogLine.parse_from_line(COMMON_SEP.join(full_line),
                                                COMMON_SEP))

    repo_name_for_clone = repo_name_input.replace('_', '/')
    if repo_name_for_clone == 'aosp-mirror/platform/frameworks/base':
        repo_name_for_clone = 'aosp-mirror/platform_frameworks_base'
    if full_log_lines:
        with LocalGitRepository(repo_name_for_clone, dir_for_repos) as git_dir:
            for log_line in full_log_lines:
                if not is_blob_downloaded(log_line.old_blob, repo_name_input,
                                          blobs_dir, download_blobs_files):
                    download_blob_content_other_output_file_name(
                        log_line.old_blob, repo_name_input, blobs_dir, git_dir)
                if not is_blob_downloaded(log_line.new_blob, repo_name_input,
                                          blobs_dir, download_blobs_files):
                    download_blob_content_other_output_file_name(
                        log_line.new_blob, repo_name_input, blobs_dir, git_dir)
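# is_blob_downloaded is not shown above. A minimal sketch, assuming
# download_blob_content_other_output_file_name writes files named
# '{repo}_{hash}.java' (the convention used in check_downloaded_blobs):
def is_blob_downloaded(blob_hash, repo_name, blobs_dir, downloaded_blobs_files):
    return f'{repo_name}_{blob_hash}.java' in downloaded_blobs_files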
Example #5
0
def get_commits_from_full_log(full_log: Path) -> Set[Commit]:
    result: Set[Commit] = set()

    with open(full_log, 'r') as f:
        for line in f:
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            result.add(full_log_line.commit)

    return result
Example #6
0
def get_commit_vs_blobs(
        full_log: Path,
        sep: str = SEPARATOR) -> Dict[Commit, List[FullLogLine]]:
    commit_vs_blobs: DefaultDict[
        Commit, List[FullLogLine]] = collections.defaultdict(list)
    with open(full_log, 'r', encoding='utf-8') as full_log_file:
        for line in full_log_file:
            if line.startswith("commit_hash"):
                continue
            full_log_line = FullLogLine.parse_from_line(line, separator=sep)
            commit_vs_blobs[full_log_line.commit].append(full_log_line)
    return commit_vs_blobs
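# Hypothetical usage (the path is made up): find the commit that touched the
# most files according to the full log.
commit_vs_blobs = get_commit_vs_blobs(Path('data/full.log'))
commit, log_lines = max(commit_vs_blobs.items(), key=lambda kv: len(kv[1]))
print(f'{commit} changed {len(log_lines)} files')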
Example #7
0
def parse_full_log(full_log: Path) -> DefaultDict[Blob, List[NextBlobMetaInfo]]:
    blobs_history: DefaultDict[Blob, List[NextBlobMetaInfo]] = collections.defaultdict(list)

    with open(full_log, 'r') as full_log_file:
        for line in full_log_file:
            if line.startswith("commit_hash"):
                continue

            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            blobs_history[full_log_line.old_blob].append(NextBlobMetaInfo(full_log_line.commit,
                                                                          full_log_line.new_blob,
                                                                          full_log_line.message))

    return blobs_history
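# NextBlobMetaInfo is project-local; an assumed shape, inferred from the
# constructor call above:
from typing import NamedTuple


class NextBlobMetaInfo(NamedTuple):
    commit: str    # Commit
    new_blob: str  # Blob
    message: str   # Message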
Example #8
0
def create_new_full_log_file(old_full_log: Path, new_full_log: Path,
                             commits_for_train_file: Path):
    with open(commits_for_train_file, 'rb') as f:
        needed_commits: Set[Commit] = pickle.load(f)

    with open(new_full_log,
              'w') as new_full_log_file, open(old_full_log,
                                              'r') as old_full_log_file:
        for line in old_full_log_file:
            if line.startswith("commit_hash"):
                continue

            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            if full_log_line.commit in needed_commits:
                new_full_log_file.write(line)
Example #9
0
def get_repo_commit_vs_repo_blobs(
        full_log: Path,
        sep: str = SEPARATOR) -> Dict[Commit, List[FullLogLine]]:
    repo_commit_vs_repo_blobs: DefaultDict[
        Commit, List[FullLogLine]] = collections.defaultdict(list)

    with open(full_log, 'r') as full_log_file:
        for line in full_log_file:
            [repo_name, *full_line] = line.split(COMMON_SEP)
            full_log_line = FullLogLine.parse_from_line(
                COMMON_SEP.join(full_line), separator=sep)
            full_log_line.old_blob = repo_name + '_' + full_log_line.old_blob
            full_log_line.new_blob = repo_name + '_' + full_log_line.new_blob
            repo_commit_vs_repo_blobs[
                repo_name + '_' + full_log_line.commit].append(full_log_line)

    return repo_commit_vs_repo_blobs
Example #10
0
def download_blobs(dir_for_repos: Path, blobs_dir: Path,
                   common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: DefaultDict[
        str, List[FullLogLine]] = collections.defaultdict(list)
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line),
                                            COMMON_SEP))
    for repo_name in tqdm(repo_vs_full_log_lines.keys()):
        repo_name_for_clone = repo_name.replace('_', '/')
        if do_we_need_to_clone_repo(repo_vs_full_log_lines[repo_name],
                                    repo_name, blobs_dir):
            pass
Example #11
0
def split_dataset_with_no_file_commits_for_train(full_log: Path, output: Path):
    """
    For camel dataset in com_log there is full history of camel repo.
    But for now I use only last 4000 commits
    only needed commits are in full_log.
    """

    commits: Set[Tuple[Message, Commit]] = set()
    with open(full_log, 'r') as f:
        for line in f:
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            commits.add((full_log_line.message, full_log_line.commit))
    commits: List[Tuple[Message, Commit]] = list(commits)
    print(f"Len of all commits = {len(commits)}")
    shuffle(commits)

    # split dataset
    commits_train, commits_tmp = split_in_two_parts(commits, 0.65)
    # make sure test/val messages do not also appear in the train messages
    if is_messages_intersected(commits_train, commits_tmp):
        commits_train, commits_tmp = fix_intersection(commits_train,
                                                      commits_tmp)

    print(f"All size = {len(commits)}. Train size = {len(commits_train)}.")
    print(f"But train should be {0.7 * len(commits)}")
    print(f"Part size is {len(commits_train) / len(commits)}")

    commits_val, commits_test = split_in_two_parts(commits_tmp, 0.5)

    print(
        f"train: {len(commits_train)}, test: {len(commits_test)}, val: {len(commits_val)}"
    )

    splitted_commits = {
        DatasetPart.TRAIN: {commit for _, commit in commits_train},
        DatasetPart.TEST: {commit for _, commit in commits_test},
        DatasetPart.VAL: {commit for _, commit in commits_val},
    }

    with open(output, 'wb') as output_file:
        pickle.dump(splitted_commits, output_file)
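# split_in_two_parts, is_messages_intersected, and fix_intersection are not shown
# above. Minimal sketches of what they might look like, assuming each part is a
# list of (message, commit) tuples (names and exact behavior are assumptions):
from typing import List, Tuple


def split_in_two_parts(items: List, first_part_ratio: float) -> Tuple[List, List]:
    # Split an (already shuffled) list at the given ratio.
    split_index = int(len(items) * first_part_ratio)
    return items[:split_index], items[split_index:]


def is_messages_intersected(part_a, part_b) -> bool:
    # True if any commit message occurs in both parts.
    messages_a = {message for message, _ in part_a}
    messages_b = {message for message, _ in part_b}
    return bool(messages_a & messages_b)


def fix_intersection(part_a, part_b):
    # Move items whose message already occurs in part_a from part_b to part_a,
    # so the two parts no longer share messages.
    messages_a = {message for message, _ in part_a}
    moved = [item for item in part_b if item[0] in messages_a]
    part_b = [item for item in part_b if item[0] not in messages_a]
    return part_a + moved, part_b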
Example #12
0
    def testTestMessagesNotInTrainMessages(self):
        with open(self.split_commits, 'rb') as sdf:
            splitted_dataset: Dict[DatasetPart, Set[str]] = pickle.load(sdf)

        unique_messages: Dict[DatasetPart, Set[str]] = collections.defaultdict(set)
        all_messages: Dict[DatasetPart, List[str]] = collections.defaultdict(list)

        with open(self.full_log, 'r') as f:
            for line in f:
                full_log_line: FullLogLine = FullLogLine.parse_from_line(line)

                key = [key for key, commits in splitted_dataset.items() if full_log_line.commit in commits]
                if len(key) == 1:
                    key = key[0]
                    unique_messages[key].add(full_log_line.message)
                    all_messages[key].append(full_log_line.message)

        print(f"tr: {len(unique_messages[DatasetPart.TRAIN])}, {len(all_messages[DatasetPart.TRAIN])}")
        print(f"t: {len(unique_messages[DatasetPart.TEST])}, {len(all_messages[DatasetPart.TEST])}")
        print(f"v: {len(unique_messages[DatasetPart.VAL])}, {len(all_messages[DatasetPart.VAL])}")

        print(f"tr & t {len(unique_messages[DatasetPart.TRAIN] & unique_messages[DatasetPart.TEST])}")
        print(f"tr & v {len(unique_messages[DatasetPart.TRAIN] & unique_messages[DatasetPart.VAL])}")
Example #13
0
def insert_results_in_common_csv(full_log: Path, code2seq: Path, output: Path):
    processed_commits: Set[Commit] = set()
    i, k = 0, 0

    commits_vs_positions: DefaultDict[Commit, List[Tuple[
        int, int]]] = parse_result_file(code2seq)
    print(f"Finishe parse file {code2seq}")
    output_line_template = Template(
        '$commit$sep$file$sep$status$sep'
        '$original_message$sep$function_name$sep$predicted_message$sep\n')

    with open(full_log,
              'r') as full_log_file, open(output, 'w') as output_file, open(
                  code2seq, 'rb') as code2seq_file:
        for line in full_log_file:
            i += 1
            if i % 20 == 0:
                print(f"processed {i} lines of the full log")
            if line.startswith("commit_hash"):
                continue

            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            message: Message = full_log_line.message

            if message.startswith("This commit was manufactured by cvs2svn"):
                if full_log_line.commit not in processed_commits:
                    output_file.write(
                        output_line_template.substitute(
                            commit=full_log_line.commit,
                            file=full_log_line.file,
                            status=full_log_line.status,
                            original_message=message,
                            function_name="",
                            predicted_message="",
                            sep="^"))
                    processed_commits.add(full_log_line.commit)
                else:
                    output_file.write(
                        output_line_template.substitute(
                            commit="",
                            file=full_log_line.file,
                            status=full_log_line.status,
                            original_message=message,
                            function_name="",
                            predicted_message="",
                            sep="^"))
                continue

            predicted_results: List[
                PredictedResults] = PredictedResults.find_results_for_commit_and_blobs(
                    commits_vs_positions[full_log_line.commit],
                    full_log_line.old_blob, full_log_line.new_blob,
                    code2seq_file)

            if message == "no message" or message == "*** empty log message ***":
                message = Message(" ")
            if len(predicted_results) == 0:
                if full_log_line.commit not in processed_commits:
                    output_file.write(
                        output_line_template.substitute(
                            commit=full_log_line.commit,
                            file=full_log_line.file,
                            status=full_log_line.status,
                            original_message=message,
                            function_name="",
                            predicted_message="",
                            sep="^"))
                    processed_commits.add(full_log_line.commit)
                else:
                    output_file.write(
                        output_line_template.substitute(
                            commit="",
                            file=full_log_line.file,
                            status=full_log_line.status,
                            original_message=message,
                            function_name="",
                            predicted_message="",
                            sep="^"))
            else:
                for prediction in predicted_results:
                    k += 1
                    if k % 10 == 0:
                        print(f"write {k} generated annotation")
                    if full_log_line.commit not in processed_commits:
                        output_file.write(
                            output_line_template.substitute(
                                commit=full_log_line.commit,
                                file=full_log_line.file,
                                status=full_log_line.status,
                                original_message=message,
                                function_name=prediction.function_name,
                                predicted_message=prediction.predicted_message,
                                sep="^"))
                    else:
                        output_file.write(
                            output_line_template.substitute(
                                commit="",
                                file=full_log_line.file,
                                status=full_log_line.status,
                                original_message=message,
                                function_name=prediction.function_name,
                                predicted_message=prediction.predicted_message,
                                sep="^"))
                    processed_commits.add(full_log_line.commit)
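# Quick sanity check of the Template above: $sep is substituted at every position,
# producing a caret-separated row (the values here are made up).
from string import Template

demo = Template('$commit$sep$file$sep$status$sep\n')
print(demo.substitute(commit='abc123', file='A.java', status='M', sep='^'))
# -> abc123^A.java^M^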
Example #14
0
def rename_already_downloaded(blobs_dir: Path, common_blob_log: Path) -> None:
    repo_vs_full_l_lines: DefaultDict[
        str, List[FullLogLine]] = collections.defaultdict(list)
    blob_hash_vs_repo: Dict[str, str] = {}
    total_blobs_count: int = 0
    with open(common_blob_log, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            log_line = FullLogLine.parse_from_line(COMMON_SEP.join(full_line),
                                                   COMMON_SEP)
            repo_vs_full_l_lines[repo_name].append(log_line)
            blob_hash_vs_repo[log_line.old_blob] = repo_name
            blob_hash_vs_repo[log_line.new_blob] = repo_name

    with open(common_blob_log.parent.joinpath('blobs_collisions.pickle'),
              'rb') as f:
        blobs_hashes_collisions = pickle.load(f)

    all_downloaded_blobs: List[str] = os.listdir(blobs_dir)
    blob_hash_len = len('e6a92d4554046b88461778b99dcce88040aeb6a7')
    blob_hash_len_with_java = len(
        'e6a92d4554046b88461778b99dcce88040aeb6a7.java')
    i = 0
    for downloaded_file in tqdm(all_downloaded_blobs):
        if downloaded_file.endswith('.java'):
            blob_hash = downloaded_file[-blob_hash_len_with_java:-len('.java')]
        else:
            blob_hash = downloaded_file[-blob_hash_len:]

        if blob_hash == '048a922456c7502b508ad50e0ab1735af4aa1beb':
            print(downloaded_file)
            print(f'{blob_hash_vs_repo[blob_hash]}_{blob_hash}.java')
        if blob_hash_vs_repo[blob_hash] == 'kickstarter_android-oss':
            correct_file_name = f'{blob_hash_vs_repo[blob_hash]}_{blob_hash}.java'
            (blobs_dir / downloaded_file).rename(blobs_dir / correct_file_name)
        # NOTE: this early `continue` deliberately skips the clean-up logic below.
        continue

        # if the blob is duplicated, it will definitely have to be re-downloaded
        if blob_hash in blobs_hashes_collisions:
            (blobs_dir / downloaded_file).unlink()
        else:
            # what the file should be named
            correct_file_name = f'{blob_hash_vs_repo[blob_hash]}_{blob_hash}.java'
            # if it was absent, we downloaded a new one but named it incorrectly
            if correct_file_name not in all_downloaded_blobs:
                pass
                # if downloaded_file.endswith('.java'):
                #     if downloaded_file != correct_file_name:
                #         # rename
                #         (blobs_dir / downloaded_file).rename(blobs_dir / correct_file_name)
                # else:
                #     if downloaded_file != correct_file_name[:-len('.java')]:
                #         # rename
                #         (blobs_dir / downloaded_file).rename(blobs_dir / correct_file_name)
            else:
                if downloaded_file != correct_file_name:
                    i += 1
                    # print(downloaded_file)
                    # (blobs_dir / downloaded_file).unlink()

    print(i)
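# Quick check of the slicing convention used above (the file name is hypothetical):
name = 'kickstarter_android-oss_e6a92d4554046b88461778b99dcce88040aeb6a7.java'
assert name[-len('e6a92d4554046b88461778b99dcce88040aeb6a7.java'):-len('.java')] \
    == 'e6a92d4554046b88461778b99dcce88040aeb6a7'
assert name[:-len('e6a92d4554046b88461778b99dcce88040aeb6a7.java')] \
    == 'kickstarter_android-oss_'  # the repo prefix keeps its trailing '_'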
Example #15
0
def check_downloaded_blobs(blobs_dir: Path, common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: DefaultDict[
        str, List[FullLogLine]] = collections.defaultdict(list)
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line),
                                            COMMON_SEP))
            # if total_blobs_count > 17000:
            #     break

    all_downloaded_blobs = os.listdir(blobs_dir)
    all_needed_blobs = set()
    print(f'len all downloaded {len(all_downloaded_blobs)}')
    failed_repo_vs_count = collections.defaultdict(set)
    must_be_repo_vs_count = collections.defaultdict(set)
    list_blobs_hash = []
    repo_vs_blobs = collections.defaultdict(set)
    all_blobs_set = set()
    repos_with_collisions = 0
    collisions_blobs = []
    for repo_name in tqdm(repo_vs_full_log_lines.keys()):
        for changed_file in repo_vs_full_log_lines[repo_name]:
            if changed_file.old_blob != '0000000000000000000000000000000000000000':
                list_blobs_hash.append(changed_file.old_blob)
                repo_vs_blobs[repo_name].add(changed_file.old_blob)
                all_needed_blobs.add(
                    f'{repo_name}_{changed_file.old_blob}.java')
            if changed_file.new_blob != '0000000000000000000000000000000000000000':
                list_blobs_hash.append(changed_file.new_blob)
                repo_vs_blobs[repo_name].add(changed_file.new_blob)
                all_needed_blobs.add(
                    f'{repo_name}_{changed_file.new_blob}.java')
        # all_needed_blobs holds '{repo}_{hash}.java' names, so intersecting it
        # with raw hashes would always be empty; compare against the running set
        # of hashes instead (mirrors get_downloaded_blobs_stat_collisions).
        possible_collisions = all_blobs_set & repo_vs_blobs[repo_name]
        if len(possible_collisions) != 0:
            repos_with_collisions += 1
            collisions_blobs.extend(possible_collisions)
        all_blobs_set.update(repo_vs_blobs[repo_name])

    print(f'Total blobs number = {total_blobs_count}')
    print(
        f'collisions in blobs hash list = {len(list_blobs_hash)}, set = {len(set(list_blobs_hash))}'
    )
    print('collisions blobs')
    print(set(collisions_blobs))
    print(len(collisions_blobs))
    print(len(set(collisions_blobs)))
    print(f'repos with collisions number {repos_with_collisions}')

    blob_counts = collections.Counter(list_blobs_hash)
    i = 0
    for blob_hash, count in blob_counts.items():
        if count > 2:
            # print(f'{blob_hash} - {count}')
            i += 1

    print(
        f"with java  = {len([item for item in all_downloaded_blobs if item.endswith('.java')])}"
    )
    print(
        f"not with java  = {len([item for item in all_downloaded_blobs if not item.endswith('.java')])}"
    )
    not_with_java = [
        item for item in all_downloaded_blobs if not item.endswith('.java')
    ]
    blob_hash_len = len('87601ba3a5911c5d38d426baa530e414d32cc027')
    for file_ in not_with_java:
        blob_hash = file_[-blob_hash_len:]
        if blob_hash in collisions_blobs:
            print('blob from collision')
        else:
            print(blob_hash)
    repo_names = [name[:-blob_hash_len] for name in not_with_java]

    print(list(set(all_downloaded_blobs) - all_needed_blobs)[:10])
    repos_names_that_bugs = [
        blob[:-len('87601ba3a5911c5d38d426baa530e414d32cc027')]
        for blob in (set(all_downloaded_blobs) - all_needed_blobs)
    ]
    print(set(repos_names_that_bugs))
    for repo in failed_repo_vs_count.keys():
        print(
            f'{repo} - in common {len(must_be_repo_vs_count[repo] & failed_repo_vs_count[repo])}'
            f' len must be {len(must_be_repo_vs_count[repo])}'
            f' len failed {len(failed_repo_vs_count[repo])}')