def get_downloaded_blobs_stat_collisions(blobs_dir: Path, common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: Dict[str, List[FullLogLine]] = collections.defaultdict(list)
    full_log_line_list = []
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            full_log_line_list.append(COMMON_SEP.join(full_line))
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line), COMMON_SEP))

    common_blobs = set()
    line_counts = collections.Counter(full_log_line_list)
    for line, count in line_counts.items():
        if count > 1:
            # print(f'{count} - {line}')
            fll = FullLogLine.parse_from_line(line, COMMON_SEP)
            common_blobs.add(fll.old_blob)
            common_blobs.add(fll.new_blob)
    print(f'count of common blobs is {len(common_blobs)}')

    all_blobs_hashes = set()
    blobs_hashes_collisions = set()
    repo_vs_blobs_hashes = collections.defaultdict(set)
    for repo_name in tqdm(repo_vs_full_log_lines.keys()):
        for changed_file in repo_vs_full_log_lines[repo_name]:
            if changed_file.old_blob != '0000000000000000000000000000000000000000':
                repo_vs_blobs_hashes[repo_name].add(changed_file.old_blob)
            if changed_file.new_blob != '0000000000000000000000000000000000000000':
                repo_vs_blobs_hashes[repo_name].add(changed_file.new_blob)
        # check there is no collision for this repo
        possible_collisions = all_blobs_hashes & repo_vs_blobs_hashes[repo_name]
        if len(possible_collisions) != 0:
            blobs_hashes_collisions.update(possible_collisions)
        all_blobs_hashes.update(repo_vs_blobs_hashes[repo_name])

    print(f'Collisions number {len(blobs_hashes_collisions)}')
    print(blobs_hashes_collisions - common_blobs)
    print(f'Intersection = {len(blobs_hashes_collisions & common_blobs)}')
    print(list(blobs_hashes_collisions)[:10])
    print(f'all blobs hashes {len(all_blobs_hashes)}')
    # with open(common_blobs_file.parent.joinpath('blobs_collisions.pickle'), 'wb') as f:
    #     pickle.dump(blobs_hashes_collisions, f)

    all_downloaded_files = os.listdir(blobs_dir)
    blob_hash_len_with_java, len_java = len('e6a92d4554046b88461778b99dcce88040aeb6a7.java'), len('.java')
    for downloaded_file in all_downloaded_files:
        blob_hash = downloaded_file[-blob_hash_len_with_java:-len_java]
        if blob_hash in common_blobs:
            print(downloaded_file)
def download_blobs_from_filtered_commits(changed_files_log: Path,
                                         lines_range: Tuple[int, int],
                                         filtered_commits_json: Path,
                                         blobs_dir: Path,
                                         git_dir: Path) -> None:
    start_line, end_line = lines_range
    print(f"start line: {start_line}, end_line: {end_line}")

    with open(filtered_commits_json, 'r') as f:
        commits = json.load(f)
    commits: List[CommitDiff] = [CommitDiff.from_dict(commit) for commit in commits]
    filtered_commits: Set[Commit] = {commit.commit for commit in commits}
    print(f'Number of filtered commits is {len(filtered_commits)}')

    with open(changed_files_log, 'r') as full_log_file:
        # skip everything before the requested line range
        i = 0
        for _ in range(start_line):
            full_log_file.readline()
            i += 1
        for line in full_log_file:
            i += 1
            if start_line <= i <= end_line:
                if i % 20 == 0:
                    print(f"Start to process {i} file; ({start_line}, {end_line})")
                full_log_line: FullLogLine = FullLogLine.parse_from_line(line, COMMON_SEP)
                if full_log_line.commit not in filtered_commits:
                    continue
                all_blobs = os.listdir(blobs_dir)
                if full_log_line.old_blob not in all_blobs:
                    download_blob_content(full_log_line.old_blob, blobs_dir, git_dir)
                if full_log_line.new_blob not in all_blobs:
                    download_blob_content(full_log_line.new_blob, blobs_dir, git_dir)
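# Illustrative call of download_blobs_from_filtered_commits; every path below is
# a placeholder, not a file from this project. lines_range is interpreted as
# (start_line, end_line) within changed_files_log, matching how the function
# above slices the log.
def _demo_download_blobs_from_filtered_commits() -> None:
    download_blobs_from_filtered_commits(
        changed_files_log=Path('data/changed_files.log'),
        lines_range=(0, 1000),
        filtered_commits_json=Path('data/filtered_commits.json'),
        blobs_dir=Path('data/blobs'),
        git_dir=Path('data/repo/.git'))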
def download_blobs_parallel(common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: Dict[str, List[FullLogLine]] = collections.defaultdict(list)
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line), COMMON_SEP))

    print(f'Total blobs count = {total_blobs_count}')
    print(f'We need to download data for {len(repo_vs_full_log_lines)} repos, '
          f'but part of it is already downloaded')

    blobs_dir = Path('../../../new_data/raw_data/blobs_200')
    repos_not_to_download = []
    downloaded_blobs_files = set(os.listdir(blobs_dir))
    for repo in tqdm(repo_vs_full_log_lines.keys()):
        if not do_we_need_to_clone_repo(repo_vs_full_log_lines[repo], repo,
                                        blobs_dir, downloaded_blobs_files):
            repos_not_to_download.append(repo)
    for del_repo in repos_not_to_download:
        del repo_vs_full_log_lines[del_repo]

    print(len(repo_vs_full_log_lines['aosp-mirror_platform_frameworks_base']))
    print(f'So now we need to download {len(repo_vs_full_log_lines)} repos')

    repos = [(repo, downloaded_blobs_files) for repo in repo_vs_full_log_lines.keys()]
    with multiprocessing.Pool() as pool:
        for _ in tqdm(pool.imap_unordered(process_one_dir, repos), total=len(repos)):
            pass
def process_one_dir(repo_name_input):
    # the argument is a (repo_name, downloaded_blobs_files) tuple coming from Pool.imap_unordered
    repo_name_input, downloaded_blobs_files = repo_name_input
    blobs_dir: Path = Path('../../../new_data/raw_data/blobs_200')
    filtered_diff_dir: Path = Path('../../../new_data/processed_data/two_inputs_200')
    dir_for_repos = filtered_diff_dir.parent
    common_blobs_file: Path = filtered_diff_dir.parent.joinpath('common_blobs_200.log')

    full_log_lines: List[FullLogLine] = []
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            [repo_name, *full_line] = line.split(COMMON_SEP)
            if repo_name == repo_name_input:
                full_log_lines.append(
                    FullLogLine.parse_from_line(COMMON_SEP.join(full_line), COMMON_SEP))

    repo_name_for_clone = repo_name_input.replace('_', '/')
    if repo_name_for_clone == 'aosp-mirror/platform/frameworks/base':
        repo_name_for_clone = 'aosp-mirror/platform_frameworks_base'

    if full_log_lines:
        with LocalGitRepository(repo_name_for_clone, dir_for_repos) as git_dir:
            for log_line in full_log_lines:
                if not is_blob_downloaded(log_line.old_blob, repo_name_input,
                                          blobs_dir, downloaded_blobs_files):
                    download_blob_content_other_output_file_name(
                        log_line.old_blob, repo_name_input, blobs_dir, git_dir)
                if not is_blob_downloaded(log_line.new_blob, repo_name_input,
                                          blobs_dir, downloaded_blobs_files):
                    download_blob_content_other_output_file_name(
                        log_line.new_blob, repo_name_input, blobs_dir, git_dir)
def get_commits_from_full_log(full_log: Path) -> Set[Commit]:
    result: Set[Commit] = set()
    with open(full_log, 'r') as f:
        for line in f:
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            result.add(full_log_line.commit)
    return result
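# A minimal usage sketch for get_commits_from_full_log. 'data/full.log' is a
# hypothetical path, not a file from this project; the function only needs a
# full-log file whose lines FullLogLine can parse with the default separator.
def _demo_get_commits_from_full_log() -> None:
    full_log = Path('data/full.log')  # placeholder, replace with a real full log
    commits = get_commits_from_full_log(full_log)
    print(f'{len(commits)} unique commits found in {full_log}')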
def get_commit_vs_blobs(full_log: Path, sep: str = SEPARATOR) -> Dict[Commit, List[FullLogLine]]:
    commit_vs_blobs: DefaultDict[Commit, List[FullLogLine]] = collections.defaultdict(list)
    with open(full_log, 'r', encoding='utf-8') as full_log_file:
        for line in full_log_file:
            if line.startswith("commit_hash"):
                continue
            full_log_line = FullLogLine.parse_from_line(line, separator=sep)
            commit_vs_blobs[full_log_line.commit].append(full_log_line)
    return commit_vs_blobs
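# Illustrative only: shows how the commit -> changed-blobs mapping returned by
# get_commit_vs_blobs can be inspected. 'data/full.log' is a hypothetical path,
# and the module-level SEPARATOR default is assumed.
def _demo_commit_vs_blobs() -> None:
    commit_vs_blobs = get_commit_vs_blobs(Path('data/full.log'))
    for commit, changed_files in commit_vs_blobs.items():
        # each FullLogLine keeps the old/new blob hashes for one changed file
        print(f'{commit}: {len(changed_files)} changed files')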
def parse_full_log(full_log: Path) -> DefaultDict[Blob, List[NextBlobMetaInfo]]:
    blobs_history: DefaultDict[Blob, List[NextBlobMetaInfo]] = collections.defaultdict(list)
    with open(full_log, 'r') as full_log_file:
        for line in full_log_file:
            if line.startswith("commit_hash"):
                continue
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            blobs_history[full_log_line.old_blob].append(
                NextBlobMetaInfo(full_log_line.commit,
                                 full_log_line.new_blob,
                                 full_log_line.message))
    return blobs_history
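# A small sketch of how the blob history built by parse_full_log can be walked:
# for every starting blob it lists the blob it became and the commit that made
# the change. The path is a placeholder, and the attribute names 'commit' and
# 'new_blob' on NextBlobMetaInfo are assumed from its constructor call above.
def _demo_blob_history() -> None:
    blobs_history = parse_full_log(Path('data/full.log'))
    for old_blob, next_versions in blobs_history.items():
        for next_blob_meta in next_versions:
            print(f'{old_blob} -> {next_blob_meta.new_blob} in commit {next_blob_meta.commit}')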
def create_new_full_log_file(old_full_log: Path, new_full_log: Path,
                             commits_for_train_file: Path):
    with open(commits_for_train_file, 'rb') as f:
        needed_commits: Set[Commit] = pickle.load(f)

    with open(new_full_log, 'w') as new_full_log_file, \
            open(old_full_log, 'r') as old_full_log_file:
        for line in old_full_log_file:
            if line.startswith("commit_hash"):
                continue
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            if full_log_line.commit in needed_commits:
                new_full_log_file.write(line)
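# Hypothetical invocation of create_new_full_log_file: the pickle is expected to
# hold a Set[Commit] (e.g. one part of the split produced by
# split_dataset_with_no_file_commits_for_train below). All paths are placeholders.
def _demo_filter_full_log() -> None:
    create_new_full_log_file(old_full_log=Path('data/full.log'),
                             new_full_log=Path('data/full_train.log'),
                             commits_for_train_file=Path('data/commits_for_train.pickle'))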
def get_repo_commit_vs_repo_blobs(full_log: Path, sep: str = SEPARATOR) -> Dict[Commit, List[FullLogLine]]:
    repo_commit_vs_repo_blobs: DefaultDict[Commit, List[FullLogLine]] = collections.defaultdict(list)
    with open(full_log, 'r') as full_log_file:
        for line in full_log_file:
            [repo_name, *full_line] = line.split(COMMON_SEP)
            full_log_line = FullLogLine.parse_from_line(COMMON_SEP.join(full_line), separator=sep)
            full_log_line.old_blob = repo_name + '_' + full_log_line.old_blob
            full_log_line.new_blob = repo_name + '_' + full_log_line.new_blob
            repo_commit_vs_repo_blobs[repo_name + '_' + full_log_line.commit].append(full_log_line)
    return repo_commit_vs_repo_blobs
def download_blobs(dir_for_repos: Path, blobs_dir: Path, common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: Dict[str, List[FullLogLine]] = collections.defaultdict(list)
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line), COMMON_SEP))

    for repo_name in tqdm(repo_vs_full_log_lines.keys()):
        repo_name_for_clone = repo_name.replace('_', '/')
        if do_we_need_to_clone_repo(repo_vs_full_log_lines[repo_name], repo_name, blobs_dir):
            pass
def split_dataset_with_no_file_commits_for_train(full_log: Path, output: Path):
    """
    The com_log of the camel dataset contains the full history of the camel repo,
    but for now only the last 4000 commits are used, and only the needed commits
    are present in full_log.
    """
    commits: Set[Tuple[Message, Commit]] = set()
    with open(full_log, 'r') as f:
        for line in f:
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            commits.add((full_log_line.message, full_log_line.commit))

    commits: List[Tuple[Message, Commit]] = list(commits)
    print(f"Len of all commits = {len(commits)}")
    shuffle(commits)

    # split dataset
    commits_train, commits_tmp = split_in_two_parts(commits, 0.65)
    # move commits between parts until no test/val message also appears in train
    if is_messages_intersected(commits_train, commits_tmp):
        commits_train, commits_tmp = fix_intersection(commits_train, commits_tmp)
    print(f"All size = {len(commits)}. Train size = {len(commits_train)}.")
    print(f"But train should be {0.7 * len(commits)}")
    print(f"Part size is {len(commits_train) / len(commits)}")

    commits_val, commits_test = split_in_two_parts(commits_tmp, 0.5)
    print(f"train: {len(commits_train)}, test: {len(commits_test)}, val: {len(commits_val)}")

    splitted_commits = {
        DatasetPart.TRAIN: {commit for _, commit in commits_train},
        DatasetPart.TEST: {commit for _, commit in commits_test},
        DatasetPart.VAL: {commit for _, commit in commits_val}
    }
    with open(output, 'wb') as output_file:
        pickle.dump(splitted_commits, output_file)
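# split_in_two_parts is used above but not defined in this section. A minimal
# sketch of how such a helper could look, assuming it simply cuts a shuffled
# list at the given fraction; the real implementation may differ.
def _split_in_two_parts_sketch(items: List, first_part_ratio: float) -> Tuple[List, List]:
    cut = int(len(items) * first_part_ratio)
    return items[:cut], items[cut:]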
def testTestMessagesNotInTrainMessages(self):
    with open(self.split_commits, 'rb') as sdf:
        splitted_dataset: Dict[DatasetPart, Set[str]] = pickle.load(sdf)

    unique_messages: Dict[DatasetPart, Set[str]] = collections.defaultdict(set)
    all_messages: Dict[DatasetPart, List[str]] = collections.defaultdict(list)
    with open(self.full_log, 'r') as f:
        for line in f:
            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            key = [key for key, commits in splitted_dataset.items()
                   if full_log_line.commit in commits]
            if len(key) == 1:
                key = key[0]
                unique_messages[key].add(full_log_line.message)
                all_messages[key].append(full_log_line.message)

    print(f"tr: {len(unique_messages[DatasetPart.TRAIN])}, {len(all_messages[DatasetPart.TRAIN])}")
    print(f"t: {len(unique_messages[DatasetPart.TEST])}, {len(all_messages[DatasetPart.TEST])}")
    print(f"v: {len(unique_messages[DatasetPart.VAL])}, {len(all_messages[DatasetPart.VAL])}")
    print(f"tr & t {len(unique_messages[DatasetPart.TRAIN] & unique_messages[DatasetPart.TEST])}")
    print(f"tr & v {len(unique_messages[DatasetPart.TRAIN] & unique_messages[DatasetPart.VAL])}")
def insert_results_in_common_csv(full_log: Path, code2seq: Path, output: Path):
    processed_commits: Set[Commit] = set()
    i, k = 0, 0
    commits_vs_positions: DefaultDict[Commit, List[Tuple[int, int]]] = parse_result_file(code2seq)
    print(f"Finished parsing file {code2seq}")

    output_line_template = Template(
        '$commit$sep$file$sep$status$sep'
        '$original_message$sep$function_name$sep$predicted_message$sep\n')

    with open(full_log, 'r') as full_log_file, \
            open(output, 'w') as output_file, \
            open(code2seq, 'rb') as code2seq_file:
        for line in full_log_file:
            i += 1
            if i % 20 == 0:
                print(f"{i} line in full log")
            if line.startswith("commit_hash"):
                continue

            full_log_line: FullLogLine = FullLogLine.parse_from_line(line)
            message: Message = full_log_line.message
            if message.startswith("This commit was manufactured by cvs2svn"):
                if full_log_line.commit not in processed_commits:
                    output_file.write(
                        output_line_template.substitute(commit=full_log_line.commit,
                                                        file=full_log_line.file,
                                                        status=full_log_line.status,
                                                        original_message=message,
                                                        function_name="",
                                                        predicted_message="",
                                                        sep="^"))
                    processed_commits.add(full_log_line.commit)
                else:
                    output_file.write(
                        output_line_template.substitute(commit="",
                                                        file=full_log_line.file,
                                                        status=full_log_line.status,
                                                        original_message=message,
                                                        function_name="",
                                                        predicted_message="",
                                                        sep="^"))
                continue

            predicted_results: List[PredictedResults] = \
                PredictedResults.find_results_for_commit_and_blobs(
                    commits_vs_positions[full_log_line.commit],
                    full_log_line.old_blob,
                    full_log_line.new_blob,
                    code2seq_file)

            if message == "no message" or message == "*** empty log message ***":
                message = Message(" ")

            if len(predicted_results) == 0:
                if full_log_line.commit not in processed_commits:
                    output_file.write(
                        output_line_template.substitute(commit=full_log_line.commit,
                                                        file=full_log_line.file,
                                                        status=full_log_line.status,
                                                        original_message=message,
                                                        function_name="",
                                                        predicted_message="",
                                                        sep="^"))
                    processed_commits.add(full_log_line.commit)
                else:
                    output_file.write(
                        output_line_template.substitute(commit="",
                                                        file=full_log_line.file,
                                                        status=full_log_line.status,
                                                        original_message=message,
                                                        function_name="",
                                                        predicted_message="",
                                                        sep="^"))
            else:
                for prediction in predicted_results:
                    k += 1
                    if k % 10 == 0:
                        print(f"write {k} generated annotation")
                    if full_log_line.commit not in processed_commits:
                        output_file.write(
                            output_line_template.substitute(commit=full_log_line.commit,
                                                            file=full_log_line.file,
                                                            status=full_log_line.status,
                                                            original_message=message,
                                                            function_name=prediction.function_name,
                                                            predicted_message=prediction.predicted_message,
                                                            sep="^"))
                    else:
                        output_file.write(
                            output_line_template.substitute(commit="",
                                                            file=full_log_line.file,
                                                            status=full_log_line.status,
                                                            original_message=message,
                                                            function_name=prediction.function_name,
                                                            predicted_message=prediction.predicted_message,
                                                            sep="^"))
                    processed_commits.add(full_log_line.commit)
def rename_already_downloaded(blobs_dir: Path, common_blob_log: Path) -> None:
    repo_vs_full_l_lines: Dict[str, List[FullLogLine]] = collections.defaultdict(list)
    blob_hash_vs_repo: Dict[str, str] = {}
    total_blobs_count: int = 0
    with open(common_blob_log, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            log_line = FullLogLine.parse_from_line(COMMON_SEP.join(full_line), COMMON_SEP)
            repo_vs_full_l_lines[repo_name].append(log_line)
            blob_hash_vs_repo[log_line.old_blob] = repo_name
            blob_hash_vs_repo[log_line.new_blob] = repo_name

    with open(common_blob_log.parent.joinpath('blobs_collisions.pickle'), 'rb') as f:
        blobs_hashes_collisions = pickle.load(f)

    all_downloaded_blobs: List[str] = os.listdir(blobs_dir)
    blob_hash_len = len('e6a92d4554046b88461778b99dcce88040aeb6a7')
    blob_hash_len_with_java = len('e6a92d4554046b88461778b99dcce88040aeb6a7.java')
    i = 0
    for downloaded_file in tqdm(all_downloaded_blobs):
        if downloaded_file.endswith('.java'):
            blob_hash = downloaded_file[-blob_hash_len_with_java:-len('.java')]
        else:
            blob_hash = downloaded_file[-blob_hash_len:]

        if blob_hash == '048a922456c7502b508ad50e0ab1735af4aa1beb':
            print(downloaded_file)
            print(f'{blob_hash_vs_repo[blob_hash]}_{blob_hash}.java')

        if blob_hash_vs_repo[blob_hash] == 'kickstarter_android-oss':
            correct_file_name = f'{blob_hash_vs_repo[blob_hash]}_{blob_hash}.java'
            (blobs_dir / downloaded_file).rename(blobs_dir / correct_file_name)
            continue

        # if this blob hash is duplicated across repos, it will have to be re-downloaded anyway
        if blob_hash in blobs_hashes_collisions:
            (blobs_dir / downloaded_file).unlink()
        else:
            # the name the file should have
            correct_file_name = f'{blob_hash_vs_repo[blob_hash]}_{blob_hash}.java'
            # if it was not there before, we downloaded a new blob but named it incorrectly
            if correct_file_name not in all_downloaded_blobs:
                pass
                # if downloaded_file.endswith('.java'):
                #     if downloaded_file != correct_file_name:
                #         # rename
                #         (blobs_dir / downloaded_file).rename(blobs_dir / correct_file_name)
                # else:
                #     if downloaded_file != correct_file_name[:-len('.java')]:
                #         # rename
                #         (blobs_dir / downloaded_file).rename(blobs_dir / correct_file_name)
            else:
                if downloaded_file != correct_file_name:
                    i += 1
                    # print(downloaded_file)
                    # (blobs_dir / downloaded_file).unlink()
    print(i)
def check_downloaded_blobs(blobs_dir: Path, common_blobs_file: Path) -> None:
    repo_vs_full_log_lines: Dict[str, List[FullLogLine]] = collections.defaultdict(list)
    total_blobs_count: int = 0
    with open(common_blobs_file, 'r') as common_blobs_f:
        for line in common_blobs_f:
            total_blobs_count += 2
            [repo_name, *full_line] = line.split(COMMON_SEP)
            repo_vs_full_log_lines[repo_name].append(
                FullLogLine.parse_from_line(COMMON_SEP.join(full_line), COMMON_SEP))
            # if total_blobs_count > 17000:
            #     break

    all_downloaded_blobs = os.listdir(blobs_dir)
    all_needed_blobs = set()
    print(f'len all downloaded {len(all_downloaded_blobs)}')

    failed_repo_vs_count = collections.defaultdict(set)
    must_be_repo_vs_count = collections.defaultdict(set)
    list_blobs_hash = []
    repo_vs_blobs = collections.defaultdict(set)
    all_blobs_set = set()
    repos_with_collisions = 0
    collisions_blobs = []
    for repo_name in tqdm(repo_vs_full_log_lines.keys()):
        for changed_file in repo_vs_full_log_lines[repo_name]:
            if changed_file.old_blob != '0000000000000000000000000000000000000000':
                list_blobs_hash.append(changed_file.old_blob)
                repo_vs_blobs[repo_name].add(changed_file.old_blob)
                all_needed_blobs.add(f'{repo_name}_{changed_file.old_blob}.java')
            if changed_file.new_blob != '0000000000000000000000000000000000000000':
                list_blobs_hash.append(changed_file.new_blob)
                repo_vs_blobs[repo_name].add(changed_file.new_blob)
                all_needed_blobs.add(f'{repo_name}_{changed_file.new_blob}.java')
        # blob hashes that also occur in previously processed repos are collisions
        possible_collisions = all_blobs_set & repo_vs_blobs[repo_name]
        if len(possible_collisions) != 0:
            repos_with_collisions += 1
            collisions_blobs.extend(possible_collisions)
        all_blobs_set.update(repo_vs_blobs[repo_name])

    print(f'Total blobs number = {total_blobs_count}')
    print(f'collisions in blobs hash list = {len(list_blobs_hash)}, set = {len(set(list_blobs_hash))}')
    print('collisions blobs')
    print(set(collisions_blobs))
    print(len(collisions_blobs))
    print(len(set(collisions_blobs)))
    print(f'repos with collisions number {repos_with_collisions}')

    blob_hash_counts = collections.Counter(list_blobs_hash)
    i = 0
    for blob_hash, count in blob_hash_counts.items():
        if count > 2:
            # print(f'{blob_hash} - {count}')
            i += 1

    print(f"with java = {len([item for item in all_downloaded_blobs if item.endswith('.java')])}")
    print(f"not with java = {len([item for item in all_downloaded_blobs if not item.endswith('.java')])}")

    not_with_java = [item for item in all_downloaded_blobs if not item.endswith('.java')]
    blob_hash_len = len('87601ba3a5911c5d38d426baa530e414d32cc027')
    for file_ in not_with_java:
        blob_hash = file_[-blob_hash_len:]
        if blob_hash in collisions_blobs:
            print('blob from collision')
        else:
            print(blob_hash)
    repo_names = [name[:-blob_hash_len] for name in not_with_java]

    print(list(set(all_downloaded_blobs) - all_needed_blobs)[:10])
    repos_names_that_bugs = [
        blob[:-len('87601ba3a5911c5d38d426baa530e414d32cc027')]
        for blob in (set(all_downloaded_blobs) - all_needed_blobs)
    ]
    print(set(repos_names_that_bugs))

    for repo in failed_repo_vs_count.keys():
        print(f'{repo} - in common {len(must_be_repo_vs_count[repo] & failed_repo_vs_count[repo])}'
              f' len must be {len(must_be_repo_vs_count[repo])}'
              f' len failed {len(failed_repo_vs_count[repo])}')