def add_paths_to_one_report(report: Report, commit_files: Dict[str, List[str]], file_limit: int) -> Report:
    """Resolve a repository path for each of the first `file_limit` frames.

    Args:
        report: Report whose frames need a 'path' entry in their meta.
        commit_files: Mapping of bare file name -> candidate repo paths
            present in the commit.
        file_limit: Maximum number of frames to process.

    Returns:
        A new Report built from the processed frames, each carrying a
        'path' meta entry.
    """
    frames_with_paths = []
    for frame in report.frames[:file_limit]:
        # BUG FIX: direct indexing raised KeyError when a frame's file was
        # not present in the commit (e.g. generated or JDK files); fall back
        # to an empty candidate list instead.
        matching_files_for_frame = commit_files.get(frame.meta['file_name'], [])
        frame_path = find_file_for_frame(frame, matching_files_for_frame)
        # NOTE(review): this mutates the original frame's meta dict in place;
        # presumably intentional, since the new Frame shares it.
        frame_meta = frame.meta
        frame_meta['path'] = frame_path
        frames_with_paths.append(Frame(frame.code, frame_meta))
    return Report(report.id, report.exceptions, report.hash, frames_with_paths)
def label_frames(report: Report, commit_hash: str, methods_info: pd.DataFrame) -> Report:
    """Attach a ground-truth 'label' meta entry to every frame of the report.

    Args:
        report: Report whose frames are to be labeled.
        commit_hash: Hash of the fix commit, stored on the returned Report.
        methods_info: DataFrame with 'fixed_method' and 'path' columns
            describing the methods changed by the fix.

    Returns:
        A new Report carrying `commit_hash` and the labeled frames.
    """
    fixed_methods = methods_info.fixed_method.values
    # Reduce each path to its bare file name for matching.
    file_names = methods_info.path.apply(lambda p: p.split('/')[-1])
    labeled_frames = []
    for frame in report.frames:
        meta = frame.meta
        meta['label'] = frame_label(frame, fixed_methods, file_names)
        labeled_frames.append(Frame(frame.code, meta))
    return Report(report.id, report.exceptions, commit_hash, labeled_frames)
def collect_sources_for_all_reports(repo: Repo, path_to_reports: str, file_limit: int = 80):
    """Attach source code to every report in a directory, in place.

    Args:
        repo: Git repository to read file contents from.
        path_to_reports: Directory containing serialized reports; each is
            overwritten with its source-enriched version.
        file_limit: Maximum number of frames per report to fetch sources for.
    """
    reports_success = 0
    for file_name in iterate_reports(path_to_reports):
        path_to_file = os.path.join(path_to_reports, file_name)
        report = Report.load_report(path_to_file)
        # Reports without a fix-commit hash cannot be matched to sources.
        if report.hash == "":
            continue
        report = get_sources_for_report(repo, report, report.hash, file_limit)
        report.save_report(path_to_file)
        reports_success += 1
    # FIX: original message was ungrammatical ("Successed collect code data").
    print(f"Successfully collected code data for {reports_success} reports.")
def add_git_data(repo_path: str, data_dir: str, frame_limit: int):
    """Annotate intermediate reports with git metadata and save them as final.

    Args:
        repo_path: Path to the local git repository clone.
        data_dir: Data root containing the intermediate reports directory;
            final reports are written to its FINAL_REPORTS_DIR subdirectory.
        frame_limit: Maximum number of frames per report to annotate.
    """
    repo = Repo(repo_path, odbt=db.GitDB)
    path_to_reports = os.path.join(data_dir, REPORTS_INTERMEDIATE_DIR)
    reports_save_path = os.path.join(data_dir, FINAL_REPORTS_DIR)
    os.makedirs(reports_save_path, exist_ok=True)
    reports_success = 0
    for file_name in iterate_reports(path_to_reports):
        report_path = os.path.join(path_to_reports, file_name)
        report = Report.load_report(report_path)
        report = add_git_data_to_frames(repo, report, frame_limit)
        report_save_path = os.path.join(reports_save_path, file_name)
        report.save_report(report_save_path)
        reports_success += 1
    # FIX: original message was truncated ("... collect git data for.") and
    # the report count it clearly meant to include was never tracked.
    print(f"Successfully collected git data for {reports_success} reports.")
def get_sources_for_report(repo: Repo, report: Report, commit: str, file_limit: int) -> Report:
    """Fetch pre-fix file contents for frames that have a resolved path.

    Reads each frame's file at `commit~1` (the parent of the fix commit),
    base64-encodes it, and stores it as the frame's code. Frames without a
    path, or whose file cannot be read, are omitted from the result.

    Args:
        repo: Git repository to read file contents from.
        report: Report whose frames should be enriched with source code.
        commit: Fix-commit hash; sources are taken from its parent.
        file_limit: Maximum number of frames to process.

    Returns:
        A new Report containing only the successfully enriched frames.
    """
    enriched_frames = []
    for frame in report.frames[:file_limit]:
        path_in_repo = frame.meta['path']
        if path_in_repo == '':
            continue
        try:
            source_text = get_file_by_commit(repo, commit + "~1", path_in_repo)
            encoded_source = base64.b64encode(source_text.encode('UTF-8'))
            enriched_frames.append(Frame(encoded_source, frame.meta))
        except Exception:
            # Best-effort: note the failing frame and keep going.
            print(report.id, frame.meta['file_name'])
    return Report(report.id, report.exceptions, report.hash, enriched_frames)
def add_git_data_to_frames(repo: Repo, report: Report, frame_limit: int = 80) -> Report:
    """Annotate up to `frame_limit` frames with commit/author git metadata.

    Frames that cannot be matched to a commit (empty report hash, or a
    missing/empty 'path' in their meta) receive zeroed dates and a
    'no_author' placeholder instead.

    Args:
        repo: Git repository used to look up per-file commit info.
        report: Report whose frames should be annotated.
        frame_limit: Maximum number of frames to keep and annotate.

    Returns:
        A new Report built from the annotated frames.
    """
    commit_hash = report.hash
    annotated_frames = []
    for frame in report.frames[:frame_limit]:
        meta = frame.meta
        has_usable_path = bool(commit_hash) and meta.get("path", "") != ""
        saved = save_commit_file_info(repo, frame, meta['path'], commit_hash) if has_usable_path else False
        if not saved:
            # Placeholder metadata for frames we could not resolve in git.
            meta['committed_date'] = 0
            meta['authored_date'] = 0
            meta['author'] = 'no_author'
        annotated_frames.append(Frame(frame.code, meta))
    return Report(report.id, report.exceptions, report.hash, annotated_frames)
def get_all_embeddings(data_dir: str, embs_name: str, files_limit: int = 80):
    """Compute code2seq method embeddings for every final report and pickle them.

    Embeddings are keyed by (commit_hash, method_name). Per-file embeddings
    are cached by (commit_hash, path) so a file shared by several frames is
    only embedded once.

    Args:
        data_dir: Data root; reports are read from its FINAL_REPORTS_DIR
            subdirectory and the pickle is written directly under it.
        embs_name: File name for the pickled embeddings dict.
        files_limit: Unused here; kept for interface compatibility.
    """
    embeddings = {}
    file_embeddings_cache = {}
    model = Code2Seq.load("java")
    path_to_reports = os.path.join(data_dir, FINAL_REPORTS_DIR)
    for file_name in iterate_reports(path_to_reports):
        report = Report.load_report(os.path.join(path_to_reports, file_name))
        commit_hash = report.hash
        for frame in report.frames:
            method_name = frame.meta["method_name"]
            method_key = (commit_hash, method_name)
            file_key = (commit_hash, frame.meta["path"])
            if file_key not in file_embeddings_cache:
                file_embeddings_cache[file_key] = get_file_embeddings(
                    model, frame.get_code_decoded())
            if method_key not in embeddings:
                embeddings[method_key] = get_method_embedding(
                    file_embeddings_cache[file_key], method_name)
    # BUG FIX: pickle.dump requires a binary-mode file handle; opening with
    # 'w' (text mode) raised "TypeError: write() argument must be str".
    with open(os.path.join(data_dir, embs_name), 'wb') as f:
        pickle.dump(embeddings, f)
def label_reports(issues_info: pd.DataFrame, path_to_reports: str, path_to_reports_save: str):
    """Label every base report's frames against its issue's fixed methods.

    Args:
        issues_info: DataFrame linking issues to fix commits and fixed methods.
        path_to_reports: Directory of raw '.json' base reports.
        path_to_reports_save: Directory to save labeled reports into
            (file extension is stripped from the saved name).
    """
    reports_success = 0
    for file_name in iterate_reports(path_to_reports, ext='.json'):
        path_to_file = os.path.join(path_to_reports, file_name)
        report = Report.load_from_base_report(path_to_file)
        fixed_methods = find_fixed_method_for_report(issues_info, report.id)
        report_hash = report.hash
        # Only reports with known fixed methods get a hash from the issue data.
        if fixed_methods.shape[0] != 0:
            report_hash = get_hash(report.id, issues_info)
        report = label_frames(report, report_hash, fixed_methods)
        # Count a report as successful when at least one frame was labeled
        # positive (generator avoids materializing an intermediate list).
        if sum(frame.meta['label'] for frame in report.frames):
            reports_success += 1
        if report.id != 0:
            report.save_report(
                os.path.join(path_to_reports_save, file_name.split('.')[0]))
    # FIX: original message was ungrammatical ("Successed label data").
    print(f"Successfully labeled data for {reports_success} reports.")
def add_paths_to_all_reports(from_repo: Repo, path_to_reports: str,
                             path_to_reports_save: str, file_limit: int = 80):
    """Resolve repository file paths for every report's frames and save results.

    Skips reports with no valid id and reports whose fix commit's parent
    (`hash~1`) cannot be resolved in the repository.

    Args:
        from_repo: Git repository to resolve commits and list files in.
        path_to_reports: Directory of serialized input reports.
        path_to_reports_save: Directory to save path-enriched reports into.
        file_limit: Maximum number of frames per report to resolve.
    """
    reports_success = 0
    for file_name in iterate_reports(path_to_reports):
        path_to_file = os.path.join(path_to_reports, file_name)
        report = Report.load_report(path_to_file)
        if report.id == 0:
            continue
        try:
            # Parent of the fix commit: the tree as it was before the fix.
            commit = from_repo.commit(report.hash + '~1')
            commit_files = list_files_in_commit(commit, from_repo)
        except Exception:
            # Commit may be absent from the local clone; skip this report.
            continue
        report = add_paths_to_one_report(report, commit_files, file_limit=file_limit)
        report.save_report(os.path.join(path_to_reports_save, file_name))
        reports_success += 1
    # FIX: original message was ungrammatical ("Successed add paths").
    print(f"Successfully added paths for {reports_success} reports.")
def train(reports_path: str, save_path: str, model_name: Optional[str]):
    """Train an LSTM frame tagger over the labeled reports in `reports_path`.

    Args:
        reports_path: Directory with labeled, serialized reports.
        save_path: Destination for the trained model. NOTE(review): unused in
            this function body — saving presumably happens in
            `train_lstm_tagger` or the caller; confirm.
        model_name: One of "scuffle", "deep_analyze", "code2seq", or None to
            use the default RoBERTa encoder.

    Returns:
        The trained LstmTagger.

    Raises:
        ValueError: If `model_name` is not a recognized model type.
    """
    # Keep only reports that have frames and at least one positive label.
    reports = []
    for file_name in iterate_reports(reports_path):
        report = Report.load_report(os.path.join(reports_path, file_name))
        if report.frames and sum(frame.meta["label"] for frame in report.frames) > 0:
            reports.append(report)
    # FIX: removed the no-op statement `reports = reports`.

    target = make_target(reports)
    with open("config.json", "r") as f:
        config = json.load(f)

    model_names = ["scuffle", "deep_analyze", "code2seq"]
    if model_name is not None:
        if model_name == "scuffle":
            encoder = ScuffleReportEncoder(**config["models"]["scuffle"]["encoder"]).fit(reports, target)
            tagger = LstmTagger(
                encoder,
                max_len=config["training"]["max_len"],
                **config["models"]["scuffle"]["tagger"]
            )
        elif model_name == "deep_analyze":
            encoder = TfIdfReportEncoder(**config["models"]["deep_analyze"]["encoder"]).fit(reports, target)
            tagger = LstmTagger(
                encoder,
                max_len=config["training"]["max_len"],
                **config["models"]["deep_analyze"]["tagger"]
            )
        elif model_name == "code2seq":
            config_path = config["code2seq_config_path"]
            # NOTE(review): cli_path reads the same "astminer_config_path" key
            # as ast_config_path below — looks like it should be a dedicated
            # CLI-path key. TODO confirm against config.json before changing.
            cli_path = config["astminer_config_path"]
            ast_config_path = config["astminer_config_path"]
            __config = cast(DictConfig, OmegaConf.load(config_path))
            code2seq = Code2Seq.load_from_checkpoint(__config.checkpoint,
                                                     map_location=torch.device("cpu"))
            storage = LabeledPathContextStorage(cli_path, ast_config_path,
                                                code2seq.vocabulary, __config,
                                                **config["code2seq_storage"])
            storage.load_data(reports, mine_files=False, process_mined=False, remove_all=False)
            encoder = Code2SeqReportEncoder(code2seq, storage)
            tagger = LstmTagger(
                encoder,
                max_len=config["training"]["max_len"],
                layers_num=2,
                hidden_dim=200
            )
        else:
            # FIX: reconstructed the f-string that was broken across lines.
            raise ValueError(f"Wrong model type. Should be in {model_names}")
    else:
        encoder = RobertaReportEncoder(frames_count=config["training"]["max_len"], device='cuda')
        tagger = LstmTagger(
            encoder,
            max_len=config["training"]["max_len"],
            layers_num=2,
            hidden_dim=250
        )

    tagger = train_lstm_tagger(tagger, reports, target, **config["training"])
    return tagger
def read_report(report_path: str) -> Tuple[Report, List[int]]:
    """Load a report from disk together with its per-frame labels.

    Args:
        report_path: Path to the serialized report.

    Returns:
        The loaded Report and the list of each frame's 'label' meta value.
    """
    loaded_report = Report.load_report(report_path)
    labels = [frame.meta["label"] for frame in loaded_report.frames]
    return loaded_report, labels