Example #1
0
def add_paths_to_one_report(report: Report, commit_files: Dict[str, List[str]],
                            file_limit: int) -> Report:
    """Resolve a repository path for each of the first `file_limit` frames.

    For every kept frame, looks up candidate files by the frame's
    `file_name` meta key, picks one via `find_file_for_frame`, and stores
    it under the frame's `path` meta key. Frames beyond `file_limit` are
    dropped from the returned report.

    NOTE: frame meta dicts are updated in place and reused in the new
    Frame objects.
    """
    updated_frames = []
    for frame in report.frames[:file_limit]:
        candidates = commit_files[frame.meta['file_name']]
        meta = frame.meta
        meta['path'] = find_file_for_frame(frame, candidates)
        updated_frames.append(Frame(frame.code, meta))

    return Report(report.id, report.exceptions, report.hash, updated_frames)
def label_frames(report: Report, commit_hash: str,
                 methods_info: pd.DataFrame) -> Report:
    """Attach a ground-truth `label` meta key to every frame of `report`.

    Labels come from `frame_label`, which matches frames against the fixed
    methods and (basename-only) file paths taken from `methods_info`.
    Returns a new Report carrying `commit_hash` instead of the original hash.
    """
    fixed_methods = methods_info.fixed_method.values
    file_names = methods_info.path.apply(lambda p: p.split('/')[-1])

    labeled_frames = []
    for frame in report.frames:
        meta = frame.meta
        meta['label'] = frame_label(frame, fixed_methods, file_names)
        labeled_frames.append(Frame(frame.code, meta))

    return Report(report.id, report.exceptions, commit_hash,
                  labeled_frames)
Example #3
0
def collect_sources_for_all_reports(repo: Repo,
                                    path_to_reports: str,
                                    file_limit=80):
    """Fetch source code for every report in `path_to_reports`, in place.

    Reports without a commit hash are skipped. Each remaining report is
    enriched via `get_sources_for_report` and saved back to its original
    file. Prints a summary count at the end.
    """
    reports_success = 0
    for file_name in iterate_reports(path_to_reports):
        report_file = os.path.join(path_to_reports, file_name)
        report = Report.load_report(report_file)
        # No commit hash -> nothing to fetch sources against.
        if report.hash == "":
            continue

        enriched = get_sources_for_report(repo, report, report.hash,
                                          file_limit)
        enriched.save_report(report_file)
        reports_success += 1

    print(f"Successed collect code data for {reports_success} reports.")
Example #4
0
def add_git_data(repo_path: str, data_dir: str, frame_limit: int):
    """Enrich intermediate reports with git metadata and save final reports.

    Loads every report from `data_dir/REPORTS_INTERMEDIATE_DIR`, attaches
    per-frame git data (author, dates) via `add_git_data_to_frames`, and
    writes the result under `data_dir/FINAL_REPORTS_DIR`.

    :param repo_path: path to the git repository to read blame data from
    :param data_dir: directory containing the intermediate/final report dirs
    :param frame_limit: max number of frames per report to process
    """
    repo = Repo(repo_path, odbt=db.GitDB)

    path_to_reports = os.path.join(data_dir, REPORTS_INTERMEDIATE_DIR)
    reports_save_path = os.path.join(data_dir, FINAL_REPORTS_DIR)
    os.makedirs(reports_save_path, exist_ok=True)

    reports_success = 0
    for file_name in iterate_reports(path_to_reports):
        report_path = os.path.join(path_to_reports, file_name)
        report = Report.load_report(report_path)
        report = add_git_data_to_frames(repo, report, frame_limit)

        report_save_path = os.path.join(reports_save_path, file_name)
        report.save_report(report_save_path)
        reports_success += 1

    # BUG FIX: the original message was truncated ("... git data for.") and
    # no count was tracked; report a count like the sibling functions do.
    print(f"Successfully collect git data for {reports_success} reports.")
Example #5
0
def get_sources_for_report(repo: Repo, report: Report, commit: str,
                           file_limit: int) -> Report:
    """Attach base64-encoded source code to the first `file_limit` frames.

    For each frame with a non-empty `path`, reads the file content as of
    `commit~1` and stores it (base64-encoded) as the frame's code. Frames
    whose path is empty, or whose file cannot be read, are dropped;
    failures are reported by printing the report id and file name
    (best-effort, intentionally not raised).
    """
    enriched_frames = []
    for frame in report.frames[:file_limit]:
        path = frame.meta['path']
        if path == '':
            continue
        try:
            source = get_file_by_commit(repo, commit + "~1", path)
            # NOTE: this is base64 encoding (reversible), not hashing.
            encoded_source = base64.b64encode(source.encode('UTF-8'))
            enriched_frames.append(Frame(encoded_source, frame.meta))
        except Exception:
            print(report.id, frame.meta['file_name'])

    return Report(report.id, report.exceptions, report.hash, enriched_frames)
Example #6
0
def add_git_data_to_frames(repo: Repo,
                           report: Report,
                           frame_limit: int = 80) -> Report:
    """Attach commit metadata to the first `frame_limit` frames of a report.

    When the report has a commit hash and a frame carries a non-empty
    `path`, `save_commit_file_info` is asked to fill in git data. If that
    fails (or there is nothing to look up), placeholder values are written:
    zero dates and author 'no_author'. Meta dicts are updated in place.
    """
    commit_hash = report.hash
    updated_frames = []
    for frame in report.frames[:frame_limit]:
        meta = frame.meta
        has_path = commit_hash and "path" in meta and meta['path'] != ""
        saved = (save_commit_file_info(repo, frame, meta['path'], commit_hash)
                 if has_path else False)
        if not saved:
            # Fallback placeholders so downstream code always finds the keys.
            meta['committed_date'] = 0
            meta['authored_date'] = 0
            meta['author'] = 'no_author'
        updated_frames.append(Frame(frame.code, meta))

    return Report(report.id, report.exceptions, report.hash, updated_frames)
Example #7
0
def get_all_embeddings(data_dir: str, embs_name: str, files_limit: int = 80):
    """Compute code2seq method embeddings for all frames of all final reports.

    Caches per-file embeddings so each (commit_hash, path) pair is embedded
    only once, then derives one embedding per (commit_hash, method_name)
    and pickles the resulting mapping to `data_dir/embs_name`.

    NOTE(review): `files_limit` is accepted but never used — kept for
    interface compatibility; confirm whether frames should be truncated.

    :param data_dir: directory containing FINAL_REPORTS_DIR and the output
    :param embs_name: output file name for the pickled embeddings dict
    :param files_limit: currently unused (see note above)
    """
    embeddings = {}
    file_embeddings_cache = {}
    model = Code2Seq.load("java")
    path_to_reports = os.path.join(data_dir, FINAL_REPORTS_DIR)
    for file_name in iterate_reports(path_to_reports):
        report = Report.load_report(os.path.join(path_to_reports, file_name))
        commit_hash = report.hash
        for frame in report.frames:
            method_name = frame.meta["method_name"]
            method_key = (commit_hash, method_name)
            file_key = (commit_hash, frame.meta["path"])
            # Embed each file at most once per commit.
            if file_key not in file_embeddings_cache:
                file_embeddings_cache[file_key] = get_file_embeddings(
                    model, frame.get_code_decoded())
            if method_key not in embeddings:
                embeddings[method_key] = get_method_embedding(
                    file_embeddings_cache[file_key], method_name)

    # BUG FIX: pickle writes bytes — opening in text mode ('w') raises
    # TypeError at dump time; the file must be opened in binary mode.
    with open(os.path.join(data_dir, embs_name), 'wb') as f:
        pickle.dump(embeddings, f)
def label_reports(issues_info: pd.DataFrame, path_to_reports: str,
                  path_to_reports_save: str):
    """Label the frames of every base report and save the labeled reports.

    For each `.json` report, looks up its fixed methods in `issues_info`;
    if any exist, the report's hash is replaced by the fix commit hash.
    Frames are labeled via `label_frames`. Reports with a non-zero id are
    saved (file name without extension); a report counts as "successful"
    when at least one of its frames has a truthy label.
    """
    reports_success = 0
    for file_name in iterate_reports(path_to_reports, ext='.json'):
        path_to_file = os.path.join(path_to_reports, file_name)
        report = Report.load_from_base_report(path_to_file)

        fixed_methods = find_fixed_method_for_report(issues_info, report.id)
        report_hash = report.hash
        if fixed_methods.shape[0] != 0:
            report_hash = get_hash(report.id, issues_info)

        report = label_frames(report, report_hash, fixed_methods)

        # Count the report as a success iff the label sum is truthy.
        if sum(frame.meta['label'] for frame in report.frames):
            reports_success += 1
        if report.id != 0:
            save_name = file_name.split('.')[0]
            report.save_report(
                os.path.join(path_to_reports_save, save_name))

    print(f"Successed label data for {reports_success} reports.")
Example #9
0
def add_paths_to_all_reports(from_repo: Repo,
                             path_to_reports: str,
                             path_to_reports_save: str,
                             file_limit: int = 80):
    """Resolve file paths for every report and save the updated reports.

    Skips reports with id 0 and reports whose parent commit (`hash~1`)
    cannot be resolved or listed. For the rest, frame paths are filled in
    by `add_paths_to_one_report` and the report is saved under
    `path_to_reports_save`. Prints a summary count at the end.
    """
    reports_success = 0
    for file_name in iterate_reports(path_to_reports):
        source_path = os.path.join(path_to_reports, file_name)
        report = Report.load_report(source_path)
        if report.id == 0:
            continue
        try:
            parent_commit = from_repo.commit(report.hash + '~1')
            commit_files = list_files_in_commit(parent_commit, from_repo)
        except Exception:
            # Best-effort: unresolved commits are simply skipped.
            continue

        report = add_paths_to_one_report(report,
                                         commit_files,
                                         file_limit=file_limit)
        report.save_report(os.path.join(path_to_reports_save, file_name))
        reports_success += 1

    print(f"Successed add paths for {reports_success} reports.")
Example #10
0
def train(reports_path: str, save_path: str, model_name: Optional[str]):
    """Train an LSTM frame tagger over labeled reports.

    Loads every report from `reports_path`, keeping only those with at
    least one positively-labeled frame, builds the encoder selected by
    `model_name` ("scuffle", "deep_analyze", "code2seq", or None for the
    RoBERTa default), and trains an LstmTagger with the settings from
    `config.json`.

    :param reports_path: directory of saved Report files
    :param save_path: currently unused by this function's visible body
    :param model_name: encoder selector; None picks the RoBERTa encoder
    :raises ValueError: if `model_name` is not one of the known names
    :return: the trained tagger
    """
    reports = []
    for file_name in iterate_reports(reports_path):
        report_path = os.path.join(reports_path, file_name)
        report = Report.load_report(report_path)
        # Keep only reports that have at least one positive frame label.
        if report.frames:
            if sum(frame.meta["label"] for frame in report.frames) > 0:
                reports.append(report)

    # (Removed a no-op `reports = reports` self-assignment here.)
    target = make_target(reports)

    with open("config.json", "r") as f:
        config = json.load(f)

    model_names = ["scuffle", "deep_analyze", "code2seq"]

    if model_name is not None:
        if model_name == "scuffle":
            encoder = ScuffleReportEncoder(**config["models"]["scuffle"]["encoder"]).fit(reports, target)
            tagger = LstmTagger(
                encoder,
                max_len=config["training"]["max_len"],
                **config["models"]["scuffle"]["tagger"]
            )
        elif model_name == "deep_analyze":
            encoder = TfIdfReportEncoder(**config["models"]["deep_analyze"]["encoder"]).fit(reports, target)
            tagger = LstmTagger(
                encoder,
                max_len=config["training"]["max_len"],
                **config["models"]["deep_analyze"]["tagger"]
            )
        elif model_name == "code2seq":
            config_path = config["code2seq_config_path"]
            # NOTE(review): cli_path reads the same config key as
            # ast_config_path — looks like a copy-paste slip (a dedicated
            # CLI/astminer binary key was probably intended). Confirm
            # against the config schema before changing.
            cli_path = config["astminer_config_path"]
            ast_config_path = config["astminer_config_path"]

            code2seq_cfg = cast(DictConfig, OmegaConf.load(config_path))

            code2seq = Code2Seq.load_from_checkpoint(code2seq_cfg.checkpoint, map_location=torch.device("cpu"))

            storage = LabeledPathContextStorage(cli_path, ast_config_path, code2seq.vocabulary, code2seq_cfg,
                                                **config["code2seq_storage"])

            storage.load_data(reports, mine_files=False, process_mined=False, remove_all=False)

            encoder = Code2SeqReportEncoder(code2seq, storage)

            tagger = LstmTagger(
                encoder,
                max_len=config["training"]["max_len"],
                layers_num=2,
                hidden_dim=200
            )

        else:
            raise ValueError(f"Wrong model type. Should be in {model_names}")
    else:
        # Default: RoBERTa-based frame encoder on GPU.
        encoder = RobertaReportEncoder(frames_count=config["training"]["max_len"], device='cuda')

        tagger = LstmTagger(
            encoder,
            max_len=config["training"]["max_len"],
            layers_num=2,
            hidden_dim=250
        )

    tagger = train_lstm_tagger(tagger, reports, target, **config["training"])

    return tagger
Example #11
0
def read_report(report_path: str) -> Tuple[Report, List[int]]:
    """Load a report from disk together with its per-frame label targets."""
    loaded_report = Report.load_report(report_path)
    labels = [frame.meta["label"] for frame in loaded_report.frames]
    return loaded_report, labels