Example #1
def test_final_ds():
    acs = list(Path("data/labels").glob("*.json"))
    final_dsp = "out/standardDsTmp.zip"
    with TemporaryDirectory() as temp_dir:
        _, _, ac_dsps = replay_ac(acs, temp_dir)
        rawdata = load_datazip(ac_dsps[-1])
        rawdata_ref = load_datazip(final_dsp)
        assert rawdata.y_test_tags == rawdata_ref.y_test_tags
Example #2
def test_split_ref():
    final_dsp = "out/standardDsTmp.zip"
    src = "out/standard"
    with TemporaryDirectory() as temp_dir:
        dsps = mk_standardDs(final_dsp, temp_dir, plot=False)
        for idx, dsp in enumerate(dsps):
            rawdata = load_datazip(dsp)
            ref_rawdata = load_datazip(f"{src}/standardDs{idx}.zip")
            assert rawdata.y_test_tags == ref_rawdata.y_test_tags
Example #3
def train_eval(
    dsp,
    output_p,
    upsample=200,
    keep_key=True,
    over=5,
    cv=True,
):
    max_len = 150
    marker = Path(dsp).stem
    fn = f"{output_p}/{marker}tag_stat.csv"
    ds = load_datazip(dsp)
    tag_stat = rawdata_stat(ds)
    tag_stat.to_csv(fn)

    mlb = MultiLabelBinarizer().fit(ds.y_tags)
    if not cv:
        single_train_eval(ds, max_len, upsample, keep_key, over, mlb, output_p,
                          marker)
    else:
        for idx, tmp_ds in enumerate(kf_flow(ds)):
            single_train_eval(
                tmp_ds,
                max_len,
                upsample,
                keep_key,
                over,
                mlb,
                output_p,
                marker + f"cv{idx}",
            )
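
# A minimal usage sketch (hypothetical output path): cross-validated training
# and evaluation on one of the standard splits, writing per-tag statistics
# alongside the metrics.
# train_eval("out/standard/standardDs0.zip", "output", upsample=200, cv=True)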
Example #4
def form_random_ds(
    standard_dsps: List[str],
    eval_ret="mona_j.csv",
    unlabelled_p="unlabelled.json",
    outdir=".",
    train_size=400,
):
    df = pd.read_csv(eval_ret).drop_duplicates(subset=["ID", "Judge"],
                                               keep="last")
    indices = df["ID"].to_list()
    sampled_cases = load_json(unlabelled_p)
    add_text = [sampled_cases[idx] for idx in indices]
    y_true_ = df["eval"].map(lambda x: x.split(", ")).to_list()
    dsps = []
    for i, base_path in enumerate(standard_dsps):
        ds = load_datazip(base_path)
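        # replace the training split with a random sample of the judged
        # cases; the original test split is kept unchanged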
        random_idx = random.sample(list(range(len(indices))), train_size)
        x_train_dict = [add_text[idx] for idx in random_idx]
        y_train_tags = [y_true_[idx] for idx in random_idx]
        ds.x_train_dict = x_train_dict
        ds.y_train_tags = y_train_tags
        ds.x_dict = x_train_dict + ds.x_test_dict
        ds.y_tags = y_train_tags + ds.y_test_tags

        dsp = dump_datazip(ds, f"{outdir}/randomDs{i}.zip")
        dsps.append(dsp)
        print(dsp)
    return dsps
Example #5
def test_random_ds():
    standard_dsps = [f"out/standard/standardDs{idx}.zip" for idx in range(3)]
    eval_ret = "data/evaluation/mona_j.csv"
    unlabelled_p = "data/unlabelled.json"
    src = "out/random"
    with TemporaryDirectory() as temp_dir:
        dsps = mk_randomDs(
            standard_dsps,
            eval_ret=eval_ret,
            unlabelled_p=unlabelled_p,
            dst=temp_dir,
            plot=False,
        )
        for idx, dsp in enumerate(dsps):
            rawdata = load_datazip(dsp)
            ref_rawdata = load_datazip(f"{src}/randomDs{idx}.zip")
            assert rawdata.y_test_tags == ref_rawdata.y_test_tags
Example #6
def mk_standardDs(final_dsp="standardDsTmp.zip", dst="out/standard", plot=False):
    dsps = dataset_split(final_dsp, dst)
    if plot:
        for idx, dsp in enumerate(dsps):
            rawdata = load_datazip(dsp)
            tag_stat = rawdata_stat(rawdata)
            tag_stat.to_csv(f"{dst}/data_stat{idx}.csv")
            fig = plot_tag_stat(tag_stat)
            fig.write_image(f"{dst}/data_stat{idx}.pdf")
    return dsps
Example #7
def test_prediction():
    model = StandaloneModel.from_path("TagModel", keep_key=False, max_len=100)
    cases = load_json("data/eval.json")
    ds = load_datazip("data/dataset.zip")
    mlb = MultiLabelBinarizer().fit(ds.y_tags)
    indexes = [9, 15]
    used = [cases[idx]["text"] for idx in indexes]
    prob = model.predict_prob(used, mlb)
    tags = model.predict_tags(used, mlb)
    pred_out = mlb.transform(tags)
    build_eval_json(used, prob, pred_out)
Example #8
def dataset_split(final_dsp, dst="."):
    ds = load_datazip(final_dsp)
    standard_dsps = []
    for i in range(3):
        output = f"{dst}/standardDs{i}.zip"
        split_and_dump_dataset(ds.x_dict,
                               ds.y_tags,
                               train_first=False,
                               output=output)
        standard_dsps.append(output)
    return standard_dsps
Example #9
def main(
    dataset_p: str,
    unlabelled_p: str,
    model_p: str,
    outdir: str,
    plot=True,
    train=False,
):
    if train:
        ds = load_datazip(dataset_p)
        train_main_model(ds, model_p=model_p)
    if plot:
        make_figures(model_p, dataset_p, unlabelled_p, dst=outdir)
Example #10
def mk_randomDs(
    standard_dsps,
    eval_ret="data/evaluation/mona_j.csv",
    unlabelled_p="data/unlabelled.json",
    dst="out/random",
    plot=False,
):
    dsps = form_random_ds(standard_dsps, eval_ret, unlabelled_p, outdir=dst)
    if plot:
        for idx, dsp in enumerate(dsps):
            rawdata = load_datazip(dsp)
            tag_stat = rawdata_stat(rawdata)
            tag_stat.to_csv(f"{dst}/random_data_stat{idx}.csv")
            fig = plot_tag_stat(tag_stat)
            fig.write_image(f"{dst}/random_data_stat{idx}.pdf")
    return dsps
Example #11
def run_experiment(
    output_p="output",
    dataset_path="stdDs.zip",
    upsample=200,
    keep_key=True,
    over=None,
    step=50,
    metrics_only=False,
):
    if over is None:
        over = -1 if upsample == -1 else 5
    output_p = f"{output_p}/{'keepKey_' if keep_key else 'noKey_'}{upsample}"
    os.makedirs(output_p, exist_ok=True)
    ds = load_datazip(dataset_path)
    mlb = MultiLabelBinarizer().fit(ds.y_tags)
    max_len = 150

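    # learning curve: train and evaluate on progressively larger slices of
    # the training set, `step` examples at a time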
    for size in range(step, len(ds.x_train_dict) + 1, step):
        if metrics_only:
            model = StandaloneModel.from_path(f"{output_p}/model",
                                              keep_key=keep_key,
                                              max_len=max_len)
            eval_model(model, ds, over, mlb, output_p, size)
        else:
            part_ds = slice_dataset(ds, size)

            params = Params(part_ds, max_len, upsample, 0.5,
                            "bert-base-uncased", keep_key, 10, mlb)
            print(len(part_ds.x_train_dict))
            pipeline = Pipeline(params)
            model_p = pipeline.train(output_dir=output_p)
            model = StandaloneModel(pipeline.model,
                                    pipeline.tokenizer,
                                    keep_key=keep_key,
                                    max_len=max_len)
            eval_model(model, part_ds, over, mlb, output_p, size)
            if size >= len(ds.x_train_dict):
                pipeline.trainer.save_model(f"{output_p}/model")
            del pipeline
            shutil.rmtree(model_p)
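        # free the model and cached CUDA memory before the next iteration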
        del model
        gc.collect()
        with torch.no_grad():
            torch.cuda.empty_cache()
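
# A hedged usage sketch (paths hypothetical): sweep training-set sizes in
# steps of 50 examples, training a fresh model and logging metrics per size.
# run_experiment(output_p="output",
#                dataset_path="out/standard/standardDs0.zip",
#                upsample=200, keep_key=True, step=50)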
Example #12
    def _add_training(
        self,
        eval_ret: str,
        base_path,
        idx_marker=1,
    ):
        outdir = self.outdir
        unlabelled_p = self.unlabelled_p
        ds = load_datazip(base_path)
        df = pd.read_csv(eval_ret).drop_duplicates(subset=["ID", "Judge"],
                                                   keep="last")
        indices = df["ID"].to_list()
        sampled_cases = load_json(unlabelled_p)
        add_texts = [sampled_cases[idx] for idx in indices]
        y_true_ = df["eval"].map(lambda x: x.split(", ")).to_list()

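        # extend both the training split and the combined set with the
        # newly judged cases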
        ds.x_train_dict = ds.x_train_dict + add_texts
        ds.y_train_tags = ds.y_train_tags + y_true_
        ds.x_dict = ds.x_dict + add_texts
        ds.y_tags = ds.y_tags + y_true_
        dsp = dump_datazip(ds, f"{outdir}/dataset{idx_marker}.zip")
        print(dsp)
        return ds
Example #13
    def __init__(
        self,
        init_model_p,
        eval_ret="mona_j.csv",
        dataset_p="stdDs.zip",
        ori_eval_p="outputsS/eval.json",
        unlabelled_p="outputsK/unlabelled.json",
        outdir="feedbackM",
    ):
        self.init_model_p = init_model_p
        self.eval_ret = eval_ret
        self.dataset_p = dataset_p
        self.ori_eval_p = ori_eval_p
        self.unlabelled_p = unlabelled_p
        self.outdir = outdir

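        # seed the feedback directory with the starting dataset and its
        # initial evaluation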
        Path(self.outdir).mkdir(exist_ok=True, parents=True)
        shutil.copyfile(self.dataset_p, f"{outdir}/dataset0.zip")
        shutil.copyfile(self.ori_eval_p, f"{outdir}/eval0.json")
        self.ds = load_datazip(self.dataset_p)
        self.mlb = MultiLabelBinarizer().fit(self.ds.y_tags)
        self.df = pd.read_csv(eval_ret).drop_duplicates(subset=["ID", "Judge"],
                                                        keep="last")
Example #14
def form_eval(
    model,
    mlb,
    unlabelled_p="outputsK/unlabelled.json",
    outdir="outputs",
    over=5,
    marker="",
    skip_state=False,
):
    sampled_cases = load_json(unlabelled_p)
    if not skip_state:
        sampled_state = get_unlabelled_state(model, sampled_cases, mlb)
        dump_state(sampled_state, state_p=f"{outdir}/unstate{marker}.pkl")
        unstate_df = dimension_reduction(sampled_state, "TSNE", n_components=2)
        unstate_df.to_csv(f"{outdir}/unlabel_tsne{marker}.csv")
        fig = state_plot(unstate_df, 12)
        fig.write_image(f"{outdir}/unlabelled_TSNE{marker}.pdf")
        fig.write_html(f"{outdir}/unlabel_tsne{marker}.html")
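    # predict over the unlabelled pool, threshold the probabilities into tag
    # assignments, and export an evaluation JSON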
    preds = model.over_predict(sampled_cases, n=over)
    thresh_items = label_output(preds)
    pred_prob = [list(zip(mlb.classes_, pred)) for pred in preds]
    eval_json = build_eval_json(sampled_cases, pred_prob, thresh_items)
    dump_json(f"{outdir}/eval{marker}.json", eval_json)


if __name__ == "__main__":
    ds = load_datazip("dataset.zip")
    mlb = MultiLabelBinarizer().fit(ds.y_tags)
    model_p = "lab4/keepKey_200/model"
    model = StandaloneModel.from_path(model_p, keep_key=True, max_len=150)
    form_eval(model, mlb)
Example #15
def sample_evaluation_from_path(cases_p, dsp):
    all_cases = load_json(cases_p)
    dataset = load_datazip(dsp)
    return sample_evaluation(all_cases, dataset)
Example #16
def make_figures(model_p: str, dsp: str, unlabelled_p: str, dst="figF"):
    dataset = load_datazip(dsp)
    model = StandaloneModel.from_path(model_p, keep_key=True, max_len=150)
    over = 5
    mlb = MultiLabelBinarizer().fit(dataset.y_tags)
    os.makedirs(dst, exist_ok=True)
    # Rawdata_stat
    fn = f"{dst}/data_stat.csv"
    if os.path.exists(fn):
        tag_stat = pd.read_csv(fn, index_col=0)
    else:
        tag_stat = rawdata_stat(dataset)
        tag_stat.to_csv(fn)
    fig = plot_tag_stat(tag_stat)
    fig.write_image(f"{dst}/data_stat.pdf")

    # Unlabelled
    fn = f"{dst}/unlabel_tsne.csv"
    if os.path.exists(fn):
        unstate_df = pd.read_csv(fn, index_col=0)
    else:
        sampled_cases = load_json(unlabelled_p)
        sampled_state = get_unlabelled_state(model, sampled_cases, mlb)
        dump_state(sampled_state, state_p=f"{dst}/unstate.pkl")
        unstate_df = dimension_reduction(sampled_state, "TSNE", n_components=2)
        unstate_df.to_csv(fn)
        preds = model.over_predict(sampled_cases, n=over)
        thresh_items = label_output(preds)
        pred_prob = [list(zip(mlb.classes_, pred)) for pred in preds]
        eval_json = build_eval_json(sampled_cases, pred_prob, thresh_items)
        dump_json(f"{dst}/eval.json", eval_json)
    fig = state_plot(unstate_df, 12)
    fig.write_image(f"{dst}/unlabel_tsne.pdf")
    fig.write_html(f"{dst}/unlabel_tsne.html")

    # Labelled
    fn = f"{dst}/label_tsne.csv"
    if os.path.exists(fn):
        state_df = pd.read_csv(fn, index_col=0)
    else:
        states = get_tag_states(model, dataset, mlb)
        state_df = dimension_reduction(states, "TSNE", n_components=2)
        state_df.to_csv(fn)
    fig = state_plot(state_df, 12)
    fig.write_image(f"{dst}/label_tsne.pdf")
    fig.write_html(f"{dst}/label_tsne.html")

    # Performance
    eval_model(model, dataset, over, mlb, dst, "")

    # Top key
    fn = f"{dst}/top_key.json"
    if os.path.exists(fn):
        top_key = load_json(fn)
    else:
        maskExplainer = MaskExplainer(mlb)
        top = top_keywords(maskExplainer, model, dataset.x_dict)
        top_key = filter_top(top, dataset.y_tags, thresh=20)
        dump_json(fn, top_key)
    fig = kw_plot(top_key)
    fig.write_image(f"{dst}/knockout_result.pdf")
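
# Usage sketch (paths taken from the examples above): rebuild every figure for
# a trained model; cached CSV/JSON intermediates in `dst` are reused if present.
# make_figures("TagModel", "data/dataset.zip", "data/unlabelled.json", dst="figF")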
Example #17
def test_upsampling(dsp):
    ds = load_datazip(dsp)
    new_x1, _ = upsampling(ds.x_train_dict, ds.y_train_tags, target=200)
    new_x2, _ = upsampling(ds.x_train_dict, ds.y_train_tags, target=-200)
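    # the assertion below expects a negative target to yield the same sample
    # count as the positive one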
    print(len(new_x1))
    assert len(new_x1) == len(new_x2)