Example #1
def load_labelprops(split):
    if split == "valid":
        split = "train"  # kfold valid and train are the same set
    return {
        issue: np.array(labelprops)
        for issue, labelprops in load_json(
            join(_LABELPROPS_DIR, f"{split}.json")).items()
    }
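A minimal usage sketch (the "immigration" key is hypothetical; the real keys are whatever issues appear in {split}.json):

labelprops = load_labelprops("train")
props = labelprops["immigration"]  # hypothetical issue key -> np.ndarray
assert np.isclose(props.sum(), 1.0)  # label proportions should sum to 1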
Example #2
def load_all_arxiv_abstract_samples(categories: List[str],
                                    split: str) -> List[DataSample]:
    assert split in ["train", "valid", "test"]

    samples = []
    for c in tqdm(categories):
        ids = load_json(join(DATA_DIR, "arxiv", "splits", f"{c}.{split}.json"))
        raw_data = load_json(join(DATA_DIR, "arxiv", f"{c}.json"))

        for id in ids:
            samples.append(
                DataSample(
                    id=id,
                    text=raw_data[id]["abstract"],
                    y_idx=year2yidx(raw_data[id]["year"]),
                    domain_name=c,
                    domain_idx=ARXIV_CATEGORY2IDX[c],
                ))
    return samples
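A hypothetical call, assuming "cs.CL" and "cs.LG" are among the configured categories:

samples = load_all_arxiv_abstract_samples(["cs.CL", "cs.LG"], "train")
print(len(samples), samples[0].domain_name, samples[0].y_idx)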
Example #3
def load_all_amazon_review_samples(categories: List[str],
                                   split: str) -> List[DataSample]:
    assert split in ["train", "valid", "test"]

    samples = []
    for c in tqdm(categories):
        ids = load_json(
            join(DATA_DIR, "amazon_subsampled", "splits", f"{c}.{split}.json"))
        raw_data = load_json(join(DATA_DIR, "amazon_subsampled", f"{c}.json"))

        for id in ids:
            samples.append(
                DataSample(
                    id=id,
                    text=raw_data[id]["reviewText"],
                    # rating=raw_data[id]["overall"],
                    y_idx=rating_to_ridx(raw_data[id]["overall"]),
                    domain_name=c,
                    domain_idx=CATEGORY2CIDX[c],
                ))
    return samples
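rating_to_ridx is defined elsewhere in the repo; a plausible sketch of what it might do, assuming 1-5 star ratings and 0-based label indices (an assumption, not the actual implementation):

def rating_to_ridx(rating: float) -> int:
    # Hypothetical mapping: 1.0-5.0 stars -> class index 0-4.
    return int(rating) - 1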
Example #4
def load_metrics(model_root):
    nsample2acc = {}
    nsample2accs = {}
    metrics = load_json(join(model_root, "mean_metrics.json"))
    for nsample in ROBERTA_ADAPT_N_SAMPLES:
        accs = []
        for source in _DATADEF.domain_names:
            accs.append(
                metrics[f"{nsample:04}_samples"][source]["mean"]["valid_f1.best"]
            )
        nsample2accs[nsample] = np.array(accs)
        nsample2acc[nsample] = nsample2accs[nsample].mean()
    return nsample2acc, nsample2accs
Example #5
def load_all_framing_samples(issues: List[str], split: str,
                             task: str) -> List[DataSample]:
    assert split in ["train", "test"]

    samples = []
    for issue in tqdm(issues):
        ids = load_json(
            join(DATA_DIR, "framing_labeled",
                 f"{issue}_{split}_sets.json"))[task]
        raw_data = load_json(
            join(DATA_DIR, "framing_labeled", f"{issue}_labeled.json"))

        for id in ids:
            samples.append(
                DataSample(
                    id=id,
                    text=remove_framing_text_headings(raw_data[id]["text"]),
                    y_idx=code_to_yidx(raw_data[id][task], task),
                    domain_name=issue,
                    domain_idx=ISSUE2IIDX[issue],
                ))
    return samples
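A hypothetical call ("immigration" stands in for whichever issues the dataset defines):

test_samples = load_all_framing_samples(
    ["immigration"], split="test", task="primary_frame")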
Example #6
def load_kfold_framing_samples(issues: List[str],
                               task: str) -> List[Dict[str, List[DataSample]]]:
    kidx2split2samples = [{"train": [], "valid": []} for _ in range(KFOLD)]

    samples = load_all_framing_samples(issues, split="train", task=task)
    for issue in tqdm(issues):
        kfold_data = load_json(
            join(DATA_DIR, "framing_labeled", f"{KFOLD}fold", f"{issue}.json"))
        for kidx, fold in enumerate(kfold_data[task]):
            for split in ["train", "valid"]:
                ids = set(fold[split])
                selected_samples = [s for s in samples if s.id in ids]
                kidx2split2samples[kidx][split].extend(selected_samples)
    return kidx2split2samples
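A sketch of consuming the returned folds (ISSUES and the per-fold training step are placeholders):

for kidx, split2samples in enumerate(
        load_kfold_framing_samples(ISSUES, task="primary_frame")):
    train_samples = split2samples["train"]
    valid_samples = split2samples["valid"]
    # train and evaluate one model per fold here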
Example #7
            accs.append(
                metrics[f"{nsample:04}_samples"][source]["mean"]["valid_f1.best"]
            )
        nsample2accs[nsample] = np.array(accs)
        nsample2acc[nsample] = nsample2accs[nsample].mean()
    return nsample2acc, nsample2accs


holdout_adapt_nsample2acc, holdout_adapt_nsample2accs = load_metrics(
    _HOLDOUT_ADAPT_MODEL_ROOT
)
from_scratch_nsample2acc, from_scratch_nsample2accs = load_metrics(
    _FROM_SCRATCH_MODEL_ROOT
)

holdout_source_metrics = load_json(join(_HOLDOUT_SOUCE_MODEL_ROOT, "mean_metrics.json"))
holdout_source_acc = np.array(
    [
        holdout_source_metrics[source]["mean"]["valid_f1.best"]
        for source in _DATADEF.domain_names
    ]
).mean()


_PLOT_SAVE_PATH = join(_SAVE_DIR, f"{_ROBERTA_ARCH}.png")
plt.clf()
plt.figure(figsize=(7, 5))
plt.plot(
    ROBERTA_ADAPT_N_SAMPLES,
    [holdout_adapt_nsample2acc[nsample] for nsample in ROBERTA_ADAPT_N_SAMPLES],
    marker="D",
)
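The excerpt cuts off mid-call; presumably the script goes on to plot the from-scratch curve, the holdout-source baseline, and save the figure. A minimal sketch of that continuation (labels and styling are assumptions):

plt.plot(
    ROBERTA_ADAPT_N_SAMPLES,
    [from_scratch_nsample2acc[n] for n in ROBERTA_ADAPT_N_SAMPLES],
    marker="o",
)
plt.axhline(holdout_source_acc, linestyle="--")
plt.xlabel("# adaptation samples")
plt.ylabel("mean valid accuracy")
plt.savefig(_PLOT_SAVE_PATH)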
Example #8
import glob
from os import makedirs
from os.path import basename, join, splitext
from random import Random

import pandas as pd
from config import DATA_DIR, RANDOM_SEED
from modapt.utils import ParallelHandler, load_json, save_json
from tqdm import tqdm

_TRAIN_PROP, _VALID_PROP, _TEST_PROP = [0.8, 0.1, 0.1]

_SRC_DATA_DIR = join(DATA_DIR, "sentiment")
_SPLITS_DIR = join(_SRC_DATA_DIR, "splits")

RNG = Random()
RNG.seed(RANDOM_SEED)

makedirs(_SPLITS_DIR, exist_ok=True)

raw_data_paths = sorted(glob.glob(join(_SRC_DATA_DIR, "*.json")))
for p in raw_data_paths:
    name = splitext(basename(p))[0]
    samples = load_json(p)
    ids = list(samples.keys())
    RNG.shuffle(ids)

    nsample = len(ids)
    n_train = int(nsample * _TRAIN_PROP)
    n_valid = int(nsample * _VALID_PROP)
    n_test = nsample - n_train - n_valid
    save_json(ids[:n_train], join(_SPLITS_DIR, f"{name}.train.json"))
    save_json(ids[n_train:n_train + n_valid],
              join(_SPLITS_DIR, f"{name}.valid.json"))
    save_json(ids[-n_test:], join(_SPLITS_DIR, f"{name}.test.json"))
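A quick sanity check on the written splits (assumes the loop above has just run, so name and nsample still refer to the last file):

train_ids = set(load_json(join(_SPLITS_DIR, f"{name}.train.json")))
valid_ids = set(load_json(join(_SPLITS_DIR, f"{name}.valid.json")))
test_ids = set(load_json(join(_SPLITS_DIR, f"{name}.test.json")))
assert train_ids.isdisjoint(valid_ids) and train_ids.isdisjoint(test_ids)
assert valid_ids.isdisjoint(test_ids)
assert len(train_ids | valid_ids | test_ids) == nsample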
Example #9
from os.path import join

import pandas as pd
from config import DATA_DIR
from experiments.datadef import zoo
from modapt.utils import load_json, save_json

FRAMING_DATA_DIR = join(DATA_DIR, "framing_labeled")
ISSUES = zoo.get_datadef("framing").domain_names

if __name__ == "__main__":
    stats = []

    for issue in ISSUES:
        print(">>", issue)
        data = load_json(join(FRAMING_DATA_DIR, f"{issue}_labeled.json"))
        ids = list(data.keys())

        testsets = load_json(join(FRAMING_DATA_DIR, f"{issue}_test_sets.json"))
        testsets = {setname: set(sids) for setname, sids in testsets.items()}

        trainsets = {}
        # relevance train set: any sample not in the relevance test set
        trainsets["relevance"] = list(
            {id for id in data if id not in testsets["relevance"]})

        # primary frame train set: any sample not in the primary frame test
        # set that has a non-null primary frame
        trainsets["primary_frame"] = list({
            id
            for id in data
            if (id not in testsets["primary_frame"]
                and data[id]["primary_frame"] is not None)
        })
Example #10
    if arch.startswith("roberta"):
        valid_roberta_model(arch)
    if arch.startswith("logreg"):
        valid_logreg_model(arch)

_OUTPUT_SAVE_DIR = join(OUTPUT_DIR, "power_analysis")
makedirs(_OUTPUT_SAVE_DIR, exist_ok=True)

# build tables
holdout_source_to_table = {}
fulltable = np.zeros((2, 2))
for holdout_source in _DATADEF.domain_names:
    table = np.zeros((2, 2))
    arch1_preds = load_json(
        join(
            _get_model_dir(_ARCH1),
            f"{_VALID_OR_TEST}_preds",
            f"{holdout_source}.json",
        ))
    arch2_preds = load_json(
        join(
            _get_model_dir(_ARCH2),
            f"{_VALID_OR_TEST}_preds",
            f"{holdout_source}.json",
        ))
    ids = list(arch1_preds.keys())
    for id in ids:
        arch1_correct = arch1_preds[id]["correct"]
        arch2_correct = arch2_preds[id]["correct"]
        if arch1_correct and arch2_correct:
            table[0][0] += 1
            fulltable[0][0] += 1
        elif arch1_correct and not arch2_correct:
            table[0][1] += 1
            fulltable[0][1] += 1
        elif arch2_correct and not arch1_correct:
            table[1][0] += 1
            fulltable[1][0] += 1
        else:
            table[1][1] += 1
            fulltable[1][1] += 1
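With the contingency tables completed, a paired significance test can compare the two architectures; a minimal sketch assuming statsmodels is available (the original script's actual test is not shown here):

from statsmodels.stats.contingency_tables import mcnemar

result = mcnemar(fulltable, exact=True)
print(f"statistic={result.statistic}, p-value={result.pvalue}")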
Example #11
def load_labelprops(split):
    return {
        issue: np.array(labelprops)
        for issue, labelprops in load_json(
            join(_LABELPROPS_DIR, f"{split}.json")).items()
    }
Example #12
        ) for source in _DATADEF.domain_names
    ])


for datasetname in _DATASETS:
    rows = {}
    _DATADEF = get_datadef(datasetname)

    source2labelprops = _DATADEF.load_labelprops_func(
        "train" if "test" not in _METRIC_FILENAME else "test")
    # acc if we naively choose the most common label for each source
    most_common_acc = np.array([a.max()
                                for a in source2labelprops.values()]).mean()
    rows["most_common"] = {"acc": round(most_common_acc, 3), "delta_std": "-"}

    lexicon_metrics = load_json(
        join(LEXICON_DIR, datasetname, _EXP_NAME, _METRIC_FILENAME))
    lexicon_base_accs = np.array([
        lexicon_metrics[_LEXICON_BASE_ARCH][source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names
    ])
    rows[_LEXICON_BASE_ARCH] = {
        "acc": round(lexicon_base_accs.mean(), 3),
        "delta_std": "-",
    }
    for arch in _LEXICON_ARCHS:
        accs = np.array([
            lexicon_metrics[arch][source]["mean"]["valid_f1"]
            for source in _DATADEF.domain_names
        ])
        delta = accs - lexicon_base_accs
        rows[arch] = {
            "acc": round(accs.mean(), 3),
            "delta_std": round(delta.std(), 3),
        }
Example #13
    return [[firsthalf, secondhalf], [secondhalf, firsthalf]]


# load samples, shuffle once, use this seeded shuffle order for all evals

source2samples = {}
for source in _DATADEF.domain_names:
    source2samples[source] = _DATADEF.load_splits_func([source], ["train"])["train"]
    _RNG.shuffle(source2samples[source])

# lexicon model predicting with gt & estimated labelprops

_LEXICON_MODEL_PERFORMANCE_SAVE_PATH = join(_SAVE_DIR, f"{_LEXICON_ARCH}.json")

if not exists(_LEXICON_MODEL_PERFORMANCE_SAVE_PATH):
    orig_metrics = load_json(join(_LEXICON_MODEL_ROOT, "mean_metrics.json"))
    gt_source2acc = {
        source: orig_metrics[source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names
    }
    gt_source2acc["mean"] = np.array(list(gt_source2acc.values())).mean()
    gt_source2acc["std"] = np.array(list(gt_source2acc.values())).std()

    notechnique_metrics = load_json(
        join(
            LEXICON_DIR, _DATASET_NAME, "holdout_source", "logreg", "mean_metrics.json"
        )
    )
    notechnique_source2acc = {
        source: notechnique_metrics[source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names
    }