Example #1
def valid_roberta_model(arch):
    print(">>", arch)

    model_dir = join(MODELS_DIR, _DATASET_NAME, "holdout_source", arch)
    save_preds_dir = join(model_dir, f"{_VALID_OR_TEST}_preds")
    makedirs(save_preds_dir, exist_ok=True)

    for holdout_source in _DATADEF.domain_names:
        print(">>>>", holdout_source)
        save_preds_path = join(save_preds_dir, f"{holdout_source}.json")
        if exists(save_preds_path):
            continue

        # validate on all samples from the holdout source
        valid_samples = _DATADEF.load_splits_func(
            [holdout_source], [_LOAD_SPLIT_NAME])[_LOAD_SPLIT_NAME]
        valid_dataset = RobertaDataset(
            valid_samples,
            n_classes=_DATADEF.n_classes,
            domain_names=_DATADEF.domain_names,
            source2labelprops=_DATADEF.load_labelprops_func(_LOAD_SPLIT_NAME),
        )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=150,
            shuffle=True,
            num_workers=6,
        )

        checkpoint_path = join(model_dir, holdout_source, "checkpoint.pth")
        model = torch.load(checkpoint_path).to(AUTO_DEVICE)
        model.eval()

        id2results = {}
        with torch.no_grad():
            for batch in tqdm(valid_loader):
                outputs = model(batch)
                logits = outputs["logits"].detach().cpu().numpy()
                preds = np.argmax(logits, axis=1)
                labels = outputs["labels"].detach().cpu().numpy()
                ids = batch["id"]
                for id, pred, label in zip(ids, preds, labels):
                    id2results[id] = {
                        "pred": int(pred),
                        "label": int(label),
                        "correct": bool(pred == label),
                    }
        save_json(id2results, save_preds_path)
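
# --- hedged usage sketch (not part of the original file) ---
# The per-source JSON written above maps id -> {"pred", "label", "correct"}, so a
# holdout accuracy can be recovered from it; load_json is the counterpart of
# save_json in modapt.utils.
def _holdout_accuracy(preds_path):
    from modapt.utils import load_json
    results = load_json(preds_path)
    return sum(r["correct"] for r in results.values()) / len(results)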
Example #2
def valid_logreg_model(arch):
    print(">>", arch)
    config = load_logreg_model_config_all_archs(_DATADEF.n_classes,
                                                _DATADEF.n_sources)[arch]

    model_dir = join(LEXICON_DIR, _DATASET_NAME, "holdout_source", arch)
    save_preds_dir = join(model_dir, f"{_VALID_OR_TEST}_preds")
    makedirs(save_preds_dir, exist_ok=True)

    for holdout_source in _DATADEF.domain_names:
        print(">>>>", holdout_source)
        save_preds_path = join(save_preds_dir, f"{holdout_source}.json")
        if exists(save_preds_path):
            continue

        # validate on all samples from the holdout source
        valid_samples = _DATADEF.load_splits_func(
            [holdout_source], [_LOAD_SPLIT_NAME])[_LOAD_SPLIT_NAME]
        model = torch.load(join(model_dir, holdout_source, "model.pth"))
        batch = build_bow_full_batch(
            valid_samples,
            _DATADEF,
            get_all_tokens(valid_samples),
            read_txt_as_str_list(join(model_dir, holdout_source, "vocab.txt")),
            use_source_individual_norm=config["use_source_individual_norm"],
            labelprop_split=_LOAD_SPLIT_NAME,
        )

        model.eval()
        with torch.no_grad():
            outputs = model(batch)

        logits = outputs["logits"].detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)
        labels = outputs["labels"].detach().cpu().numpy()
        ids = [s.id for s in valid_samples]

        id2results = {}
        for id, pred, label in zip(ids, preds, labels):
            id2results[id] = {
                "pred": int(pred),
                "label": int(label),
                "correct": bool(pred == label),
            }
        save_json(id2results, save_preds_path)
Example #3
def process_category(p):
    category_name = basename(p).split("_5")[0]
    # print(category_name)

    with gzip.open(p, "r") as g:
        lines = [l for l in g]

    n_lines = len(lines)
    n_samples_to_keep = int(n_lines * _SUBSAMPLE_PROP)
    samples = {}

    while len(samples) < n_samples_to_keep:
        idx = randint(0, n_lines - 1)  # randint's upper bound is inclusive; avoid an IndexError
        l = lines[idx]
        s = json.loads(l)
        if not all(k in s for k in _KEEP_KEYS):
            continue
        sample = {k: s[k] for k in _KEEP_KEYS}
        sample_id = f"{s['asin']}.{s['reviewerID']}"
        samples[sample_id] = sample

    save_json(samples, join(_DST_DATA_DIR, f"{category_name}.json"))

    all_sample_ids = list(samples.keys())
    shuffle(all_sample_ids)
    n_train = int(n_samples_to_keep * _TRAIN_PROP)
    n_valid = int(n_samples_to_keep * _VALID_PROP)
    n_test = n_samples_to_keep - n_train - n_valid
    save_json(all_sample_ids[:n_train],
              join(_SPLITS_DIR, f"{category_name}.train.json"))
    save_json(
        all_sample_ids[n_train:n_train + n_valid],
        join(_SPLITS_DIR, f"{category_name}.valid.json"),
    )
    save_json(all_sample_ids[-n_test:],
              join(_SPLITS_DIR, f"{category_name}.test.json"))
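
# --- hedged usage sketch (not in the original fragment) ---
# process_category is presumably invoked once per raw "<category>_5.json.gz" dump;
# the _RAW_DATA_DIR name and the glob pattern below are assumptions for illustration.
import glob
for _raw_path in sorted(glob.glob(join(_RAW_DATA_DIR, "*_5.json.gz"))):
    process_category(_raw_path)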
Example #4
    load_all_framing_samples,
)
from modapt.dataset.common import calculate_labelprops
from modapt.utils import save_json

makedirs(_LABELPROPS_DIR, exist_ok=True)

# primary frame
for split in ["train", "test"]:
    samples = load_all_framing_samples(ISSUES, split, "primary_frame")
    source2labelprops = calculate_labelprops(samples, len(PRIMARY_FRAME_NAMES),
                                             ISSUES)
    save_json(
        {
            issue: labelprops.tolist()
            for issue, labelprops in source2labelprops.items()
        },
        join(_LABELPROPS_DIR, f"primary_frame.{split}.json"),
    )

# primary tone
for split in ["train", "test"]:
    samples = load_all_framing_samples(ISSUES, split, "primary_tone")
    source2labelprops = calculate_labelprops(samples, len(PRIMARY_TONE_NAMES),
                                             ISSUES)
    save_json(
        {
            issue: labelprops.tolist()
            for issue, labelprops in source2labelprops.items()
        },
        join(_LABELPROPS_DIR, f"primary_tone.{split}.json"),
    )
Example #5
for holdout_source in _DATADEF.domain_names:
    print(">>", holdout_source)
    logdir = join(_SAVE_DIR, holdout_source)
    makedirs(logdir, exist_ok=True)

    # evaluate on all train-split samples from the holdout source
    valid_samples = _DATADEF.load_splits_func([holdout_source],
                                              ["train"])["train"]

    num_correct = 0
    for s in tqdm(valid_samples):
        text = " ".join(get_tokens(s.text))
        score = _ANALYZER.polarity_scores(text)["compound"]
        is_correct = (
            (score > 0 and s.y_idx == 1)
            or (score < 0 and s.y_idx == 0)
            or (score == 0 and RNG.uniform(0, 1) > 0.5)  # break ties at random
        )
        if is_correct:
            num_correct += 1
    acc = num_correct / len(valid_samples)

    # this baseline only computes accuracy, so the same value is stored under
    # every summary-metric key
    metrics = {
        "valid_f1": acc,
        "valid_precision": acc,
        "valid_recall": acc,
    }
    save_json(metrics, join(logdir, "leaf_metrics.json"))

reduce_and_save_metrics(dirname(_SAVE_DIR))
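
# --- hedged helper sketch (not in the original): the score-to-label rule used above ---
def _vader_compound_to_label(compound_score):
    # positive compound -> class 1, negative -> class 0, an exact zero is broken at random
    if compound_score > 0:
        return 1
    if compound_score < 0:
        return 0
    return int(RNG.uniform(0, 1) > 0.5)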
Example #6
_SUBSAMPLE_SIZE = 10000
_POLARITY_TO_LABEL = {
    "positive": "pos",
    # "neutral": "pos",  # call neutral positive for balance
    "negative": "neg",
}

df = pd.read_csv(join(DATA_DIR, "sentiment", "raw", "airline", "Tweets.csv"))

idxs = RNG.sample(range(len(df)), _SUBSAMPLE_SIZE)

dataset_dict = {}

for idx in tqdm(idxs):
    row = df.iloc[idx]
    # positional columns in this CSV: 0 = tweet_id, 1 = airline_sentiment, 10 = text
    text = row.iloc[10]
    polarity = row.iloc[1]
    if polarity not in _POLARITY_TO_LABEL:
        continue
    polarity = _POLARITY_TO_LABEL[polarity]
    tweet_id = row.iloc[0]

    print(tweet_id, polarity, text)

    new_id = f"airline.{tweet_id}"

    dataset_dict[new_id] = {"id": new_id, "text": text, "polarity": polarity}

save_json(dataset_dict, join(DATA_DIR, "sentiment", "airline.json"))
Example #7
    use_lemmatize = config["use_lemmatize"]

    metrics = {}

    # run validation set
    valid_metrics = eval_lexicon_model(
        model=model,
        datadef=_DATADEF,
        valid_samples=valid_samples,
        vocab=vocab,
        use_source_individual_norm=use_source_individual_norm,
        use_lemmatize=use_lemmatize,
        labelprop_split="train",
    )
    metrics.update(valid_metrics)
    save_json(metrics, join(logdir, "leaf_metrics.json"))
    write_str_list_as_txt(vocab, join(logdir, "vocab.txt"))
    torch.save(model, join(logdir, "model.pth"))

    # run test set
    test_samples = _DATADEF.load_splits_func([holdout_source],
                                             ["test"])["test"]
    test_metrics = eval_lexicon_model(
        model,
        _DATADEF,
        test_samples,
        vocab,
        use_source_individual_norm=config["use_source_individual_norm"],
        use_lemmatize=False,
        labelprop_split="test",
    )
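
    # --- hedged note (not in the original): the excerpt is cut off here; by the
    # leaf_test.json convention used elsewhere in these examples, test_metrics is
    # presumably written out next, e.g. save_json(test_metrics, join(logdir, "leaf_test.json")).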
Example #8
RNG = Random()
RNG.seed(RANDOM_SEED)

_SUBSAMPLE_SIZE = 10000
_POLARITY_TO_LABEL = {"positive": "pos", "negative": "neg"}

df = pd.read_csv(join(DATA_DIR, "sentiment", "raw", "imdb", "IMDB Dataset.csv"))

idxs = RNG.sample(range(len(df)), _SUBSAMPLE_SIZE)

dataset_dict = {}

for idx in tqdm(idxs):
    row = df.iloc[idx]
    # positional columns in this CSV: 0 = review text, 1 = sentiment label
    text = row.iloc[0]
    polarity = row.iloc[1]
    polarity = _POLARITY_TO_LABEL[polarity]

    hasher = hashlib.sha1(text.encode())
    review_id = base64.urlsafe_b64encode(hasher.digest()[:6]).decode()

    print(review_id, polarity, text)

    new_id = f"imdb.{review_id}"

    dataset_dict[new_id] = {"id": new_id, "text": text, "polarity": polarity}

save_json(dataset_dict, join(DATA_DIR, "sentiment", "imdb.json"))
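
# --- hedged helper sketch (not in the original): the short review-id scheme used above ---
import base64
import hashlib

def _short_review_id(text):
    # URL-safe base64 of the first 6 bytes of the SHA-1 digest -> an 8-character id
    digest = hashlib.sha1(text.encode()).digest()
    return base64.urlsafe_b64encode(digest[:6]).decode()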
Example #9
import glob
from os import makedirs
from os.path import basename, join, splitext
from random import Random

import pandas as pd
from config import DATA_DIR, RANDOM_SEED
from modapt.utils import ParallelHandler, load_json, save_json
from tqdm import tqdm

_TRAIN_PROP, _VALID_PROP, _TEST_PROP = [0.8, 0.1, 0.1]

_SRC_DATA_DIR = join(DATA_DIR, "sentiment")
_SPLITS_DIR = join(_SRC_DATA_DIR, "splits")

RNG = Random()
RNG.seed(RANDOM_SEED)

makedirs(_SPLITS_DIR, exist_ok=True)

raw_data_paths = sorted(glob.glob(join(_SRC_DATA_DIR, "*.json")))
for p in raw_data_paths:
    name = splitext(basename(p))[0]
    samples = load_json(p)
    ids = list(samples.keys())
    RNG.shuffle(ids)

    nsample = len(ids)
    n_train = int(nsample * _TRAIN_PROP)
    n_valid = int(nsample * _VALID_PROP)
    n_test = nsample - n_train - n_valid
    save_json(ids[:n_train], join(_SPLITS_DIR, f"{name}.train.json"))
    save_json(ids[n_train:n_train + n_valid],
              join(_SPLITS_DIR, f"{name}.valid.json"))
    save_json(ids[-n_test:], join(_SPLITS_DIR, f"{name}.test.json"))
Example #10
    # 3: "neg",
    4: "pos",
    5: "pos",
}

idxs = set(RNG.sample(range(_TOTAL_SAMPLES), _SUBSAMPLE_SIZE))
samples = []

g = gzip.open(_PATH, "r")
for i, l in enumerate(tqdm(g)):
    if i > _TOTAL_SAMPLES:
        break
    if i in idxs:
        samples.append(eval(l))

dataset_dict = {}

for sample in tqdm(samples):

    text = sample["reviewText"]
    rating = int(sample["overall"])
    if rating not in _RATING_TO_LABEL:
        continue

    polarity = _RATING_TO_LABEL[rating]
    new_id = f"amazon.{sample['asin']}-{sample['reviewerID']}"

    dataset_dict[new_id] = {"id": new_id, "text": text, "polarity": polarity}

save_json(dataset_dict, join(DATA_DIR, "sentiment", "amazon.json"))
Example #11
            model,
            _DATADEF,
            train_samples,
            vocab,
            use_source_individual_norm,
            use_lemmatize,
            labelprop_split="train",
        )
        metrics.update(train_metrics)

        # run validation set
        valid_metrics = eval_lexicon_model(
            model=model,
            datadef=_DATADEF,
            valid_samples=valid_samples,
            vocab=vocab,
            use_source_individual_norm=use_source_individual_norm,
            use_lemmatize=use_lemmatize,
            labelprop_split="train",
        )
        metrics.update(valid_metrics)
        save_json(metrics, join(trial_logdir, "leaf_metrics.json"))
        write_str_list_as_txt(vocab, join(trial_logdir, "vocab.txt"))
        torch.save(model, join(trial_logdir, "model.pth"))


save_json(config, join(_SAVE_DIR, "config.json"))

reduce_and_save_metrics(dirname(_SAVE_DIR))
reduce_and_save_metrics(dirname(_SAVE_DIR), "leaf_test.json", "mean_test.json")
Example #12
    }

# count per years per category
cat2years = defaultdict(list)
for cat, id2sample in cat2id2sample.items():
    for sample in id2sample.values():
        cat2years[cat].append(sample["year"])
fig, axs = plt.subplots(nrows=1,
                        ncols=len(cat2years),
                        figsize=(5 * len(cat2years), 5))
for ax, cat in zip(axs, cat2years):
    ax.hist(cat2years[cat])
    ax.set_title(cat)
plt.savefig(join(_DST_DATA_DIR, "years.png"))

makedirs(_SPLITS_DIR, exist_ok=True)
for cat, id2sample in cat2id2sample.items():
    save_json(id2sample, join(_DST_DATA_DIR, f"{cat}.json"))

    all_ids = list(id2sample.keys())
    shuffle(all_ids)
    n_train = int(len(all_ids) * _TRAIN_PROP)
    n_valid = int(len(all_ids) * _VALID_PROP)
    n_test = len(all_ids) - n_train - n_valid
    save_json(all_ids[:n_train], join(_SPLITS_DIR, f"{cat}.train.json"))
    save_json(
        all_ids[n_train:n_train + n_valid],
        join(_SPLITS_DIR, f"{cat}.valid.json"),
    )
    save_json(all_ids[-n_test:], join(_SPLITS_DIR, f"{cat}.test.json"))
Example #13
reduce_and_save_metrics(_SAVE_DIR)
for e in range(_N_TRAIN_EPOCH):
    reduce_and_save_metrics(_SAVE_DIR, f"leaf_epoch_{e}.json", f"mean_epoch_{e}.json")


# setup and run test set

for holdout_source in _DATADEF.domain_names:
    save_metric_path = join(_SAVE_DIR, holdout_source, "leaf_test.json")
    if exists(save_metric_path):
        print(">> skip test", holdout_source)
        continue
    else:
        print(">> test", holdout_source)

    test_samples = _DATADEF.load_splits_func([holdout_source], ["test"])["test"]
    test_dataset = RobertaDataset(
        test_samples,
        n_classes=_DATADEF.n_classes,
        domain_names=_DATADEF.domain_names,
        source2labelprops=_DATADEF.load_labelprops_func("test"),
    )

    checkpointpath = join(_SAVE_DIR, holdout_source, "checkpoint.pth")
    model = torch.load(checkpointpath).to(AUTO_DEVICE)
    test_metrics = do_valid(model, test_dataset)

    save_json(test_metrics, save_metric_path)

reduce_and_save_metrics(_SAVE_DIR, "leaf_test.json", "mean_test.json")
Example #14
        # primary frame train set: any sample not in the primary-frame test set, with a non-null primary frame
        trainsets["primary_frame"] = list({
            id
            for id, item in data.items()
            if (id in ids and id not in testsets["primary_frame"] and
                item["primary_frame"] != 0 and item["primary_frame"] is not None)
        })

        # primary tone train set: any sample not in the primary-tone test set, with a non-null primary tone
        trainsets["primary_tone"] = list({
            id
            for id, item in data.items()
            if (id in ids and id not in testsets["primary_tone"]
                and item["primary_tone"] != 0 and item["primary_tone"] is not None)
        })
        save_json(trainsets, join(FRAMING_DATA_DIR,
                                  f"{issue}_train_sets.json"))

        stat = {
            "raw": len(data),
        }
        stat.update({
            f"train_{setname}": len(ids)
            for setname, ids in trainsets.items()
        })
        stat.update(
            {f"test_{setname}": len(ids)
             for setname, ids in testsets.items()})
        stats.append(stat)

        for k, v in stat.items():
            print("--", k, v)
Example #15
for holdout_source in _DATADEF.domain_names:
    table = holdout_source_to_table[holdout_source]
    result = mcnemar(table.T)
    results[holdout_source] = {
        "pvalue": result.pvalue,
        "statistic": result.statistic,
    }
all_result = mcnemar(fulltable.T)
results["all"] = {
    "pvalue": all_result.pvalue,
    "statistic": all_result.statistic,
}
results["fulltable"] = fulltable.tolist()

save_json(
    results,
    join(_OUTPUT_SAVE_DIR, f"mcnemars_{_DATASET_NAME}.{_ARCH1}@{_ARCH2}.json"))
print("mcnemars p", results["all"]["pvalue"])


# card, power analysis
def compute_power(prob_table, dataset_size, alpha=0.05, r=5000):
    """
    Dallas Card et al. "With Little Power Comes Great Responsibility"
    https://colab.research.google.com/drive/1anaS-9ElouZhUgCAYQt8jy8qBiaXnnK1?usp=sharing#scrollTo=OCz-VAm_ifqZ
    """
    if prob_table[0, 1] == prob_table[1, 0]:
        raise RuntimeError("Power is undefined when the true effect is zero.")

    pvals = []
    diffs = []
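
# --- hedged sketch (not the original code): one way the simulation above could finish,
# assuming `mcnemar` is statsmodels' test (as used earlier) and prob_table holds the
# 2x2 cell probabilities of the paired contingency table.
import numpy as np

def _compute_power_sketch(prob_table, dataset_size, alpha=0.05, r=5000):
    true_sign = np.sign(prob_table[0, 1] - prob_table[1, 0])
    cell_probs = np.asarray(prob_table, dtype=float).flatten()
    n_significant_same_direction = 0
    for _ in range(r):
        # draw one simulated dataset of paired outcomes, then re-run McNemar's test
        counts = np.random.multinomial(dataset_size, cell_probs).reshape(2, 2)
        sim_result = mcnemar(counts)
        same_direction = np.sign(counts[0, 1] - counts[1, 0]) == true_sign
        if sim_result.pvalue < alpha and same_direction:
            n_significant_same_direction += 1
    return n_significant_same_direction / r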
Example #16
from os import makedirs
from os.path import join

from experiments.datadef.definitions.amazon import (
    CATEGORIES,
    LABELPROPS_DIR,
    RATING_N_CLASSES,
)
from modapt.dataset.amazon.samples import load_all_amazon_review_samples
from modapt.dataset.common import calculate_labelprops
from modapt.utils import save_json

makedirs(LABELPROPS_DIR, exist_ok=True)

for split in ["train", "valid", "test"]:
    samples = load_all_amazon_review_samples(CATEGORIES, split)
    source2labelprops = calculate_labelprops(samples, RATING_N_CLASSES,
                                             CATEGORIES)
    save_json(
        {
            issue: labelprops.tolist()
            for issue, labelprops in source2labelprops.items()
        },
        join(LABELPROPS_DIR, f"{split}.json"),
    )
Example #17
            config,
            _DATADEF,
            train_samples=train_samples,
            valid_samples=valid_samples,
            vocab_size=config["vocab_size"],
            logdir=join(savedir, train_source),
            train_labelprop_split="train",
            valid_labelprop_split="train",
        )

        model = torch.load(join(savedir, train_source, "model.pth"))
        vocab = read_txt_as_str_list(join(savedir, train_source, "vocab.txt"))

        test_samples = _DATADEF.load_splits_func(holdout_sources,
                                                 ["test"])["test"]
        test_metrics = eval_lexicon_model(
            model,
            _DATADEF,
            test_samples,
            vocab,
            use_lemmatize=False,
            use_source_individual_norm=config["use_source_individual_norm"],
            labelprop_split="test",
        )
        save_json(test_metrics, join(savedir, train_source, "leaf_test.json"))

    save_json(config, join(savedir, "config.json"))

reduce_and_save_metrics(_SAVE_ROOT)
reduce_and_save_metrics(_SAVE_ROOT, "leaf_test.json", "mean_test.json")
Example #18
                )
                return metrics["valid_f1"]

            for ti in range(_N_TRIALS):
                selected_sample = all_samples[ti * nsample : (ti + 1) * nsample]

                for label_est_samples, valid_samples in _2fold(selected_sample):
                    acc = _eval_lex_model(label_est_samples, valid_samples)
                    source2type2accs[source]["selected"].append(acc)

                fullacc = _eval_lex_model(selected_sample, all_samples)
                source2type2accs[source]["full"].append(fullacc)

        lexicon_model_perf[str(nsample)] = dict(source2type2accs)

    save_json(lexicon_model_perf, _LEXICON_MODEL_PERFORMANCE_SAVE_PATH)
else:
    lexicon_model_perf = load_json(_LEXICON_MODEL_PERFORMANCE_SAVE_PATH)

lexicon_model_stats = {}
for nsample in _LABELPROPS_ESTIMATE_NSAMPLES:
    accs = []
    deltas = []
    for source in _DATADEF.domain_names:
        source_technique_mean = np.array(
            lexicon_model_perf[str(nsample)][source]["full"]
        ).mean()
        source_base_mean = lexicon_model_perf["no_technique"][source]
        accs.append(source_technique_mean)
        deltas.append(source_technique_mean - source_base_mean)
    lexicon_model_stats[str(nsample)] = {