Example 1
def _eval_lex_model(label_est_samples, valid_samples) -> float:
    # Estimate label proportions from label_est_samples, then evaluate the
    # lexicon model on valid_samples under those estimated proportions.
    # `model`, `vocab`, and `_LEXICON_CONFIG` are defined elsewhere in the
    # enclosing script.
    estimated_labelprops = {
        "estimated": calculate_labelprops(
            label_est_samples,
            _DATADEF.n_classes,
            _DATADEF.domain_names,
        )
    }
    datadef = get_datadef(_DATASET_NAME)
    datadef.load_labelprops_func = lambda _split: estimated_labelprops[
        _split
    ]
    metrics = eval_lexicon_model(
        model,
        datadef,
        valid_samples,
        vocab,
        use_source_individual_norm=_LEXICON_CONFIG[
            "use_source_individual_norm"
        ],
        labelprop_split="estimated",  # must match the key served by the patched load_labelprops_func
    )
    return metrics["valid_f1"]
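
# A minimal usage sketch (assumption, not part of the original script):
# estimate label proportions from a small labeled subset of the validation
# data, then score the remainder. Assumes the datadef exposes a "valid" split
# and that `model` and `vocab` were already produced by the lexicon training
# step; the 100-sample estimation subset is illustrative only.
#
#   all_valid = _DATADEF.load_splits_func(_DATADEF.domain_names, ["valid"])["valid"]
#   label_est_samples, heldout = all_valid[:100], all_valid[100:]
#   print(round(_eval_lex_model(label_est_samples, heldout), 3))
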
import sys
from os.path import basename, join, realpath

import torch
from config import LEXICON_DIR
from experiments.datadef.zoo import get_datadef
from modapt.eval import reduce_and_save_metrics
from modapt.lexicon import eval_lexicon_model, run_lexicon_experiment
from modapt.model.logreg_config.grid_search import load_logreg_model_config_all_archs
from modapt.utils import read_txt_as_str_list, save_json

_DATASET_NAME = sys.argv[1]
_REG = float(sys.argv[2])
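# invoked as: py <script> <dataset_name> <reg>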

_DATADEF = get_datadef(_DATASET_NAME)

_SCRIPT_PATH = realpath(__file__)
_EXPERIMENT_NAME = basename(_SCRIPT_PATH).replace(".py", "")
_SAVE_ROOT = join(LEXICON_DIR, _DATASET_NAME, _EXPERIMENT_NAME)
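# _SAVE_ROOT resolves to LEXICON_DIR/<dataset_name>/<this script's filename without .py>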

_ARCH2CONFIG = load_logreg_model_config_all_archs(_DATADEF.n_classes,
                                                  _DATADEF.n_sources)
for arch, config in _ARCH2CONFIG.items():
    config["reg"] = _REG  # override with cmd arg

    print("\n")
    print("+" * 30)
    print(arch)
    print("+" * 30)
# py <script> <savedir>
import sys
from os import makedirs
from os.path import join
from random import Random

import pandas as pd
from config import RANDOM_SEED
from experiments.datadef.zoo import get_datadef
from modapt.dataset.common import to_df

_DATADEF = get_datadef("sentiment")
_SAVEDIR = sys.argv[1]

_N_TRAIN_SAMPLE_PER_DOMAIN = 100
_N_VALID_LABELED = 20
_N_VALID_UNLABELED = 20

_RNG = Random(RANDOM_SEED)
_TRAIN_DOMAINS = ["airline", "imdb", "senti140", "sst"]
_VALID_DOMAIN = "amazon"

makedirs(_SAVEDIR, exist_ok=True)

train_samples = []
for d in _TRAIN_DOMAINS:
    samples = _DATADEF.load_splits_func([d], ["train"])["train"]
    _RNG.shuffle(samples)
    train_samples.extend(samples[:_N_TRAIN_SAMPLE_PER_DOMAIN])
train_df = to_df(train_samples)
train_df.to_csv(join(_SAVEDIR, "train.csv"))
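
# A hypothetical continuation (assumption; not shown above): the unused
# _N_VALID_LABELED / _N_VALID_UNLABELED constants and _VALID_DOMAIN suggest the
# script next samples a held-out validation set the same way. The split choice
# and output file names below are illustrative only.
valid_samples = _DATADEF.load_splits_func([_VALID_DOMAIN], ["train"])["train"]
_RNG.shuffle(valid_samples)
valid_labeled_df = to_df(valid_samples[:_N_VALID_LABELED])
valid_unlabeled_df = to_df(
    valid_samples[_N_VALID_LABELED : _N_VALID_LABELED + _N_VALID_UNLABELED]
)
valid_labeled_df.to_csv(join(_SAVEDIR, "valid_labeled.csv"))
valid_unlabeled_df.to_csv(join(_SAVEDIR, "valid_unlabeled.csv"))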
Example 4
from os.path import join

import pandas as pd
from config import DATA_DIR
from experiments.datadef import zoo
from modapt.utils import load_json, save_json

FRAMING_DATA_DIR = join(DATA_DIR, "framing_labeled")
ISSUES = zoo.get_datadef("framing").domain_names

if __name__ == "__main__":
    stats = []

    for issue in ISSUES:
        print(">>", issue)
        data = load_json(join(FRAMING_DATA_DIR, f"{issue}_labeled.json"))
        ids = list(data.keys())

        testsets = load_json(join(FRAMING_DATA_DIR, f"{issue}_test_sets.json"))
        # convert each test-set id list to a set for fast membership checks
        testsets = {setname: set(id_list) for setname, id_list in testsets.items()}

        trainsets = {}
        # relevance train set: any sample not in the relevance test set
        trainsets["relevance"] = list({
            id
            for id in data if (id in ids and id not in testsets["relevance"])
        })

        # primary frame train set: any sample not in the primary_frame test set
        # and with a non-null primary frame
        trainsets["primary_frame"] = list({
            id
Example 5
]


def get_valid_accs(metrics):
    return np.array([
        max(
            metrics[source]["mean"].get("f1", 0),
            metrics[source]["mean"].get("valid_f1", 0),
            metrics[source]["mean"].get("valid_f1.best", 0),
        ) for source in _DATADEF.domain_names
    ])


for datasetname in _DATASETS:
    rows = {}
    _DATADEF = get_datadef(datasetname)

    source2labelprops = _DATADEF.load_labelprops_func(
        "train" if "test" not in _METRIC_FILENAME else "test")
    # accuracy if we naively predict each source's most common label
    most_common_acc = np.array([a.max()
                                for a in source2labelprops.values()]).mean()
    rows["most_common"] = {"acc": round(most_common_acc, 3), "delta_std": "-"}

    lexicon_metrics = load_json(
        join(LEXICON_DIR, datasetname, _EXP_NAME, _METRIC_FILENAME))
    lexicon_base_accs = np.array([
        lexicon_metrics[_LEXICON_BASE_ARCH][source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names
    ])
    rows[_LEXICON_BASE_ARCH] = {