Example #1
def get_nltk_data():
    """Download nltk data to ``<fibber_root_dir>/nltk_data``."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "tokenizers")
    if not os.path.exists(os.path.join(data_dir, "punkt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk-punkt"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "corpora")
    if not os.path.exists(os.path.join(data_dir, "stopwords")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk_stopwords"])
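A minimal usage sketch, assuming get_nltk_data and fibber's get_root_dir are importable here: after downloading, the data directory still has to be registered on nltk.data.path (as the setup code in Example #15 does) before NLTK can resolve the punkt tokenizer.

import os

import nltk

get_nltk_data()  # downloads punkt and stopwords into <fibber_root_dir>/common/nltk_data
nltk.data.path += [os.path.join(get_root_dir(), "common", "nltk_data")]
tokens = nltk.word_tokenize("Fibber paraphrases sentences.")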
Example #2
def save_robust_tuned_model(self, desc, step):
    model_dir = os.path.join(get_root_dir(), "bert_clf",
                             self._dataset_name, desc)
    ckpt_path = os.path.join(model_dir,
                             self._model_init + "-%04dk" % (step // 1000))
    self._model.save_pretrained(ckpt_path)
    logger.info("BERT classifier saved at %s.", ckpt_path)
Example #3
def get_corenlp():
    """Download stanford corenlp package.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "stanford-corenlp-4.1.0")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["stanford-corenlp"])
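A hedged sketch of how the downloaded package could be used, assuming the optional stanza client is installed (it is not part of these examples); CORENLP_HOME mirrors the environment variable set in Example #15.

import os

from stanza.server import CoreNLPClient  # optional dependency, assumed here

get_corenlp()
os.environ["CORENLP_HOME"] = os.path.join(get_root_dir(), "common",
                                          "stanford-corenlp-4.1.0")
with CoreNLPClient(annotators=["tokenize", "ssplit", "pos"], memory="4G") as client:
    annotation = client.annotate("Fibber generates paraphrases.")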
Example #4
def get_bert_clf_demo():
    """Download the pretrained classifier for demo dataset."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "bert_clf")
    if not os.path.exists(os.path.join(data_dir, "demo")):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["bert-base-uncased-clf-demo"])
Example #5
def update_overview_result(overview_result):
    """Write the overview result to file.

    Args:
        overview_result (pandas.DataFrame): the overview result.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "overview.csv")
    overview_result.to_csv(result_filename, index=False)
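For illustration, a small DataFrame with hypothetical columns can be written out through the helper above.

import pandas as pd

overview = pd.DataFrame([
    {"dataset": "demo", "strategy": "identity", "accuracy": 0.91},  # hypothetical columns
])
update_overview_result(overview)  # written to <fibber_root_dir>/results/overview.csv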
Example #6
def load_detailed_result():
    """Read detailed results from file.

    Returns:
        (pandas.DataFrame): the detailed result table. Returns an empty DataFrame if file does not
        exist.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        return pd.read_csv(result_filename)
    else:
        return pd.DataFrame()
Example #7
def get_stopwords():
    """Download default stopword words.

    Returns:
        ([str]): a list of strings.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    download_file(subdir=data_dir,
                  **downloadable_resource_urls["default-stopwords"])

    with open(os.path.join(data_dir, "stopwords.txt")) as f:
        stopwords = f.readlines()
    stopwords = [x.strip().lower() for x in stopwords]
    return stopwords
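A short usage sketch: the returned list contains lowercase strings, so it can be used directly to filter tokens.

stopwords = set(get_stopwords())
tokens = ["This", "movie", "is", "surprisingly", "good"]
content_tokens = [t for t in tokens if t.lower() not in stopwords]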
Example #8
def get_universal_sentence_encoder():
    """Download pretrained universal sentence encoder.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "tfhub_pretrained",
                            "universal-sentence-encoder-large_5")
    if not os.path.exists(data_dir):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["universal-sentence-encoder"])

    return data_dir
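The returned directory can be loaded with TensorFlow Hub; a minimal sketch, assuming tensorflow and tensorflow_hub are installed.

import tensorflow_hub as hub

use_dir = get_universal_sentence_encoder()
encoder = hub.load(use_dir)
embeddings = encoder(["a sentence to embed", "another sentence"])  # shape (2, 512)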
Example #9
def get_transformers(name):
    """Download pretrained transformer models.

    Args:
        name (str): the name of the pretrained models. options are ``["bert-base-cased",
            "bert-base-uncased", "gpt2-medium"]``.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "transformers_pretrained")
    if not os.path.exists(os.path.join(data_dir, name)):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls[name])

    return os.path.join(data_dir, name)
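A usage sketch, assuming Hugging Face transformers is installed: the returned path can be passed to from_pretrained so weights load from the fibber cache instead of the network.

from transformers import AutoModel, AutoTokenizer

model_dir = get_transformers("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModel.from_pretrained(model_dir)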
Example #10
def get_dataset(dataset_name):
    """Load a built-in dataset as a pair of dicts.

    Users should make sure the data is downloaded to the ``datasets`` folder in the fibber root
    dir (default: ``~/cache/datasets``). Otherwise, an assertion error is raised.

    Args:
        dataset_name (str): the name of the dataset. See ``https://dai-lab.github.io/fibber/``
            for a full list of built-in datasets.

    Returns:
        (dict, dict): a tuple of two dicts, representing the training set and the test set
        respectively.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "datasets")

    if dataset_name == "mnli" or dataset_name == "mnli_mis":
        train_filename = os.path.join(data_dir, "mnli/train.json")
        if dataset_name == "mnli":
            test_filename = os.path.join(data_dir, "mnli/dev_matched.json")
        else:
            test_filename = os.path.join(data_dir, "mnli/dev_mismatched.json")

    else:
        train_filename = os.path.join(data_dir, dataset_name, "train.json")
        test_filename = os.path.join(data_dir, dataset_name, "test.json")

    if not os.path.exists(train_filename) or not os.path.exists(test_filename):
        logger.error("%s dataset not found.", dataset_name)
        assert 0, ("Please use `python3 -m datasets.download_datasets` "
                   "to download datasets.")

    with open(train_filename) as f:
        trainset = json.load(f)

    with open(test_filename) as f:
        testset = json.load(f)

    logger.info("%s training set has %d records.", dataset_name,
                len(trainset["data"]))
    logger.info("%s test set has %d records.", dataset_name,
                len(testset["data"]))
    return trainset, testset
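A minimal usage sketch; the dataset name is illustrative, and only the "data" and "label_mapping" keys used elsewhere in these examples are assumed.

trainset, testset = get_dataset("ag")  # illustrative dataset name
print(len(trainset["data"]), "training records")
print("labels:", trainset["label_mapping"])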
Example #11
def get_demo_dataset():
    """Download the demo dataset.

    Returns:
        (dict, dict): trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset
Example #12
def update_detailed_result(aggregated_result, result_dir=None):
    """Read the detailed result table and append a row to it. Create a new file if the table
    does not exist.

    Args:
        aggregated_result (dict): the aggregated result as a dict.
        result_dir (str or None): the directory to save results. If None, use
            ``<fibber_root_dir>/results/``.
    """
    if result_dir is None:
        result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        results = pd.read_csv(result_filename)
    else:
        results = pd.DataFrame()

    # DataFrame.append was removed in recent pandas versions; use pd.concat instead.
    results = pd.concat([results, pd.DataFrame([aggregated_result])],
                        ignore_index=True)
    results.to_csv(result_filename, index=False)
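For illustration, each call appends one aggregated-result row; the keys below are hypothetical.

aggregated = {"dataset": "demo", "strategy": "identity", "accuracy": 0.91}  # hypothetical keys
update_detailed_result(aggregated)  # appended to <fibber_root_dir>/results/detail.csv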
Example #13
def get_glove_emb(download_only=False):
    """Download default pretrained glove embeddings and return a dict.

    We use the 300-dimensional model trained on Wikipedia 2014 + Gigaword 5.
    See https://nlp.stanford.edu/projects/glove/

    Args:
        download_only (bool): set True to only download the file; the function then returns None.

    Returns:
        (dict): a dict of GloVe word embedding model.
            "emb_table": a numpy array of size(N, 300)
            "id2tok": a list of strings.
            "tok2id": a dict that maps word (string) to its id.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "glove.6B.300d.txt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["default-glove-embeddings"])

    if download_only:
        return None
    return load_glove_model(os.path.join(data_dir, "glove.6B.300d.txt"), 300)
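A usage sketch based on the dict layout documented above: look up a token id, then index the embedding table.

glove = get_glove_emb()
idx = glove["tok2id"]["movie"]
vec = glove["emb_table"][idx]  # numpy array of shape (300,)
print(glove["id2tok"][idx], vec[:5])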
Example #14
def load_or_train_bert_clf(model_init, dataset_name, trainset, testset,
                           bert_clf_steps, bert_clf_bs, bert_clf_lr,
                           bert_clf_optimizer, bert_clf_weight_decay,
                           bert_clf_period_summary, bert_clf_period_val,
                           bert_clf_period_save, bert_clf_val_steps, device):
    """Train BERT classification model on a dataset.

    The trained model will be stored at ``<fibber_root_dir>/bert_clf/<dataset_name>/``. If there's
    a saved model, load and return the model. Otherwise, train the model using the given data.

    Args:
        model_init (str): pretrained model name. Choose from ``["bert-base-cased",
            "bert-base-uncased", "bert-large-cased", "bert-large-uncased"]``.
        dataset_name (str): the name of the dataset. This is also the dir to save trained model.
        trainset (dict): a fibber dataset.
        testset (dict): a fibber dataset.
        bert_clf_steps (int): steps to train a classifier.
        bert_clf_bs (int): the batch size.
        bert_clf_lr (float): the learning rate.
        bert_clf_optimizer (str): the optimizer name.
        bert_clf_weight_decay (float): the weight decay.
        bert_clf_period_summary (int): the period in steps to write training summary.
        bert_clf_period_val (int): the period in steps to run validation and write validation
            summary.
        bert_clf_period_save (int): the period in steps to save current model.
        bert_clf_val_steps (int): the number of batches in each validation.
        device (torch.Device): the device to run the model.

    Returns:
        (transformers.BertForSequenceClassification): a torch BERT model.
    """
    model_dir = os.path.join(get_root_dir(), "bert_clf", dataset_name)
    ckpt_path = os.path.join(model_dir,
                             model_init + "-%04dk" % (bert_clf_steps // 1000))

    if os.path.exists(ckpt_path):
        logger.info("Load BERT classifier from %s.", ckpt_path)
        model = BertForSequenceClassification.from_pretrained(ckpt_path)
        model.eval()
        model.to(device)
        return model

    num_labels = len(trainset["label_mapping"])
    model = BertForSequenceClassification.from_pretrained(
        utils.get_transformers(model_init), num_labels=num_labels).to(device)
    model.train()

    logger.info("Use %s tokenizer and classifier.", model_init)
    logger.info("Num labels: %s", num_labels)

    summary = SummaryWriter(os.path.join(model_dir, "summary"))

    dataloader = torch.utils.data.DataLoader(
        DatasetForBert(trainset, model_init, bert_clf_bs),
        batch_size=None, num_workers=2)

    dataloader_val = torch.utils.data.DataLoader(
        DatasetForBert(testset, model_init, bert_clf_bs),
        batch_size=None, num_workers=1)
    dataloader_val_iter = iter(dataloader_val)

    params = model.parameters()

    opt, sche = get_optimizer(bert_clf_optimizer, bert_clf_lr,
                              bert_clf_weight_decay, bert_clf_steps, params)

    global_step = 0
    correct_train, count_train = 0, 0
    for seq, mask, tok_type, label in tqdm.tqdm(dataloader,
                                                total=bert_clf_steps):
        global_step += 1
        seq = seq.to(device)
        mask = mask.to(device)
        tok_type = tok_type.to(device)
        label = label.to(device)

        outputs = model(seq, mask, tok_type, labels=label)
        loss, logits = outputs[:2]

        count_train += seq.size(0)
        correct_train += (logits.argmax(
            dim=1).eq(label).float().sum().detach().cpu().numpy())

        opt.zero_grad()
        loss.backward()
        opt.step()
        sche.step()

        if global_step % bert_clf_period_summary == 0:
            summary.add_scalar("clf_train/loss", loss, global_step)
            summary.add_scalar("clf_train/error_rate",
                               1 - correct_train / count_train, global_step)
            correct_train, count_train = 0, 0

        if global_step % bert_clf_period_val == 0:
            run_evaluate(model, dataloader_val_iter, bert_clf_val_steps,
                         summary, global_step, device)

        if global_step % bert_clf_period_save == 0 or global_step == bert_clf_steps:
            ckpt_path = os.path.join(
                model_dir, model_init + "-%04dk" % (global_step // 1000))
            if not os.path.exists(ckpt_path):
                os.makedirs(ckpt_path)
            model.save_pretrained(ckpt_path)
            logger.info("BERT classifier saved at %s.", ckpt_path)

        if global_step >= bert_clf_steps:
            break
    model.eval()
    return model
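A hedged end-to-end sketch; the hyperparameter values and optimizer name below are illustrative, not fibber's defaults.

import torch

trainset, testset = get_demo_dataset()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clf = load_or_train_bert_clf(
    model_init="bert-base-uncased", dataset_name="demo",
    trainset=trainset, testset=testset,
    bert_clf_steps=20000, bert_clf_bs=32, bert_clf_lr=2e-5,
    bert_clf_optimizer="adamw", bert_clf_weight_decay=0.001,  # illustrative values
    bert_clf_period_summary=100, bert_clf_period_val=500,
    bert_clf_period_save=5000, bert_clf_val_steps=10, device=device)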
Example #15
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import nltk

import argparse

from engine import *
import utils
from utils.download_utils import get_root_dir
from utils import log

logger = log.setup_custom_logger(__name__)
log.remove_logger_tf_handler(logger)

# Point the NLTK, Transformers, TF Hub, and CoreNLP caches at the fibber root directory.
nltk.data.path += [os.path.join(get_root_dir(), "common", "nltk_data")]
os.environ['TRANSFORMERS_CACHE'] = os.path.join(get_root_dir(), "common",
                                                "transformers_pretrained")
os.environ['TFHUB_CACHE_DIR'] = os.path.join(get_root_dir(), "common",
                                             "tfhub_pretrained")
os.environ['CORENLP_HOME'] = os.path.join(get_root_dir(), "common",
                                          "stanford-corenlp-4.1.0")


def main():
    parser = argparse.ArgumentParser()

    # add experiment args
    parser.add_argument("--dataset", type=str, default="AAPD")
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument('--dataset_path',