Example 1
def save_robust_tuned_model(self, desc, step):
    model_dir = os.path.join(get_root_dir(), "bert_clf",
                             self._dataset_name, desc)
    ckpt_path = os.path.join(model_dir,
                             self._model_init + "-%04dk" % (step // 1000))
    self._model.save_pretrained(ckpt_path)
    logger.info("BERT classifier saved at %s.", ckpt_path)
Example 2
def update_attack_robust_result(aggregated_result, robust_tuned_clf_desc,
                                robust_tuning_steps, result_dir=None):
    """Read results of attacking robust classifiers, and add a row to the file.
    Create a new file if the table does not exist.

    Args:
        aggregated_result (dict): the aggregated result as a dict.
        robust_tuned_clf_desc (str): the robust tuning description.
        robust_tuning_steps (int): the number of robust tuning steps.
        result_dir (str or None): the directory to save results. If None, use
            ``<fibber_root_dir>/results/``.
    """
    if result_dir is None:
        result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "robust_detail.csv")
    if os.path.exists(result_filename):
        results = pd.read_csv(result_filename)
    else:
        results = pd.DataFrame()

    aggregated_result["robust_tuned_clf_desc"] = robust_tuned_clf_desc
    aggregated_result["robust_tuning_steps"] = robust_tuning_steps
    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead.
    results = pd.concat([results, pd.DataFrame([aggregated_result])],
                        ignore_index=True)
    results = reorder_columns(results)
    results.to_csv(result_filename, index=False)
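A minimal usage sketch; the metric keys placed in ``aggregated_result`` below are hypothetical, since the function simply appends whatever columns the dict contains.

# Hypothetical metric keys -- update_attack_robust_result stores whatever the dict holds.
aggregated_result = {
    "dataset_name": "mr",
    "after_attack_accuracy": 0.42,
}
update_attack_robust_result(aggregated_result,
                            robust_tuned_clf_desc="adv-tuning-v1",
                            robust_tuning_steps=5000)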
Example 3
def load_robust_tuned_model(self, desc, step):
    model_dir = os.path.join(get_root_dir(), "bert_clf",
                             self._dataset_name, desc)
    ckpt_path = os.path.join(model_dir,
                             self._model_init + "-%04dk" % (step // 1000))
    self._model = BertForSequenceClassification.from_pretrained(ckpt_path)
    self._model.eval()
    self._model.to(self._device)
    logger.info("Load BERT classifier from %s.", ckpt_path)
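The methods in Examples 1 and 3 pair up. A hedged sketch of how a classifier object might round-trip a robust-tuned checkpoint; ``clf`` stands for an instance of the (unshown) class that defines both methods.

# `clf` is a hypothetical instance of the classifier class shown only partially above.
desc, step = "adv-tuning-v1", 5000        # hypothetical description and step count
clf.save_robust_tuned_model(desc, step)   # writes .../bert_clf/<dataset>/<desc>/<model_init>-0005k
clf.load_robust_tuned_model(desc, step)   # reloads the same checkpoint in eval mode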
Example 4
def update_overview_result(overview_result):
    """write overview result to file.

    Args:
        overview_result (pandas.DataFrame): the overview result.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "overview.csv")
    overview_result.to_csv(result_filename, index=False)
Example 5
def load_detailed_result():
    """Read detailed results from file.

    Returns:
        (pandas.DataFrame): the detailed result table. Returns an empty DataFrame if the file
        does not exist.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        return pd.read_csv(result_filename)
    else:
        return pd.DataFrame()
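A short usage sketch; because a missing file yields an empty DataFrame, callers need no existence check.

# Works whether or not <fibber_root_dir>/results/detail.csv exists.
results = load_detailed_result()
if results.empty:
    print("No detailed results recorded yet.")
else:
    print(len(results), "rows with columns:", list(results.columns))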
Example 6
def download_file(filename,
                  url,
                  md5,
                  subdir=None,
                  untar=False,
                  unzip=False,
                  abs_path=None):
    """Download file from a given url.

    This downloads a file to ``<fibber_root_dir>/subdir``. If the file already exists and the md5
    matches, using the existing file.

    Args:
        filename (str): filename as a string.
        url (str): the url to download the file.
        md5 (str): the md5 checksum of the file.
        subdir (str): the subdir to save the file. Dir will be created if not exists.
        untar (bool): whether to untar the file.
        unzip (bool): whether to unzip the file.
        abs_path (str): a folder to download files. (ignore fibber_root_dir)
    """
    target_dir = get_root_dir()
    if subdir is not None:
        target_dir = os.path.join(target_dir, subdir)
    if abs_path is not None:
        target_dir = abs_path
    os.makedirs(target_dir, exist_ok=True)
    target_file_absolute_path = os.path.join(target_dir, filename)

    if (os.path.exists(target_file_absolute_path)
            and check_file_md5(target_file_absolute_path, md5)):
        logger.info("Load %s from cache. md5 checksum is correct.", filename)
        if untar:
            my_tar = tarfile.open(target_file_absolute_path)
            my_tar.extractall(target_dir)
            my_tar.close()
        if unzip:
            my_zip = zipfile.ZipFile(target_file_absolute_path, "r")
            my_zip.extractall(target_dir)
            my_zip.close()
    else:
        logger.info("Download %s to %s", filename, target_dir)
        tf_get_file(filename,
                    origin=url,
                    cache_subdir="",
                    file_hash=md5,
                    extract=untar or unzip,
                    cache_dir=target_dir)
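A hedged usage sketch; the URL and md5 below are placeholders, not real fibber resources.

# Placeholder URL and checksum -- in practice, pass an entry from a real resource
# table (e.g. downloadable_dataset_urls) instead.
download_file(filename="example.tar.gz",
              url="https://example.com/example.tar.gz",
              md5="0123456789abcdef0123456789abcdef",
              subdir="datasets",
              untar=True)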
Example 7
def get_dataset(dataset_name):
    """Load dataset from fibber root directory.

    Users should make sure the data is downloaded to the ``datasets`` folder in the fibber root
    dir (default: ``~/.fibber/datasets``). Otherwise, an assertion error is raised.

    Args:
        dataset_name (str): the name of the dataset. See ``https://dai-lab.github.io/fibber/``
            for a full list of built-in datasets.

    Returns:
        (dict, dict): a tuple of two dicts, representing the training set and the test set,
        respectively.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "datasets")

    if dataset_name == "mnli" or dataset_name == "mnli_mis":
        train_filename = os.path.join(data_dir, "mnli/train.json")
        if dataset_name == "mnli":
            test_filename = os.path.join(data_dir, "mnli/dev_matched.json")
        else:
            test_filename = os.path.join(data_dir, "mnli/dev_mismatched.json")

    else:
        train_filename = os.path.join(data_dir, dataset_name, "train.json")
        test_filename = os.path.join(data_dir, dataset_name, "test.json")

    if not os.path.exists(train_filename) or not os.path.exists(test_filename):
        logger.error("%s dataset not found.", dataset_name)
        assert 0, ("Please use `python3 -m fibber.datasets.download_datasets` "
                   "to download datasets.")

    with open(train_filename) as f:
        trainset = json.load(f)

    with open(test_filename) as f:
        testset = json.load(f)

    logger.info("%s training set has %d records.", dataset_name,
                len(trainset["data"]))
    logger.info("%s test set has %d records.", dataset_name,
                len(testset["data"]))
    return trainset, testset
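Loading a dataset is then a single call. The keys used below (``data`` and ``label_mapping``) are the ones the loading and training code in these examples relies on.

# Assumes the "mr" dataset has already been downloaded to <fibber_root_dir>/datasets.
trainset, testset = get_dataset("mr")
print(trainset["label_mapping"])            # list of label names
print(len(trainset["data"]), "training records")
print(len(testset["data"]), "test records")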
Example 8
def get_demo_dataset():
    """download demo dataset.

    Returns:
        (dict, dict): trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset
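Because this helper downloads the demo data itself, it is the quickest way to smoke-test an installation. A brief sketch; the ``label_mapping`` key is assumed to match the full dataset format.

# No manual download step needed; the call fetches mr-demo on first use.
trainset, testset = get_demo_dataset()
print(len(trainset["data"]), "demo training records")
print(trainset.get("label_mapping"))        # assumed to mirror the full dataset format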
Example 9
def update_detailed_result(aggregated_result, result_dir=None):
    """Read dataset detailed results and add a row to the file. Create a new file if the table
    does not exist.

    Args:
        aggregated_result (dict): the aggregated result as a dict.
        result_dir (str or None): the directory to save results. If None, use
            ``<fibber_root_dir>/results/``.
    """
    if result_dir is None:
        result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        results = pd.read_csv(result_filename)
    else:
        results = pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead.
    results = pd.concat([results, pd.DataFrame([aggregated_result])],
                        ignore_index=True)
    results = reorder_columns(results)
    results.to_csv(result_filename, index=False)
Example 10
    "ag": preprocess_ag.download_and_preprocess_ag,
    "imdb": preprocess_imdb.download_and_preprocess_imdb,
    "mnli": preprocess_mnli.download_and_preprocess_mnli,
    "mr": preprocess_mr.download_and_preprocess_mr,
    "snli": preprocess_snli.download_and_preprocess_snli,
    "yelp": preprocess_yelp.download_and_preprocess_yelp
}

if __name__ == "__main__":
    FLAGS = parser.parse_args()

    if FLAGS.process_raw == "1":
        for name, processing_func in DATASET_PREPROCESS_FN.items():
            logger.info("Start download and process %s.", name)
            processing_func()

    else:
        download_file(subdir="",
                      **downloadable_dataset_urls["processed-datasets"])

    if FLAGS.verify == "1":
        root_dir = get_root_dir()
        datasets_dir = os.path.join(root_dir, "datasets")
        dataset_json_list = sorted(glob.glob(datasets_dir + "/*/*.json"))
        for json_filename in dataset_json_list:
            logger.info("Verify %s.", json_filename)
            with open(json_filename) as f:
                data = json.load(f)

            verify_dataset(data)
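The block above references a module-level ``parser`` that is not shown. A hedged reconstruction follows: the flag names come from the ``FLAGS`` attributes used above, while the types and defaults are assumptions.

import argparse

# Hypothetical reconstruction of the module-level parser; only the flag names
# (--process_raw, --verify) are grounded in the code above.
parser = argparse.ArgumentParser(
    description="Download and preprocess fibber datasets.")
parser.add_argument("--process_raw", type=str, default="0",
                    help="Set to 1 to download and preprocess the raw datasets.")
parser.add_argument("--verify", type=str, default="1",
                    help="Set to 1 to verify the downloaded dataset files.")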
Example 11
def load_or_train_bert_clf(model_init, dataset_name, trainset, testset,
                           bert_clf_steps, bert_clf_bs, bert_clf_lr,
                           bert_clf_optimizer, bert_clf_weight_decay,
                           bert_clf_period_summary, bert_clf_period_val,
                           bert_clf_period_save, bert_clf_val_steps, device):
    """Train BERT classification model on a dataset.

    The trained model will be stored at ``<fibber_root_dir>/bert_clf/<dataset_name>/``. If there's
    a saved model, load and return the model. Otherwise, train the model using the given data.

    Args:
        model_init (str): pretrained model name. Choose from ``["bert-base-cased",
            "bert-base-uncased", "bert-large-cased", "bert-large-uncased"]``.
        dataset_name (str): the name of the dataset. This is also the directory where the
            trained model is saved.
        trainset (dict): a fibber dataset.
        testset (dict): a fibber dataset.
        bert_clf_steps (int): steps to train a classifier.
        bert_clf_bs (int): the batch size.
        bert_clf_lr (float): the learning rate.
        bert_clf_optimizer (str): the optimizer name.
        bert_clf_weight_decay (float): the weight decay.
        bert_clf_period_summary (int): the period in steps to write training summary.
        bert_clf_period_val (int): the period in steps to run validation and write validation
            summary.
        bert_clf_period_save (int): the period in steps to save current model.
        bert_clf_val_steps (int): the number of batches in each validation.
        device (torch.device): the device to run the model on.

    Returns:
        (transformers.BertForSequenceClassification): a torch BERT model.
    """
    model_dir = os.path.join(get_root_dir(), "bert_clf", dataset_name)
    ckpt_path = os.path.join(model_dir,
                             model_init + "-%04dk" % (bert_clf_steps // 1000))

    if os.path.exists(ckpt_path):
        logger.info("Load BERT classifier from %s.", ckpt_path)
        model = BertForSequenceClassification.from_pretrained(ckpt_path)
        model.eval()
        model.to(device)
        return model

    num_labels = len(trainset["label_mapping"])

    model = BertForSequenceClassification.from_pretrained(
        resources.get_transformers(model_init),
        num_labels=num_labels).to(device)
    model.train()

    logger.info("Use %s tokenizer and classifier.", model_init)
    logger.info("Num labels: %s", num_labels)

    summary = SummaryWriter(os.path.join(model_dir, "summary"))

    dataloader = torch.utils.data.DataLoader(
        DatasetForBert(trainset, model_init, bert_clf_bs),
        batch_size=None, num_workers=2)

    dataloader_val = torch.utils.data.DataLoader(
        DatasetForBert(testset, model_init, bert_clf_bs),
        batch_size=None, num_workers=1)
    dataloader_val_iter = iter(dataloader_val)

    params = model.parameters()

    opt, sche = get_optimizer(bert_clf_optimizer, bert_clf_lr,
                              bert_clf_weight_decay, bert_clf_steps, params)

    global_step = 0
    correct_train, count_train = 0, 0
    for seq, mask, tok_type, label in tqdm.tqdm(dataloader,
                                                total=bert_clf_steps):
        global_step += 1
        seq = seq.to(device)
        mask = mask.to(device)
        tok_type = tok_type.to(device)
        label = label.to(device)

        outputs = model(seq, mask, tok_type, labels=label)
        loss, logits = outputs[:2]

        count_train += seq.size(0)
        correct_train += (logits.argmax(
            dim=1).eq(label).float().sum().detach().cpu().numpy())

        opt.zero_grad()
        loss.backward()
        opt.step()
        sche.step()

        if global_step % bert_clf_period_summary == 0:
            summary.add_scalar("clf_train/loss", loss, global_step)
            summary.add_scalar("clf_train/error_rate",
                               1 - correct_train / count_train, global_step)
            correct_train, count_train = 0, 0

        if global_step % bert_clf_period_val == 0:
            run_evaluate(model, dataloader_val_iter, bert_clf_val_steps,
                         summary, global_step, device)

        if global_step % bert_clf_period_save == 0 or global_step == bert_clf_steps:
            ckpt_path = os.path.join(
                model_dir, model_init + "-%04dk" % (global_step // 1000))
            model.save_pretrained(ckpt_path)
            logger.info("BERT classifier saved at %s.", ckpt_path)

        if global_step >= bert_clf_steps:
            break
    model.eval()
    return model
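A hedged call sketch with illustrative hyperparameters; the parameter names come from the signature above, but the numeric values and the optimizer name are assumptions, not recommendations from the source.

import torch

# Illustrative values only -- tune them for your own setup.
trainset, testset = get_dataset("mr")       # see Example 7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clf = load_or_train_bert_clf(
    model_init="bert-base-uncased",
    dataset_name="mr",
    trainset=trainset,
    testset=testset,
    bert_clf_steps=20000,
    bert_clf_bs=32,
    bert_clf_lr=2e-5,
    bert_clf_optimizer="adamw",             # assumed optimizer name
    bert_clf_weight_decay=0.001,
    bert_clf_period_summary=100,
    bert_clf_period_val=500,
    bert_clf_period_save=5000,
    bert_clf_val_steps=10,
    device=device)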