def get_nltk_data():
    """Download nltk data to ``<fibber_root_dir>/nltk_data``."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "tokenizers")
    if not os.path.exists(os.path.join(data_dir, "punkt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk-punkt"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "corpora")
    if not os.path.exists(os.path.join(data_dir, "stopwords")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk_stopwords"])
def save_robust_tuned_model(self, desc, step):
    """Save the current robustly tuned classifier to
    ``<fibber_root_dir>/bert_clf/<dataset_name>/<desc>/``."""
    model_dir = os.path.join(get_root_dir(), "bert_clf", self._dataset_name, desc)
    ckpt_path = os.path.join(model_dir, self._model_init + "-%04dk" % (step // 1000))
    self._model.save_pretrained(ckpt_path)
    logger.info("BERT classifier saved at %s.", ckpt_path)
def get_corenlp():
    """Download the Stanford CoreNLP package."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "stanford-corenlp-4.1.0")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["stanford-corenlp"])
def get_bert_clf_demo():
    """Download the pretrained classifier for the demo dataset."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "bert_clf")
    if not os.path.exists(os.path.join(data_dir, "demo")):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["bert-base-uncased-clf-demo"])
def update_overview_result(overview_result):
    """Write the overview result to file.

    Args:
        overview_result (pandas.DataFrame): the overview result.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "overview.csv")
    overview_result.to_csv(result_filename, index=False)
def load_detailed_result():
    """Read detailed results from file.

    Returns:
        (pandas.DataFrame): the detailed result table. Returns an empty
            DataFrame if the file does not exist.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        return pd.read_csv(result_filename)
    else:
        return pd.DataFrame()
def get_stopwords():
    """Download the default stopword list.

    Returns:
        ([str]): a list of strings.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    download_file(subdir=data_dir,
                  **downloadable_resource_urls["default-stopwords"])

    with open(os.path.join(data_dir, "stopwords.txt")) as f:
        stopwords = f.readlines()
    stopwords = [x.strip().lower() for x in stopwords]
    return stopwords
def get_universal_sentence_encoder():
    """Download pretrained universal sentence encoder.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "tfhub_pretrained",
                            "universal-sentence-encoder-large_5")

    if not os.path.exists(data_dir):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["universal-sentence-encoder"])
    return data_dir
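# Hedged usage sketch (not part of the library): the returned directory points at
# a local copy of universal-sentence-encoder-large_5, so it should be loadable with
# tensorflow_hub; the variable names below are illustrative.
#
# import tensorflow_hub as hub
# use_encoder = hub.load(get_universal_sentence_encoder())
# embeddings = use_encoder(["an example sentence"])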
def get_transformers(name):
    """Download pretrained transformer models.

    Args:
        name (str): the name of the pretrained models. Options are
            ``["bert-base-cased", "bert-base-uncased", "gpt2-medium"]``.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "transformers_pretrained")
    if not os.path.exists(os.path.join(data_dir, name)):
        download_file(subdir=data_dir, **downloadable_resource_urls[name])
    return os.path.join(data_dir, name)
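# Hedged usage sketch: the returned path is a local directory, so it can be passed
# to HuggingFace ``from_pretrained`` (as ``load_or_train_bert_clf`` below does for
# the classifier weights); the tokenizer choice here is illustrative.
#
# from transformers import BertTokenizerFast
# tokenizer = BertTokenizerFast.from_pretrained(get_transformers("bert-base-uncased"))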
def get_dataset(dataset_name):
    """Load a built-in dataset.

    Users should make sure the data is downloaded to the ``datasets`` folder in the
    fibber root dir (default: ``~/cache/datasets``). Otherwise, an assertion error
    is raised.

    Args:
        dataset_name (str): the name of the dataset. See
            ``https://dai-lab.github.io/fibber/`` for a full list of built-in datasets.

    Returns:
        (dict, dict): the function returns a tuple of two dicts, representing the
            training set and test set respectively.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "datasets")

    if dataset_name == "mnli" or dataset_name == "mnli_mis":
        train_filename = os.path.join(data_dir, "mnli/train.json")
        if dataset_name == "mnli":
            test_filename = os.path.join(data_dir, "mnli/dev_matched.json")
        else:
            test_filename = os.path.join(data_dir, "mnli/dev_mismatched.json")
    else:
        train_filename = os.path.join(data_dir, dataset_name, "train.json")
        test_filename = os.path.join(data_dir, dataset_name, "test.json")

    if not os.path.exists(train_filename) or not os.path.exists(test_filename):
        logger.error("%s dataset not found.", dataset_name)
        assert 0, ("Please use `python3 -m datasets.download_datasets` "
                   "to download datasets.")

    with open(train_filename) as f:
        trainset = json.load(f)
    with open(test_filename) as f:
        testset = json.load(f)

    logger.info("%s training set has %d records.", dataset_name, len(trainset["data"]))
    logger.info("%s test set has %d records.", dataset_name, len(testset["data"]))

    return trainset, testset
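# Hedged usage sketch: the dataset name "ag" here is illustrative. Based on the
# logging above, each returned set is a dict with at least a "data" list (and, for
# classifier training, a "label_mapping" list).
#
# trainset, testset = get_dataset("ag")
# print(len(trainset["data"]), len(testset["data"]))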
def get_demo_dataset():
    """Download the demo dataset.

    Returns:
        (dict, dict): trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset
def update_detailed_result(aggregated_result, result_dir=None):
    """Read dataset detailed results and add a row to the file.

    Create a new file if the table does not exist.

    Args:
        aggregated_result (dict): the aggregated result as a dict.
        result_dir (str or None): the directory to save results. If None,
            use ``<fibber_root_dir>/results/``.
    """
    if result_dir is None:
        result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        results = pd.read_csv(result_filename)
    else:
        results = pd.DataFrame()
    results = results.append(aggregated_result, ignore_index=True)
    results.to_csv(result_filename, index=False)
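# Hedged usage sketch: ``aggregated_result`` is assumed to be a flat dict whose keys
# become the columns of ``<fibber_root_dir>/results/detail.csv``; the metric names
# below are illustrative, not the repository's actual schema.
#
# update_detailed_result({"dataset_name": "demo", "strategy": "identity",
#                         "accuracy": 0.91})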
def get_glove_emb(download_only=False):
    """Download the default pretrained GloVe embeddings and return a dict.

    We use the 300-dimensional model trained on Wikipedia 2014 + Gigaword 5.
    See https://nlp.stanford.edu/projects/glove/

    Args:
        download_only (bool): set True to only download. (Returns None.)

    Returns:
        (dict): a dict of the GloVe word embedding model.
            "emb_table": a numpy array of size (N, 300).
            "id2tok": a list of strings.
            "tok2id": a dict that maps a word (string) to its id.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "glove.6B.300d.txt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["default-glove-embeddings"])
    if download_only:
        return None
    return load_glove_model(os.path.join(data_dir, "glove.6B.300d.txt"), 300)
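# Hedged usage sketch, assuming load_glove_model returns the dict documented above;
# the word "movie" is only an example token.
#
# glove = get_glove_emb()
# vec = glove["emb_table"][glove["tok2id"]["movie"]]   # 300-d vector for "movie"
# assert vec.shape == (300,)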
def load_or_train_bert_clf(model_init, dataset_name, trainset, testset,
                           bert_clf_steps, bert_clf_bs, bert_clf_lr,
                           bert_clf_optimizer, bert_clf_weight_decay,
                           bert_clf_period_summary, bert_clf_period_val,
                           bert_clf_period_save, bert_clf_val_steps, device):
    """Train a BERT classification model on a dataset.

    The trained model will be stored at ``<fibber_root_dir>/bert_clf/<dataset_name>/``.
    If there's a saved model, load and return the model. Otherwise, train the model
    using the given data.

    Args:
        model_init (str): pretrained model name. Choose from ``["bert-base-cased",
            "bert-base-uncased", "bert-large-cased", "bert-large-uncased"]``.
        dataset_name (str): the name of the dataset. This is also the dir to save
            the trained model.
        trainset (dict): a fibber dataset.
        testset (dict): a fibber dataset.
        bert_clf_steps (int): steps to train a classifier.
        bert_clf_bs (int): the batch size.
        bert_clf_lr (float): the learning rate.
        bert_clf_optimizer (str): the optimizer name.
        bert_clf_weight_decay (float): the weight decay.
        bert_clf_period_summary (int): the period in steps to write training summary.
        bert_clf_period_val (int): the period in steps to run validation and write
            validation summary.
        bert_clf_period_save (int): the period in steps to save the current model.
        bert_clf_val_steps (int): number of batches in each validation.
        device (torch.device): the device to run the model.

    Returns:
        (transformers.BertForSequenceClassification): a torch BERT model.
    """
    model_dir = os.path.join(get_root_dir(), "bert_clf", dataset_name)
    ckpt_path = os.path.join(model_dir,
                             model_init + "-%04dk" % (bert_clf_steps // 1000))

    if os.path.exists(ckpt_path):
        logger.info("Load BERT classifier from %s.", ckpt_path)
        model = BertForSequenceClassification.from_pretrained(ckpt_path)
        model.eval()
        model.to(device)
        return model

    num_labels = len(trainset["label_mapping"])
    model = BertForSequenceClassification.from_pretrained(
        utils.get_transformers(model_init), num_labels=num_labels).to(device)
    model.train()

    logger.info("Use %s tokenizer and classifier.", model_init)
    logger.info("Num labels: %s", num_labels)
    summary = SummaryWriter(os.path.join(model_dir, "summary"))

    dataloader = torch.utils.data.DataLoader(
        DatasetForBert(trainset, model_init, bert_clf_bs),
        batch_size=None, num_workers=2)
    dataloader_val = torch.utils.data.DataLoader(
        DatasetForBert(testset, model_init, bert_clf_bs),
        batch_size=None, num_workers=1)
    dataloader_val_iter = iter(dataloader_val)

    params = model.parameters()
    opt, sche = get_optimizer(bert_clf_optimizer, bert_clf_lr, bert_clf_weight_decay,
                              bert_clf_steps, params)

    global_step = 0
    correct_train, count_train = 0, 0
    for seq, mask, tok_type, label in tqdm.tqdm(dataloader, total=bert_clf_steps):
        global_step += 1
        seq = seq.to(device)
        mask = mask.to(device)
        tok_type = tok_type.to(device)
        label = label.to(device)

        outputs = model(seq, mask, tok_type, labels=label)
        loss, logits = outputs[:2]

        count_train += seq.size(0)
        correct_train += (logits.argmax(dim=1).eq(label)
                          .float().sum().detach().cpu().numpy())

        opt.zero_grad()
        loss.backward()
        opt.step()
        sche.step()

        if global_step % bert_clf_period_summary == 0:
            summary.add_scalar("clf_train/loss", loss, global_step)
            summary.add_scalar("clf_train/error_rate",
                               1 - correct_train / count_train, global_step)
            correct_train, count_train = 0, 0

        if global_step % bert_clf_period_val == 0:
            run_evaluate(model, dataloader_val_iter, bert_clf_val_steps,
                         summary, global_step, device)

        if global_step % bert_clf_period_save == 0 or global_step == bert_clf_steps:
            ckpt_path = os.path.join(
                model_dir, model_init + "-%04dk" % (global_step // 1000))
            if not os.path.exists(ckpt_path):
                os.makedirs(ckpt_path)
            model.save_pretrained(ckpt_path)
            logger.info("BERT classifier saved at %s.", ckpt_path)

        if global_step >= bert_clf_steps:
            break

    model.eval()
    return model
os.environ["TOKENIZERS_PARALLELISM"] = "false" import nltk import argparse from engine import * import utils from utils.download_utils import get_root_dir from utils import log logger = log.setup_custom_logger(__name__) log.remove_logger_tf_handler(logger) # change cache directory nltk.data.path += [os.path.join(get_root_dir(), "common", "nltk_data")] os.environ['TRANSFORMERS_CACHE'] = os.path.join(get_root_dir(), "common", "transformers_pretrained") os.environ['TFHUB_CACHE_DIR'] = os.path.join(get_root_dir(), "common", "tfhub_pretrained") os.environ['CORENLP_HOME'] = os.path.join(get_root_dir(), "common", "stanford-corenlp-4.1.0") def main(): parser = argparse.ArgumentParser() # add experiment args parser.add_argument("--dataset", type=str, default="AAPD") parser.add_argument("--output_dir", type=str, default=None) parser.add_argument('--dataset_path',