def get_bert_clf_demo():
    """Download the pretrained classifier for demo dataset."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "bert_clf")
    if not os.path.exists(os.path.join(data_dir, "demo")):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["bert-base-uncased-clf-demo"])
def get_corenlp():
    """Download the Stanford CoreNLP package."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "stanford-corenlp-4.1.0")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["stanford-corenlp"])
def get_nltk_data():
    """Download nltk data to ``<fibber_root_dir>/common/nltk_data``."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "tokenizers")
    if not os.path.exists(os.path.join(data_dir, "punkt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk-punkt"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "corpora")
    if not os.path.exists(os.path.join(data_dir, "stopwords")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk_stopwords"])
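
# Usage sketch (illustrative, not part of the download helpers): after calling
# ``get_nltk_data()``, NLTK still needs to be pointed at the downloaded
# ``<fibber_root_dir>/common/nltk_data`` directory. The helper name
# ``_example_use_nltk_data`` is hypothetical.
def _example_use_nltk_data():
    import nltk

    get_nltk_data()
    nltk.data.path.append(os.path.join(get_root_dir(), "common", "nltk_data"))
    # With punkt on the search path, tokenization works without further downloads.
    return nltk.word_tokenize("Fibber downloads nltk data lazily.")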
def get_bert_lm_demo(path="."):
    """Download the pretrained language model for demo dataset.

    Since this data is algorithm-specific, it is downloaded to ``path``
    instead of ``<fibber_root_dir>``.
    """
    if not os.path.exists(os.path.join(path, "lm_all")):
        download_file(
            abs_path=path,
            **downloadable_resource_urls["bert-base-uncased-lm-demo"])
    if not os.path.exists(os.path.join(path, "wordpiece_emb-demo-0500.pt")):
        download_file(abs_path=path, **downloadable_resource_urls["wpe-demo"])
def get_stopwords():
    """Download the default stopword list.

    Returns:
        ([str]): a list of strings.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    download_file(subdir=data_dir,
                  **downloadable_resource_urls["default-stopwords"])

    with open(os.path.join(data_dir, "stopwords.txt")) as f:
        stopwords = f.readlines()
    stopwords = [x.strip().lower() for x in stopwords]
    return stopwords
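
# Usage sketch (illustrative): filtering a token list against the downloaded
# stopwords. The helper name ``_example_filter_stopwords`` and the ``tokens``
# argument are hypothetical.
def _example_filter_stopwords(tokens):
    stopwords = set(get_stopwords())
    # The list is lowercased on load, so compare lowercased tokens.
    return [tok for tok in tokens if tok.lower() not in stopwords]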
def get_universal_sentence_encoder():
    """Download pretrained universal sentence encoder.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "tfhub_pretrained",
                            "universal-sentence-encoder-large_5")
    if not os.path.exists(data_dir):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["universal-sentence-encoder"])
    return data_dir
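
# Usage sketch (illustrative): loading the downloaded encoder with TensorFlow
# Hub, assuming ``tensorflow_hub`` is installed and the directory contains the
# extracted SavedModel. The helper name ``_example_encode_sentences`` is
# hypothetical.
def _example_encode_sentences(sentences):
    import tensorflow_hub as hub

    model_dir = get_universal_sentence_encoder()
    encoder = hub.load(model_dir)
    # Returns one 512-dimensional embedding per input sentence.
    return encoder(sentences)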
def get_transformers(name):
    """Download a pretrained transformer model.

    Args:
        name (str): the name of the pretrained model. Options are
            ``["bert-base-cased", "bert-base-uncased", "gpt2-medium"]``.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "transformers_pretrained")
    if not os.path.exists(os.path.join(data_dir, name)):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls[name])
    return os.path.join(data_dir, name)
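
# Usage sketch (illustrative): the returned directory can be passed to the
# HuggingFace ``transformers`` ``from_pretrained`` loaders, assuming the
# downloaded archive unpacks the usual config/weights/vocab files. The helper
# name ``_example_load_transformer`` is hypothetical.
def _example_load_transformer():
    from transformers import AutoModel, AutoTokenizer

    model_dir = get_transformers("bert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModel.from_pretrained(model_dir)
    return tokenizer, model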
def get_demo_dataset():
    """Download the demo dataset.

    Returns:
        (dict, dict): trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset
def get_glove_emb(download_only=False):
    """Download the default pretrained GloVe embeddings and return them as a dict.

    We use the 300-dimensional model trained on Wikipedia 2014 + Gigaword 5.
    See https://nlp.stanford.edu/projects/glove/

    Args:
        download_only (bool): set True to only download. (Returns None)

    Returns:
        (dict): a dict of the GloVe word embedding model.
            "emb_table": a numpy array of size (N, 300).
            "id2tok": a list of strings.
            "tok2id": a dict that maps a word (string) to its id.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "glove.6B.300d.txt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["default-glove-embeddings"])
    if download_only:
        return None
    return load_glove_model(os.path.join(data_dir, "glove.6B.300d.txt"), 300)
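
# Usage sketch (illustrative): looking up a single word vector in the dict
# returned above. The helper name ``_example_glove_lookup`` is hypothetical.
def _example_glove_lookup(word):
    glove = get_glove_emb()
    idx = glove["tok2id"].get(word.lower())
    if idx is None:
        return None
    # Each row of ``emb_table`` is the 300-d vector for ``id2tok[idx]``.
    return glove["emb_table"][idx]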
parser.add_argument('--skip_checksum', action='store_true',
                    help='Skip checksum')
parser.add_argument('--skip_extract', action='store_true',
                    help='Skip extracting files')
args = parser.parse_args()
args.e = args.e or args.dest

df = pd.read_csv(args.csv, delimiter=',')

if not args.skip_download:
    for url in df.url:
        fname = url.split('/')[-1]
        print("Downloading %s:" % fname)
        download_file(url=url, dest_folder=args.dest, fname=fname)
else:
    print("Skipping file download")

if not args.skip_checksum:
    for index, row in df.iterrows():
        url = row['url']
        md5 = row['md5']
        fname = url.split('/')[-1]
        fpath = os.path.join(args.dest, fname)
        print("Verifying %s: " % fname, end='')
        ret = md5_checksum(fpath=fpath, target_hash=md5)
        print("Passed" if ret else "Failed")
else:
    print("Skipping checksum")
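
# Sketch of the checksum step used above, assuming ``md5_checksum`` hashes the
# file in chunks and compares the hex digest against the expected value. This is
# an illustration of the idea, not necessarily the project's exact implementation.
def _example_md5_checksum(fpath, target_hash):
    import hashlib

    md5 = hashlib.md5()
    with open(fpath, "rb") as f:
        # Read in 1 MiB chunks so large archives do not need to fit in memory.
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5.update(chunk)
    return md5.hexdigest() == target_hash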