Example #1
def get_bert_clf_demo():
    """Download the pretrained classifier for demo dataset."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "bert_clf")
    if not os.path.exists(os.path.join(data_dir, "demo")):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["bert-base-uncased-clf-demo"])
Example #2
def get_corenlp():
    """Download stanford corenlp package.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "stanford-corenlp-4.1.0")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["stanford-corenlp"])
Example #3
def get_nltk_data():
    """Download nltk data to ``<fibber_root_dir>/nltk_data``."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "tokenizers")
    if not os.path.exists(os.path.join(data_dir, "punkt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk-punkt"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "corpora")
    if not os.path.exists(os.path.join(data_dir, "stopwords")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk_stopwords"])
Example #4
def get_bert_lm_demo(path="."):
    """Download the pretrained language model for demo dataset.

    Since this data is algorithm-specific, it is downloaded to ``path`` instead of
    ``<fibber_root_dir>``.
    """
    if not os.path.exists(os.path.join(path, "lm_all")):
        download_file(
            abs_path=path,
            **downloadable_resource_urls["bert-base-uncased-lm-demo"])

    if not os.path.exists(os.path.join(path, "wordpiece_emb-demo-0500.pt")):
        download_file(abs_path=path, **downloadable_resource_urls["wpe-demo"])
Example #5
def get_stopwords():
    """Download default stopword words.

    Returns:
        ([str]): a list of strings.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    download_file(subdir=data_dir,
                  **downloadable_resource_urls["default-stopwords"])

    with open(os.path.join(data_dir, "stopwords.txt")) as f:
        stopwords = f.readlines()
    stopwords = [x.strip().lower() for x in stopwords]
    return stopwords
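
For context, here is a minimal usage sketch for the function above: the returned list can be used directly to filter tokens. The import path (fibber.resources) and the sample sentence are assumptions, not part of the example.

from fibber.resources import get_stopwords  # assumed import path

stopwords = get_stopwords()  # downloads stopwords.txt on first call, then reads it
tokens = "the movie was not a great movie".lower().split()
content_tokens = [t for t in tokens if t not in stopwords]
print(content_tokens)  # common function words such as "the" should be filtered out
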
Example #6
def get_universal_sentence_encoder():
    """Download pretrained universal sentence encoder.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "tfhub_pretrained",
                            "universal-sentence-encoder-large_5")
    if not os.path.exists(data_dir):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["universal-sentence-encoder"])

    return data_dir
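
A hedged usage sketch for the function above: since hub.load accepts a local SavedModel path, the returned directory can typically be loaded with tensorflow_hub. The fibber.resources import path is an assumption.

import tensorflow_hub as hub

from fibber.resources import get_universal_sentence_encoder  # assumed import path

model_dir = get_universal_sentence_encoder()  # downloads on first call, returns the local dir
use_model = hub.load(model_dir)               # hub.load also accepts local directories
embeddings = use_model(["a first sentence.", "a second sentence."])
print(embeddings.shape)  # (2, 512) for universal-sentence-encoder-large_5
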
Example #7
def get_transformers(name):
    """Download pretrained transformer models.

    Args:
        name (str): the name of the pretrained model. Options are ``["bert-base-cased",
            "bert-base-uncased", "gpt2-medium"]``.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "transformers_pretrained")
    if not os.path.exists(os.path.join(data_dir, name)):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls[name])

    return os.path.join(data_dir, name)
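
A hedged usage sketch for the function above: Hugging Face's from_pretrained accepts a local directory, so the returned path can be passed to AutoTokenizer and AutoModel, assuming the downloaded directory contains the usual config, vocabulary, and weight files. The fibber.resources import path is an assumption.

from transformers import AutoModel, AutoTokenizer

from fibber.resources import get_transformers  # assumed import path

model_dir = get_transformers("bert-base-uncased")     # downloads on first call
tokenizer = AutoTokenizer.from_pretrained(model_dir)  # from_pretrained accepts a local dir
model = AutoModel.from_pretrained(model_dir)
inputs = tokenizer("a short example sentence", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, hidden_size)
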
Example #8
def get_demo_dataset():
    """download demo dataset.

    Returns:
        (dict, dict): trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset
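
A hedged usage sketch for the function above; it relies only on the "data" field that the example itself logs, and the import path is an assumption.

from fibber.resources import get_demo_dataset  # assumed import path

trainset, testset = get_demo_dataset()  # downloads mr-demo on first call
print(len(trainset["data"]), len(testset["data"]))  # record counts, as logged above
print(trainset["data"][0])  # inspect one raw record of the demo dataset
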
Example #9
def get_glove_emb(download_only=False):
    """Download default pretrained glove embeddings and return a dict.

    We use the 300-dimensional model trained on Wikipedia 2014 + Gigaword 5.
    See https://nlp.stanford.edu/projects/glove/

    Args:
        download_only (bool): set to True to download only; the function then returns None.

    Returns:
        (dict): a dict describing the GloVe word embedding model:
            "emb_table": a numpy array of size (N, 300).
            "id2tok": a list of strings (the token for each id).
            "tok2id": a dict that maps a word (string) to its id.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "glove.6B.300d.txt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["default-glove-embeddings"])

    if download_only:
        return None
    return load_glove_model(os.path.join(data_dir, "glove.6B.300d.txt"), 300)
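
A hedged usage sketch for the function above, based only on the keys described in its docstring (emb_table, id2tok, tok2id); the import path and the word looked up are assumptions.

from fibber.resources import get_glove_emb  # assumed import path

glove = get_glove_emb()
vec = glove["emb_table"][glove["tok2id"]["movie"]]  # one 300-dimensional embedding
print(vec.shape)                                    # (300,)
print(glove["id2tok"][glove["tok2id"]["movie"]])    # round-trips back to "movie"
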
Example #10
parser.add_argument('--skip_checksum', action='store_true',
                    help='Skip checksum')
parser.add_argument('--skip_extract', action='store_true',
                    help='Skip extracting files')
args = parser.parse_args()
args.e = args.e or args.dest


df = pd.read_csv(args.csv, delimiter=',')


if not args.skip_download:
    for url in df.url:
        fname = url.split('/')[-1]
        print("Downloading %s:" % fname)
        download_file(url=url, dest_folder=args.dest, fname=fname)
else:
    print("Skipping file download")


if not args.skip_checksum:
    for index, row in df.iterrows():
        url = row['url']
        md5 = row['md5']
        fname = url.split('/')[-1]
        fpath = os.path.join(args.dest, fname)
        print("Verifing %s: " % fname, end='')
        ret = md5_checksum(fpath=fpath, target_hash=md5)
        print("Passed" if ret else "Failed")
else:
    print("Skipping checksum")