# Example no. 1
# 0
def test_h2o3_smalldata(f):
    """Read one "smalldata" file with ``dt.fread`` and verify the result.

    The test is skipped when ``f`` matches an entry of the ignore list
    below, or when ``is_ppc64()`` reports a PPC64 platform. Otherwise the
    file is read (with per-file overrides for a few known files) and the
    resulting frame is passed to ``frame_integrity_check``.

    :param f: path of the file to read; entries of the lists below are
        matched against it as substrings, so it is expected to be a string.
    """
    ignored_files = {
        # Zip files containing >1 files
        os.path.join("gbm_test", "bank-full.csv.zip"),
        os.path.join("jira", "pub-999.zip"),
        os.path.join("parser", "hexdev_497", "airlines_no_first_header.zip"),
        os.path.join("parser", "hexdev_497", "airlines_first_header.zip"),
        os.path.join("parser", "hexdev_497", "airlines_small_csv.zip"),
        os.path.join("prostate", "prostate.bin.csv.zip"),
        os.path.join("smalldata", "images", "cat_dog_tiny_thumbnails.zip"),
        # Others
        os.path.join("arff", "folder1", "iris0.csv"),
        os.path.join("jira", "pubdev_2897.csv"),
        os.path.join("jira", "runit_pubdev_3590_unexpected_column.csv"),
        os.path.join("junit", "iris.xls.zip"),
        os.path.join("junit", "test_parse_mix.csv"),
        os.path.join("junit", "arff", "jm1_arff.txt"),
        os.path.join("junit", "arff", "jm1.arff.txt"),
        os.path.join("merge", "livestock.nuts.csv"),
        os.path.join("merge", "tourism.csv"),
        os.path.join("parser", "column.csv"),
    }
    # pytest.skip() raises, so each skip below ends the test immediately;
    # no `return` or `else` branch is needed after it.
    if any(ff in f for ff in ignored_files):
        pytest.skip("On the ignored files list")
    if is_ppc64():
        # NOTE: an `nthreads` value of 8 used to be assigned right before
        # this skip, where it could never take effect. Reinstate it in
        # `params` below if these tests are ever re-enabled on PPC64.
        pytest.skip("Fread tests disabled on PPC64")

    # Per-file overrides for fread's defaults.
    params = {}
    if "test_pubdev3589" in f:
        params["sep"] = "\n"
    single_quoted_files = (
        "single_quotes_mixed.csv",
        "single_quotes_with_escaped_quotes.csv",
        "single_quotes_with_escaped_quotes_custom_escapechar.csv",
    )
    if any(name in f for name in single_quoted_files):
        params["quotechar"] = "'"

    # Warnings raised while reading are not what this test checks.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        DT = dt.fread(f, **params)
        frame_integrity_check(DT)
# Example no. 2
# 0
def test_h2o3_bigdata(f):
    """Read one "bigdata" file with ``dt.fread`` and verify the result.

    The test is skipped when ``f`` matches an entry of the ignore list
    below, or when ``is_ppc64()`` reports a PPC64 platform. Otherwise the
    file is read with ``memory_limit=MEMORY_LIMIT`` (plus ``fill=True`` for
    the files listed in ``filledna_files``) and the resulting frame is
    passed to ``frame_integrity_check``.

    :param f: path of the file to read; entries of the lists below are
        matched against it as substrings, so it is expected to be a string.
    """
    ignored_files = {
        # Feather files
        os.path.join("ipums_feather.gz"),
        # empty files
        os.path.join("mnist", "t10k-images-idx3-ubyte.gz"),
        os.path.join("mnist", "t10k-labels-idx1-ubyte.gz"),
        os.path.join("mnist", "train-images-idx3-ubyte.gz"),
        os.path.join("mnist", "train-labels-idx1-ubyte.gz"),
        # ARFF files
        os.path.join("parser", "anARFFFile.txt"),
        # zip files having more than 1 file inside
        os.path.join("flights-nyc", "delays14.csv.zip"),
        os.path.join("flights-nyc", "flights14.csv.zip"),
        os.path.join("flights-nyc", "weather_delays14.csv.zip"),
        os.path.join("images", "demo_disney_data.zip"),  # jpegs...
        os.path.join("jira", "la1s.wc.arff.txt.zip"),
        os.path.join("jira", "re0.wc.arff.txt.zip"),
        os.path.join("jira", "rotterdam.csv.zip"),
        os.path.join("parser", "hexdev_497", "milsongs_csv.zip"),
        os.path.join("glm", "GLM_model_python_1543520565753_1.zip"),
        os.path.join("glm", "GLM_model_python_1543520565753_3.zip"),
        os.path.join("glm", "GLM_model_python_1544561074878_1.zip"),
        # requires `comment` parameter
        os.path.join("new-poker-hand.full.311M.txt"),
        # files with 36M columns
        os.path.join("testng", "newsgroup_train1.csv"),
        os.path.join("testng", "newsgroup_validation1.csv"),
        # broken CRC zip files
        os.path.join("jira", "tenThousandCat50C.csv.zip"),
        os.path.join("jira", "tenThousandCat100C.csv.zip"),
        os.path.join("parser", "year2005.csv.gz"),
    }
    # Files that are read with `fill=True`.
    filledna_files = {
        os.path.join("lending-club", "LoanStats3a.csv"),
        os.path.join("lending-club", "LoanStats3b.csv"),
        os.path.join("lending-club", "LoanStats3c.csv"),
        os.path.join("lending-club", "LoanStats3d.csv"),
        os.path.join("LoanStats3a.csv"),
        os.path.join("LoanStats3b.csv"),
        os.path.join("LoanStats3c.csv"),
        os.path.join("LoanStats3d.csv"),
        os.path.join("Kaggle_Product_BO_Test_v2.csv.zip"),
        os.path.join("Kaggle_Product_BO_Training_v2.csv.zip"),
    }
    # pytest.skip() raises, so each skip below ends the test immediately;
    # no `return` is needed after it.
    if any(ff in f for ff in ignored_files):
        pytest.skip("On the ignored files list")
    if is_ppc64():
        # NOTE: an `nthreads` value of 8 used to be assigned right before
        # this skip, where it could never take effect. Reinstate it in
        # `params` below if these tests are ever re-enabled on PPC64.
        pytest.skip("Fread tests disabled on PPC64")

    params = {"memory_limit": MEMORY_LIMIT}
    if any(ff in f for ff in filledna_files):
        params["fill"] = True
    # Compare with separators normalised to "/" so the check also matches
    # paths that use the platform's native separator, like the
    # os.path.join-built patterns above do.
    if "imagenet/cat_dog_mouse.tgz" in f.replace(os.sep, "/"):
        # Point at a specific member under the archive path
        # (presumably the CSV stored inside the .tgz -- verify).
        f = os.path.join(f, "cat_dog_mouse.csv")

    # Warnings raised while reading are not what this test checks.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        DT = dt.fread(f, **params)
        frame_integrity_check(DT)