Code example #1
0
def test_subdir_reader_configurable_reader_method(basic_pandas_datasource,
                                                  tmp_path_factory):
    """A reader_method configured on the generator must surface in batch kwargs."""
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_subdir_reader_configurable_reader_method"))
    mock_files = [
        "20190101__asset_1.dat",
        "20190102__asset_1.dat",
        "20190103__asset_1.dat",
        "asset_2/20190101__asset_2.dat",
        "asset_2/20190102__asset_2.dat",
    ]
    for relative_path in mock_files:
        # Ensure any intermediate subdirectory exists before touching the file.
        parts = relative_path.split("/")
        if len(parts) > 1:
            os.makedirs(os.path.join(base_directory, parts[0]), exist_ok=True)
        with open(os.path.join(base_directory, relative_path), "w"):
            pass  # create an empty placeholder file

    # With files present, each should be visible as an individual asset.
    subdir_reader_generator = SubdirReaderBatchKwargsGenerator(
        "test_generator",
        datasource=basic_pandas_datasource,
        base_directory=base_directory,
        reader_method="csv",
        known_extensions=[".dat"],
    )
    first_batch = next(
        subdir_reader_generator.get_iterator(data_asset_name="asset_2"))
    assert first_batch["reader_method"] == "csv"
Code example #2
0
def test_file_kwargs_generator_extensions(tmp_path_factory):
    """csv, xls, parquet, json should be recognized file extensions"""
    basedir = str(
        tmp_path_factory.mktemp("test_file_kwargs_generator_extensions"))

    # Subdirectories the fixture files below are written into.
    os.mkdir(os.path.join(basedir, "f3"))
    os.mkdir(os.path.join(basedir, "f4"))

    # Each entry is a relative path; the inline notes state whether the
    # generator is expected to include the resulting asset.
    fixture_files = [
        "f1.blarg",                          # exclude: invalid extension
        "f2.csv",                            # include
        os.path.join("f3", "f3_1.blarg"),    # exclude: subdir w/o valid files
        os.path.join("f3", "f3_2.blarg"),    # exclude: subdir w/o valid files
        os.path.join("f4", "f4_1.csv"),      # include: subdir with valid files
        os.path.join("f4", "f4_2.csv"),      # include: subdir with valid files
        ".f5.csv",                           # exclude: dot-prefixed name
        "f6.tsv",                            # include: valid extension
        "f7.xls",                            # include: valid extension
        "f8.parquet",                        # include: valid extension
        "f9.xls",                            # include: valid extension
        "f0.json",                           # include: valid extension
    ]
    for relative_path in fixture_files:
        with open(os.path.join(basedir, relative_path), "w") as outfile:
            outfile.write("\n\n\n")

    g1 = SubdirReaderBatchKwargsGenerator(datasource="foo",
                                          base_directory=basedir)

    g1_assets = g1.get_available_data_asset_names()
    # Use set in test to avoid order issues
    assert set(g1_assets["names"]) == {
        ("f7", "file"),
        ("f4", "directory"),
        ("f6", "file"),
        ("f0", "file"),
        ("f2", "file"),
        ("f9", "file"),
        ("f8", "file"),
    }
Code example #3
0
def test_subdir_reader_configurable_reader_method(basic_pandas_datasource,
                                                  tmp_path_factory):
    """The generator's configured reader_method should appear in batch kwargs."""
    base_directory = str(
        tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "20190101__asset_1.dat", "20190102__asset_1.dat",
        "20190103__asset_1.dat", "asset_2/20190101__asset_2.dat",
        "asset_2/20190102__asset_2.dat"
    ]
    for relative_path in mock_files:
        # Files under a subdirectory need that directory created first.
        if "/" in relative_path:
            subdir = relative_path.split("/")[0]
            safe_mmkdir(os.path.join(base_directory, subdir))
        open(os.path.join(base_directory, relative_path), "w").close()

    # With files on disk, each one should surface as an individual asset.
    subdir_reader_generator = SubdirReaderBatchKwargsGenerator(
        "test_generator",
        datasource=basic_pandas_datasource,
        base_directory=base_directory,
        reader_method='csv',
        known_extensions=['.dat'])
    first_kwargs = next(subdir_reader_generator.get_iterator('asset_2'))
    assert first_kwargs['reader_method'] == 'csv'
Code example #4
0
def test_subdir_reader_file_partitioning(basic_pandas_datasource,
                                         tmp_path_factory):
    """Root-level files become assets whose partition id is the filename stem."""
    base_directory = str(
        tmp_path_factory.mktemp("test_subdir_reader_file_partitioning"))
    mock_files = [
        "20190101__asset_1.csv",
        "20190102__asset_1.csv",
        "20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv",
    ]
    for relative_path in mock_files:
        # Create the containing subdirectory when the path has one.
        pieces = relative_path.split("/")
        if len(pieces) > 1:
            os.makedirs(os.path.join(base_directory, pieces[0]), exist_ok=True)
        with open(os.path.join(base_directory, relative_path), "w"):
            pass  # empty placeholder file

    # With files on disk, each should be visible as an individual asset.
    subdir_reader_generator = SubdirReaderBatchKwargsGenerator(
        "test_generator",
        datasource=basic_pandas_datasource,
        base_directory=base_directory,
    )

    names_result = subdir_reader_generator.get_available_data_asset_names()
    assert set(names_result["names"]) == {
        ("20190101__asset_1", "file"),
        ("20190102__asset_1", "file"),
        ("20190103__asset_1", "file"),
        ("asset_2", "directory"),
    }

    # SubdirReaderBatchKwargsGenerator uses the filename as partition name for root files
    partition_ids = subdir_reader_generator.get_available_partition_ids(
        data_asset_name="20190101__asset_1")
    assert set(partition_ids) == {"20190101__asset_1"}

    expected_path = os.path.join(base_directory, "20190101__asset_1.csv")
    kwargs = subdir_reader_generator.build_batch_kwargs(
        data_asset_name="20190101__asset_1", partition_id="20190101__asset_1")
    assert kwargs["path"] == expected_path

    # A limit should also be accepted and translated to reader options.
    kwargs = subdir_reader_generator.build_batch_kwargs(
        data_asset_name="20190101__asset_1",
        partition_id="20190101__asset_1",
        limit=10)
    assert kwargs["path"] == expected_path
    assert kwargs["reader_options"]["nrows"] == 10
Code example #5
0
def test_subdir_reader_path_partitioning(basic_pandas_datasource,
                                         tmp_path_factory):
    """Subdirectory-based assets: each subdir is an asset, each contained file
    a partition, and iterating an unknown asset raises BatchKwargsError."""
    base_directory = str(
        tmp_path_factory.mktemp("test_subdir_reader_path_partitioning"))
    mock_files = [
        "asset_1/20190101__asset_1.csv",
        "asset_1/20190102__asset_1.csv",
        "asset_1/20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv",
    ]
    for file in mock_files:
        # Every mock file lives in a subdirectory; create it first.
        os.makedirs(os.path.join(base_directory,
                                 file.split("/")[0]),
                    exist_ok=True)
        open(os.path.join(base_directory, file), "w").close()

    subdir_reader_generator = SubdirReaderBatchKwargsGenerator(
        "test_generator",
        datasource=basic_pandas_datasource,
        base_directory=base_directory,
    )

    # We should see two assets
    known_assets = subdir_reader_generator.get_available_data_asset_names(
    )["names"]
    # Use set in test to avoid order issues
    assert set(known_assets) == {("asset_2", "directory"),
                                 ("asset_1", "directory")}

    # We should see three partitions for the first:
    known_partitions = subdir_reader_generator.get_available_partition_ids(
        data_asset_name="asset_1")
    assert set(known_partitions) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1",
    }

    asset_1_kwargs = list(
        subdir_reader_generator.get_iterator(data_asset_name="asset_1"))
    asset_2_kwargs = list(
        subdir_reader_generator.get_iterator(data_asset_name="asset_2"))
    with pytest.raises(BatchKwargsError):
        # Fix: consume the iterator without binding an unused local
        # (the previous `not_an_asset_kwargs = [...]` was never read).
        list(subdir_reader_generator.get_iterator(
            data_asset_name="not_an_asset"))

    assert len(asset_1_kwargs) == 3
    paths = [kwargs["path"] for kwargs in asset_1_kwargs]
    assert set(paths) == {
        os.path.join(base_directory, "asset_1/20190101__asset_1.csv"),
        os.path.join(base_directory, "asset_1/20190102__asset_1.csv"),
        os.path.join(base_directory, "asset_1/20190103__asset_1.csv"),
    }
    partitions = subdir_reader_generator.get_available_partition_ids(
        data_asset_name="asset_1")

    # SubdirReaderBatchKwargsGenerator uses filenames from subdirectories to generate partition names
    assert set(partitions) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1",
    }
    assert len(asset_1_kwargs[0].keys()) == 2

    assert len(asset_2_kwargs) == 2
    paths = [kwargs["path"] for kwargs in asset_2_kwargs]
    assert set(paths) == {
        os.path.join(base_directory, "asset_2/20190101__asset_2.csv"),
        os.path.join(base_directory, "asset_2/20190102__asset_2.csv"),
    }
    partitions = subdir_reader_generator.get_available_partition_ids(
        data_asset_name="asset_2")
    assert set(partitions) == {"20190101__asset_2", "20190102__asset_2"}
    assert len(asset_2_kwargs[0].keys()) == 2