Example no. 1
def test_cache(path, mocker):
    df = pd.DataFrame({"x": [1, 2, 3]})
    ds = DataSource(path("0_0.csv"), expire=timedelta(seconds=10))
    mtime = int(os.path.getmtime(path("0_0.csv")))
    cache = InMemoryCache()
    cache.set(ds.hash, value=df, mtime=mtime)

    assert ds.get_df().shape == (2, 2)  # without cache: read from disk
    assert ds.get_df(cache=cache).shape == (3, 1)  # retrieved from cache

    mock_time = mocker.patch("peakina.cache.time")
    mock_time.return_value = time.time() + 15  # fake 15s elapsed
    assert ds.get_df(cache=cache).shape == (2, 2)  # 15 > 10: cache expires: read from disk
    assert cache.get(ds.hash).shape == (2, 2)  # cache has been updated with the new data

    mock_time.reset_mock()
    cache.set(ds.hash, value=df, mtime=mtime)  # put back the fake df
    assert ds.get_df(cache=cache).shape == (3, 1)  # back to "retrieved from cache"
    # fake a file with a different mtime (e.g.: a new file has been uploaded):
    mocker.patch("peakina.io.local.file_fetcher.os.path.getmtime").return_value = mtime - 1
    assert ds.get_df(cache=cache).shape == (2, 2)  # cache has been invalidated
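
The test above exercises three cache behaviours: a hit on fresh data, expiry once the expire window has elapsed, and invalidation when the file's mtime changes. Below is a minimal non-test sketch of that flow, assuming the import paths shown (the mocked name peakina.cache.time only tells us the cache module is peakina.cache) and a placeholder file path.

# Minimal non-test sketch of the cache flow exercised above.
from datetime import timedelta

from peakina import DataSource            # assumed import path
from peakina.cache import InMemoryCache   # assumed import path

cache = InMemoryCache()
ds = DataSource("data/my_file.csv", expire=timedelta(minutes=5))  # placeholder path

df = ds.get_df(cache=cache)  # first call: reads the file and fills the cache
df = ds.get_df(cache=cache)  # later calls: served from the cache until the expire
                             # window elapses or the file's mtime changes
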
Example no. 2
def test_csv_with_sep(path):
    """It should be able to detect separator if not set"""
    ds = DataSource(path('0_0_sep.csv'))
    assert ds.get_df().shape == (2, 2)

    ds = DataSource(path('0_0_sep.csv'), extra_kwargs={'sep': ','})
    assert ds.get_df().shape == (2, 1)
Example no. 3
def test_s3(s3_endpoint_url):
    dirpath = "s3://accessKey1:verySecretKey1@mybucket"

    ds = DataSource(
        f"{dirpath}/0_0.csv",
        fetcher_kwargs={"client_kwargs": {
            "endpoint_url": s3_endpoint_url
        }},
    )
    assert ds.get_df().shape == (2, 2)

    ds = DataSource(
        f"{dirpath}/0_*.csv",
        match=MatchEnum.GLOB,
        fetcher_kwargs={"client_kwargs": {
            "endpoint_url": s3_endpoint_url
        }},
    )
    assert ds.get_df().shape == (4, 3)

    # With subdirectories
    ds = DataSource(
        f"{dirpath}/mydir/0_*.csv",
        match=MatchEnum.GLOB,
        fetcher_kwargs={"client_kwargs": {
            "endpoint_url": s3_endpoint_url
        }},
    )
    assert ds.get_df().shape == (4, 3)
Example no. 4
def test_csv_with_sep(path):
    """It should be able to detect separator if not set"""
    ds = DataSource(path("0_0_sep.csv"))
    assert ds.get_df().shape == (2, 2)

    ds = DataSource(path("0_0_sep.csv"), reader_kwargs={"sep": ","})
    assert ds.get_df().shape == (2, 1)
Example no. 5
def test_simple_csv(path):
    """It should be able to detect type if not set"""
    ds = DataSource(path('0_0.csv'), extra_kwargs={'encoding': 'utf8', 'sep': ','})
    assert ds.get_df().shape == (2, 2)

    with pytest.raises(Exception):
        DataSource(path('0_0.csv'), type='excel', encoding='utf8', sep=',').get_df()
Example no. 6
def test_chunk_match(path):
    """It should be able to retrieve a dataframe with chunks and match"""
    ds = DataSource(path('0_*.csv'), match='glob', extra_kwargs={'chunksize': 1})
    assert all(df.shape == (1, 3) for df in ds.get_dfs())
    df = ds.get_df()
    assert df.shape == (6, 3)
    assert '__filename__' in df.columns
Example no. 7
def test_basic_xml(path):
    """It should apply optional jq filter when extracting an xml datasource"""
    # No jq filter -> everything is in one cell
    assert DataSource(path("fixture.xml")).get_df().shape == (1, 1)

    jq_filter = ".records"
    ds = DataSource(path("fixture.xml"), reader_kwargs={"filter": jq_filter})
    assert ds.get_df().shape == (2, 1)

    jq_filter = '.records .record[] | .["@id"]|=tonumber'
    ds = DataSource(path("fixture.xml"), reader_kwargs={"filter": jq_filter})
    df = pd.DataFrame({
        "@id": [1, 2],
        "title": ["Keep on dancin'", "Small Talk"]
    })
    assert ds.get_df().equals(df)
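
The assertions above presuppose that the XML is first parsed into a dict-like structure and the jq filter is then applied to it. Here is a hand-rolled sketch of that transformation, assuming the xmltodict and jq packages (whether peakina uses exactly these libraries is not shown) and a hypothetical inline document shaped like fixture.xml.

import jq
import pandas as pd
import xmltodict

# Hypothetical XML with the shape fixture.xml must have for the test to pass.
xml = """
<records>
  <record id="1"><title>Keep on dancin'</title></record>
  <record id="2"><title>Small Talk</title></record>
</records>
"""

data = xmltodict.parse(xml)  # attributes become "@id", text nodes become values
rows = jq.compile('.records .record[] | .["@id"]|=tonumber').input(data).all()
df = pd.DataFrame(rows)      # columns "@id" (int) and "title" (str), as asserted above
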
Example no. 8
def test_match_different_file_types(path):
    """It should be able to match even different types, encodings or seps"""
    ds = DataSource(path("0_*"), match=MatchEnum.GLOB)
    df = ds.get_df()
    assert set(df["__filename__"]) == {
        "0_0.csv", "0_0_sep.csv", "0_1.csv", "0_2.xls"
    }
    assert df.shape == (8, 3)
Example no. 9
def test_simple_csv(path):
    """It should be able to detect type if not set"""
    ds = DataSource(path("0_0.csv"),
                    reader_kwargs={
                        "encoding": "utf8",
                        "sep": ","
                    })
    assert ds.get_df().shape == (2, 2)
Example no. 10
def test_chunk_match(path):
    """It should be able to retrieve a dataframe with chunks and match"""
    ds = DataSource(path("0_*.csv"),
                    match=MatchEnum.GLOB,
                    reader_kwargs={"chunksize": 1})
    assert all(df.shape == (1, 3) for df in ds.get_dfs())
    df = ds.get_df()
    assert df.shape == (6, 3)
    assert "__filename__" in df.columns
Example no. 11
def test_basic_excel(path):
    """It should not add a __sheet__ column when retrieving a single sheet"""
    ds = DataSource(path("fixture-multi-sheet.xlsx"))
    df = pd.DataFrame({"Month": [1], "Year": [2019]})
    assert ds.get_df().equals(df)
    assert ds.get_metadata() == {"sheetnames": ["January", "February"]}

    # On match datasources, no metadata is returned:
    assert DataSource(path("fixture-multi-sh*t.xlsx"),
                      match=MatchEnum.GLOB).get_metadata() == {}
Example no. 12
def test_multi_sheets_excel(path):
    """It should add a __sheet__ column when retrieving multiple sheet"""
    ds = DataSource(path("fixture-multi-sheet.xlsx"),
                    reader_kwargs={"sheet_name": None})
    df = pd.DataFrame({
        "Month": [1, 2],
        "Year": [2019, 2019],
        "__sheet__": ["January", "February"]
    })
    assert ds.get_df().equals(df)
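
The __sheet__ column asserted above mirrors the sheet names pandas returns when sheet_name=None. The sketch below rebuilds an equivalent frame with plain pandas to make that correspondence explicit; it illustrates the semantics only and is not a claim about how peakina assembles the frame internally.

import pandas as pd

# With sheet_name=None, pandas returns a dict of DataFrames keyed by sheet name.
sheets = pd.read_excel("fixture-multi-sheet.xlsx", sheet_name=None)

# Tagging each frame with its sheet name and concatenating yields the frame
# asserted above: one row per sheet plus a __sheet__ column.
df = pd.concat(
    [frame.assign(__sheet__=name) for name, frame in sheets.items()],
    ignore_index=True,
)
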
Example no. 13
def test_ftp(ftp_path):
    ds = DataSource(f"{ftp_path}/sales.csv")
    assert ds.get_df().shape == (208, 15)
Example no. 14
def test_multi_sheets_excel(path):
    """It should not add a __sheet__ column when retrieving a single sheet"""
    ds = DataSource(path('fixture-multi-sheet.xlsx'), extra_kwargs={'sheet_name': None})
    df = pd.DataFrame({'Month': [1, 2], 'Year': [2019, 2019], '__sheet__': ['January', 'February']})
    assert ds.get_df().equals(df)
Example no. 15
def test_chunk(path):
    """It should be able to retrieve a dataframe with chunks"""
    ds = DataSource(path("0_0.csv"), reader_kwargs={"chunksize": 1})
    assert all(df.shape == (1, 2) for df in ds.get_dfs())
    assert ds.get_df().shape == (2, 2)
Example no. 16
def test_chunk(path):
    """It should be able to retrieve a dataframe with chunks"""
    ds = DataSource(path('0_0.csv'), extra_kwargs={'chunksize': 1})
    assert all(df.shape == (1, 2) for df in ds.get_dfs())
    assert ds.get_df().shape == (2, 2)
Example no. 17
def test_ftp_match(ftp_path):
    ds = DataSource(f"{ftp_path}/my_data_\\d{{4}}\\.csv$",
                    match=MatchEnum.REGEX)
    assert ds.get_df().shape == (8, 3)
Example no. 18
def test_csv_with_sep_and_encoding(path):
    """It should be able to detect everything"""
    ds = DataSource(path('latin_1_sep.csv'))
    assert ds.get_df().shape == (2, 7)
Example no. 19
def test_basic_excel(path):
    """It should not add a __sheet__ column when retrieving a single sheet"""
    ds = DataSource(path('fixture-multi-sheet.xlsx'))
    df = pd.DataFrame({'Month': [1], 'Year': [2019]})
    assert ds.get_df().equals(df)
Example no. 20
def test_csv_with_sep_and_encoding(path):
    """It should be able to detect everything"""
    ds = DataSource(path("latin_1_sep.csv"))
    assert ds.get_df().shape == (2, 7)
Example no. 21
def test_ftp(ftp_path):
    ds = DataSource(f'{ftp_path}/sales.csv')
    assert ds.get_df().shape == (208, 15)
Example no. 22
def test_match_different_file_types(path):
    """It should be able to match even different types, encodings or seps"""
    ds = DataSource(path('0_*'), match='glob')
    df = ds.get_df()
    assert set(df['__filename__']) == {'0_0.csv', '0_0_sep.csv', '0_1.csv', '0_2.xls'}
    assert df.shape == (8, 3)
Example no. 23
def test_match(path):
    """It should be able to concat files matching a pattern"""
    ds = DataSource(path(r'0_\d.csv'), match='regex')
    df = ds.get_df()
    assert set(df['__filename__']) == {'0_0.csv', '0_1.csv'}
    assert df.shape == (4, 3)
Example no. 24
def test_match(path):
    """It should be able to concat files matching a pattern"""
    ds = DataSource(path(r"0_\d.csv"), match=MatchEnum.REGEX)
    df = ds.get_df()
    assert set(df["__filename__"]) == {"0_0.csv", "0_1.csv"}
    assert df.shape == (4, 3)
Example no. 25
def test_ftp_match(ftp_path):
    ds = DataSource(f'{ftp_path}/my_data_\\d{{4}}\\.csv$', match='regex')
    assert ds.get_df().shape == (8, 3)