Exemple #1
0
def test_from_pandas_datetime():
    """Smoke test: a frame with a pandas datetime column converts without raising."""
    raw_dates = ["2021-01-01", "2021-01-02"]
    frame = pd.DataFrame({"datetime": raw_dates, "foo": [1, 2]})
    # Parse the string column into pandas datetimes before handing it over.
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    pl.from_pandas(frame)
Exemple #2
0
def test_from_pandas_dataframe() -> None:
    """A pandas DataFrame keeps its shape; non-pandas input raises ValueError."""
    rows = [[1, 2, 3], [4, 5, 6]]
    source = pd.DataFrame(rows, columns=["a", "b", "c"])
    converted = pl.from_pandas(source)
    assert converted.shape == (2, 3)

    # A plain list is not a pandas object, so the conversion must be rejected.
    with pytest.raises(ValueError):
        pl.from_pandas([1, 2])  # type: ignore
Exemple #3
0
def test_from_pandas_nan_to_none() -> None:
    """Check how `nan_to_none` treats missing values in a pandas Series.

    The series is converted once with the default setting and once with
    ``nan_to_none=False``. The first element (2) is a real value, so only the
    trailing elements are inspected. Finally, a ``pd.NA`` entry combined with
    ``nan_to_none=False`` is expected to raise ``ArrowInvalid``.
    """
    from pyarrow import ArrowInvalid

    series = pd.Series([2, np.nan, None], name="pd")
    out_true = pl.from_pandas(series)
    out_false = pl.from_pandas(series, nan_to_none=False)
    series.loc[2] = pd.NA
    # Use all(...): asserting a bare non-empty list is always truthy and
    # would pass no matter what the conversion produced.
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(series, nan_to_none=False)
Exemple #4
0
def test_from_optional_not_available() -> None:
    """Converters raise ImportError when an availability flag is patched to False."""
    with patch("polars.convert._NUMPY_AVAILABLE", False), pytest.raises(ImportError):
        pl.from_numpy(np.array([[1, 2], [3, 4]]), columns=["a", "b"])

    with patch("polars.convert._PYARROW_AVAILABLE", False):
        # Both the arrow and the pandas entry points are exercised under this flag.
        with pytest.raises(ImportError):
            pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))
        with pytest.raises(ImportError):
            pl.from_pandas(pd.Series([1, 2, 3]))

    with patch("polars.convert._PANDAS_AVAILABLE", False), pytest.raises(ImportError):
        pl.from_pandas(pd.Series([1, 2, 3]))
Exemple #5
0
def test_from_pandas_nan_to_none() -> None:
    """Check how `nan_to_none` treats missing values for a DataFrame and a Series.

    Each input is converted once with the default setting and once with
    ``nan_to_none=False``. After a ``pd.NA`` entry is written into the pandas
    object, converting with ``nan_to_none=False`` is expected to raise
    ``ArrowInvalid``.
    """
    from pyarrow import ArrowInvalid

    frame = pd.DataFrame({
        "bools_nulls": [None, True, False],
        "int_nulls": [1, None, 3],
        "floats_nulls": [1.0, None, 3.0],
        "strings_nulls": ["foo", None, "ham"],
        "nulls": [None, np.nan, np.nan],
    })
    out_true = pl.from_pandas(frame)
    out_false = pl.from_pandas(frame, nan_to_none=False)
    frame.loc[2, "nulls"] = pd.NA
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(frame, nan_to_none=False)

    series = pd.Series([2, np.nan, None], name="pd")
    out_true = pl.from_pandas(series)
    out_false = pl.from_pandas(series, nan_to_none=False)
    series.loc[2] = pd.NA
    # Use all(...): asserting a bare non-empty list is always truthy and
    # would pass no matter what the conversion produced. The leading 2 is a
    # real value, so only the trailing elements are inspected.
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(series, nan_to_none=False)
Exemple #6
0
def test_from_pandas_nested_list() -> None:
    """Regression test: a column of variable-length string lists converts cleanly."""
    # this panicked in https://github.com/pola-rs/polars/issues/1615
    ids = [1, 2, 3, 4]
    labels = [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]
    converted = pl.from_pandas(pd.DataFrame({"a": ids, "b": labels}))
    assert converted.shape == (4, 2)
Exemple #7
0
def test_from_pandas_datetime():
    """Hour, minute and second survive a pandas -> polars conversion."""
    stamp = datetime.datetime(2021, 1, 1, 20, 20, 20, 20)
    frame = pd.Series([stamp, stamp]).to_frame("a")
    column = pl.from_pandas(frame)["a"]
    # Every time-of-day component of the fixture is 20.
    assert column.hour()[0] == 20
    assert column.minute()[0] == 20
    assert column.second()[0] == 20
def build_gene_annotation_df(pset_dict):
    """
    Build a de-duplicated table of gene ids with empty annotation columns.
    @param pset_dict: [`dict`] A nested dictionary containing all tables in the PSet
    @return: [`DataFrame`] A pandas table with columns gene_id, symbol,
        gene_seq_start and gene_seq_end
    """
    profiles = pset_dict['molecularProfiles']
    # One polars frame per molecular data type, built from its rowData table
    row_data_frames = [
        pl.from_pandas(profiles[data_type]['rowData'])
        for data_type in profiles
    ]
    # Keep only the feature ids and attach all-null columns needed later
    annotated = []
    for row_data in row_data_frames:
        features = row_data.select(['.features'])
        placeholder = [None] * len(features['.features'])
        features['symbol'] = pl.Series('symbol', placeholder, dtype=pl.Utf8)
        features['gene_seq_start'] = pl.Series(
            'gene_seq_start', placeholder, dtype=pl.Int64)
        features['gene_seq_end'] = pl.Series(
            'gene_seq_end', placeholder, dtype=pl.Int64)
        annotated.append(features)
    # Stack everything into one frame keyed by gene_id
    merged = pl.concat(annotated)
    merged = merged.rename({'.features': 'gene_id'})
    # Drop everything from the first '.' onward (the Ensembl gene version)
    gene_ids = merged['gene_id']
    merged['gene_id'] = gene_ids.apply(
        lambda gene_id: re.sub(r'\..*$', '', gene_id))
    return merged.drop_duplicates().to_pandas()
Exemple #9
0
def test_struct_to_pandas() -> None:
    """Nested dicts become a Struct column and round-trip back to an equal frame."""
    source = pd.DataFrame([{"a": {"b": {"c": 2}}}])
    converted = pl.from_pandas(source)

    first_dtype = converted.dtypes[0]
    assert isinstance(first_dtype, pl.datatypes.Struct)

    round_tripped = converted.to_pandas()
    assert round_tripped.equals(source)
Exemple #10
0
def test_from_empty_pandas() -> None:
    """An empty pandas frame keeps its column names and converts to Float64 columns."""
    empty = pd.DataFrame({"A": [], "fruits": []})

    converted = pl.from_pandas(empty)
    assert converted.columns == ["A", "fruits"]
    assert converted.dtypes == [pl.Float64, pl.Float64]
Exemple #11
0
def test_from_pandas_ns_resolution() -> None:
    """A timestamp carrying one nanosecond compares equal to the same datetime without it."""
    stamp = pd.Timestamp(
        year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)
    frame = pd.DataFrame([stamp], columns=["date"])
    converted = pl.from_pandas(frame)
    assert converted[0, 0] == datetime(2021, 1, 1, 1, 0, 1)
Exemple #12
0
def test_from_pandas_datetime() -> None:
    """Datetimes convert from a pandas frame, a date range, and a parsed string column."""
    stamp = datetime(2021, 1, 1, 20, 20, 20, 20)
    pd_series = pd.Series([stamp, stamp])
    column = pl.from_pandas(pd_series.to_frame("a"))["a"]
    # Every time-of-day component of the fixture is 20.
    assert column.dt.hour()[0] == 20
    assert column.dt.minute()[0] == 20
    assert column.dt.second()[0] == 20

    # Hourly range with the end point excluded: the last entry is 09:00.
    hourly = pd.date_range(
        "2021-06-24 00:00:00", "2021-06-24 10:00:00", freq="1H", closed="left"
    )
    converted = pl.from_pandas(hourly)
    assert converted[0] == datetime(2021, 6, 24, 0, 0)
    assert converted[-1] == datetime(2021, 6, 24, 9, 0)

    # Smoke check only: a parsed datetime column must convert without raising.
    frame = pd.DataFrame({"datetime": ["2021-01-01", "2021-01-02"], "foo": [1, 2]})
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    pl.from_pandas(frame)
Exemple #13
0
def test_cast_inner() -> None:
    """Casting a list column changes its inner dtype, for Python and polars type specs."""
    nested = pl.Series([[1, 2]])
    for inner in (bool, pl.Boolean):
        as_bool = nested.cast(pl.List(inner))
        assert as_bool.dtype == pl.List(pl.Boolean)
        assert as_bool.to_list() == [[True, True]]

    # this creates an inner null type
    empty_lists = pd.DataFrame(data=[[[]], [[]]], columns=["A"])
    converted = pl.from_pandas(empty_lists)
    recast = converted["A"].cast(pl.List(int))  # type: ignore[arg-type]
    assert recast.dtype.inner == pl.Int64  # type: ignore[attr-defined]
Exemple #14
0
def test_from_pandas_datetime():
    """Datetimes convert from pandas and dispatch to the `dt` namespace, eagerly and lazily."""
    stamp = datetime.datetime(2021, 1, 1, 20, 20, 20, 20)
    frame = pd.Series([stamp, stamp]).to_frame("a")
    column = pl.from_pandas(frame)["a"]
    # Every time-of-day component of the fixture is 20.
    assert column.dt.hour()[0] == 20
    assert column.dt.minute()[0] == 20
    assert column.dt.second()[0] == 20

    # Hourly range with the end point excluded.
    hourly = pd.date_range("2021-06-24 00:00:00",
                           "2021-06-24 10:00:00",
                           freq="1H",
                           closed="left")
    converted = pl.from_pandas(hourly)
    # Values compare as integers; the two differ by nine hours in milliseconds.
    assert converted[0] == 1624492800000
    assert converted[-1] == 1624525200000
    # checks dispatch
    converted.dt.round("hour", 2)
    converted.dt.round("day", 5)

    # checks lazy dispatch
    wrapped = pl.DataFrame([converted.rename("foo")])
    wrapped[pl.col("foo").dt.round("hour", 2)]
Exemple #15
0
def test_join_dates():
    """Smoke test: a frame can be self-joined on a Date64 column."""
    hourly = pd.date_range("2021-06-24 00:00:00",
                           "2021-06-24 10:00:00",
                           freq="1H",
                           closed="left")
    # Shift every timestamp by a random integer offset, then cast back to Date64.
    jittered = pl.from_pandas(hourly).apply(
        lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))
    dts = jittered.cast(pl.Date64)

    # some df with sensor id, (randomish) datetime and some value
    readings = pl.DataFrame({
        "sensor": ["a"] * 5 + ["b"] * 5,
        "datetime": dts,
        "value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],
    })
    readings.join(readings, on="datetime")
Exemple #16
0
def test_from_pandas():
    """A frame mixing bool, int, float, string and categorical columns keeps its shape."""
    columns = {
        "bools": [False, True, False],
        "bools_nulls": [None, True, False],
        "int": [1, 2, 3],
        "int_nulls": [1, None, 3],
        "floats": [1.0, 2.0, 3.0],
        "floats_nulls": [1.0, None, 3.0],
        "strings": ["foo", "bar", "ham"],
        "strings_nulls": ["foo", None, "ham"],
        "strings-cat": ["foo", "bar", "ham"],
    }
    frame = pd.DataFrame(columns)
    # Turn the last column into a pandas categorical before converting.
    frame["strings-cat"] = frame["strings-cat"].astype("category")

    converted = pl.from_pandas(frame)
    assert converted.shape == (3, 9)
Exemple #17
0
def test_struct_logical_types_to_pandas() -> None:
    """A dict holding a datetime converts to a single Struct column."""
    record = {"struct": {"timestamp": datetime(2022, 1, 1)}}
    frame = pd.DataFrame([record])
    converted = pl.from_pandas(frame)
    assert converted.dtypes == [pl.Struct]
Exemple #18
0
def test_from_null_column() -> None:
    """A frame holding only pd.NA values converts to two rows and one column."""
    all_missing = pd.DataFrame(data=[pd.NA, pd.NA])
    converted = pl.from_pandas(all_missing)
    assert converted.shape == (2, 1)
Exemple #19
0
def test_from_empty_pandas_strings() -> None:
    """Empty columns cast to str and float convert to Utf8 and Float64."""
    empty = pd.DataFrame(columns=["a", "b"])
    # Give each empty column an explicit pandas dtype before converting.
    for name, target in (("a", str), ("b", float)):
        empty[name] = empty[name].astype(target)
    converted = pl.from_pandas(empty)
    assert converted.dtypes == [pl.Utf8, pl.Float64]
Exemple #20
0
def test_from_pandas_series() -> None:
    """A three-element pandas Series converts to an object of shape (3,)."""
    source = pd.Series([1, 2, 3], name="pd")
    converted = pl.from_pandas(source)
    assert converted.shape == (3, )
Exemple #21
0
def test_from_pandas_categorical_none() -> None:
    """A categorical Series with a pd.NA entry converts to Categorical with a None value."""
    categories = pd.Series(["a", "b", "c", pd.NA], dtype="category")
    converted = pl.from_pandas(categories)
    assert converted.dtype == pl.Categorical
    assert converted.to_list() == ["a", "b", "c", None]