Example #1
0
def test_isnull(data):
    """Check that Modin's top-level ``isnull``/``isna`` agree with pandas.

    ``data`` is whatever the test harness supplies for building a DataFrame
    (presumably a fixture or parametrized input; verify against the caller).
    Three comparisons are made: on a DataFrame built from ``data``, on a small
    Series containing a NaN, and on the scalar ``np.nan``.
    """
    # DataFrame input: build the expected (pandas) and actual (Modin) masks.
    expected_frame_mask = pandas.isnull(pandas.DataFrame(data))
    actual_frame_mask = pd.isnull(pd.DataFrame(data))
    df_equals(actual_frame_mask, expected_frame_mask)

    # Series input with a missing value in the middle.
    actual_series_mask = pd.isnull(pd.Series([1, np.nan, 2]))
    expected_series_mask = pandas.isnull(pandas.Series([1, np.nan, 2]))
    df_equals(actual_series_mask, expected_series_mask)

    # Scalar input: both libraries must give the same answer for NaN.
    assert pd.isna(np.nan) == pandas.isna(np.nan)
# %%
# Select the Modin execution engine through the environment; note that this
# assignment runs before `modin.pandas` is imported a few lines below.
import os
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import ray
#ray.init(memory=5242880000)
import modin.pandas as ray_pd
# Time loading the Kiva loans CSV through Modin's read_csv.
# `%time` is an IPython magic, so this line only runs inside IPython/Jupyter.
%time  mray_df = ray_pd.read_csv("data/kiva_loans.csv")

# %%
# Time appending each frame to itself, once per backend.
# pandas_df and mdask_df are created outside the lines visible here
# (mdask_df is presumably the Modin-on-Dask frame; verify against its cell).
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; confirm the pinned pandas/Modin versions still provide it.
%time df1 = pandas_df.append(pandas_df)
%time df2 = mdask_df.append(mdask_df)
%time df3 = mray_df.append(mray_df)

# %%
# Time concatenating five references to the same frame, once per backend.
# pandas_pd and dask_pd are module aliases imported outside the lines visible
# here; the results overwrite df1/df2/df3 from the append cell.
%time df1 = pandas_pd.concat([pandas_df for _ in range(5)])
%time df2 = dask_pd.concat([mdask_df for _ in range(5)])
%time df3 = ray_pd.concat([mray_df for _ in range(5)])

# %%
# Time the top-level isnull() on the "use" column, once per backend.
# The results are not stored; only the timing output matters here.
%time pandas_pd.isnull(pandas_df["use"])
%time dask_pd.isnull(mdask_df["use"])
%time ray_pd.isnull(mray_df["use"])

# %%
# Time filling missing values with 0 across each whole frame, once per backend.
# The filled frames are not assigned, so the source frames are left as they were.
%time pandas_df.fillna(value=0)
%time mdask_df.fillna(value=0)
%time mray_df.fillna(value=0)

# %%
Example #3
0
# Label the y-axis of the plot object `g` that was built before this point.
g = g.set_ylabels("Survival Probability")

# Draw the same bar plot of "Survived" against each of the "SmallF", "MedF"
# and "LargeF" columns, relabelling the y-axis after every plot.
for _family_col in ("SmallF", "MedF", "LargeF"):
    g = sns.factorplot(x=_family_col, y="Survived", data=dataset, kind="bar")
    g = g.set_ylabels("Survival Probability")

# Expand "Title" into dummy columns (default prefix), then "Embarked" into
# dummy columns prefixed with "Em".
dataset = pd.get_dummies(
    pd.get_dummies(dataset, columns=["Title"]),
    columns=["Embarked"],
    prefix="Em",
)
dataset.head()
# Quick look at the raw "Cabin" column. These expressions only display
# something when run interactively (e.g. as the last line of a notebook cell).
dataset["Cabin"].head()
dataset["Cabin"].describe()
dataset["Cabin"].isnull().sum()
dataset["Cabin"][dataset["Cabin"].notnull()].head()

# Reduce every cabin value to its first character; missing cabins become "X".
# A plain list is assigned rather than a freshly built pd.Series: a new Series
# carries a 0..n-1 index and column assignment aligns on index labels, which
# would write NaN into rows whenever `dataset` has a non-default index.
# List assignment is positional, so it is correct for any index.
dataset["Cabin"] = [
    "X" if pd.isnull(cabin) else cabin[0] for cabin in dataset["Cabin"]
]

# Category order shared by both Cabin plots below.
cabin_order = ["A", "B", "C", "D", "E", "F", "G", "T", "X"]

# Count of rows per cabin letter. The vector is passed as `x=` explicitly
# because positional data arguments are deprecated/reinterpreted in newer
# seaborn releases.
g = sns.countplot(x=dataset["Cabin"], order=cabin_order)

# Bar plot of "Survived" per cabin letter.
# NOTE(review): seaborn renamed factorplot to catplot in 0.9 and later removed
# factorplot; confirm the pinned seaborn version before upgrading.
g = sns.factorplot(
    y="Survived",
    x="Cabin",
    data=dataset,
    kind="bar",
    order=cabin_order,
)
g = g.set_ylabels("Survival Probability")

# Expand the cabin letter into dummy columns prefixed with "Cabin".
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix="Cabin")
dataset["Ticket"].head()
Ticket = []
for i in list(dataset.Ticket):
    if not i.isdigit():