def test_isnull(data):
    """Check that ``pd.isnull`` / ``pd.isna`` give the same answers as pandas.

    ``pd`` is the namespace under test and ``pandas`` is the reference.
    Three kinds of input are compared: a DataFrame built from ``data``, a
    short Series with a NaN in it, and a bare NaN scalar.
    """
    # DataFrame input: feed the same raw data through both libraries.
    expected = pandas.isnull(pandas.DataFrame(data))
    actual = pd.isnull(pd.DataFrame(data))
    df_equals(actual, expected)

    # Series input with a missing value in the middle.
    values = [1, np.nan, 2]
    actual = pd.isnull(pd.Series(values))
    expected = pandas.isnull(pandas.Series(values))
    df_equals(actual, expected)

    # Scalar input: both libraries must report NaN as missing.
    assert pd.isna(np.nan) == pandas.isna(np.nan)
# %% import os os.environ["MODIN_ENGINE"] = "ray" # Modin will use Ray import ray #ray.init(memory=5242880000) import modin.pandas as ray_pd %time mray_df = ray_pd.read_csv("data/kiva_loans.csv") # %% %time df1 = pandas_df.append(pandas_df) %time df2 = mdask_df.append(mdask_df) %time df3 = mray_df.append(mray_df) # %% %time df1 = pandas_pd.concat([pandas_df for _ in range(5)]) %time df2 = dask_pd.concat([mdask_df for _ in range(5)]) %time df3 = ray_pd.concat([mray_df for _ in range(5)]) # %% %time pandas_pd.isnull(pandas_df["use"]) %time dask_pd.isnull(mdask_df["use"]) %time ray_pd.isnull(mray_df["use"]) # %% %time pandas_df.fillna(value=0) %time mdask_df.fillna(value=0) %time mray_df.fillna(value=0) # %%
# Notebook-style script fragment: plots survival against several columns of
# `dataset`, one-hot encodes categorical columns, and starts processing the
# Ticket column. `g`, `dataset`, `sns` and `pd` are defined before this point.
# Bare expressions such as `dataset.head()` only display output in a notebook;
# run as a plain script they compute a value and discard it.

# Label the y-axis of the plot created just before this fragment.
g = g.set_ylabels("Survival Probability")

# One bar plot of Survived per indicator column (SmallF, MedF, LargeF).
# NOTE(review): sns.factorplot was renamed to sns.catplot in seaborn 0.9 and
# later removed -- confirm the seaborn version this script is pinned to.
g = sns.factorplot(x="SmallF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="MedF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="LargeF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")

# One-hot encode Title (default prefix) and Embarked (columns prefixed "Em").
dataset = pd.get_dummies(dataset, columns=["Title"])
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")
dataset.head()

# Inspect the Cabin column: sample, summary, missing count, non-missing sample.
dataset["Cabin"].head()
dataset["Cabin"].describe()
dataset["Cabin"].isnull().sum()
dataset["Cabin"][dataset["Cabin"].notnull()].head()

# Reduce each cabin value to its first character; missing cabins become "X".
# NOTE(review): assigning a fresh pd.Series aligns on index labels, so this
# assumes `dataset` has a default 0..n-1 index -- TODO confirm upstream.
dataset["Cabin"] = pd.Series(
    [i[0] if not pd.isnull(i) else "X" for i in dataset["Cabin"]])

# Count plot of the cabin letters in a fixed order.
# NOTE(review): the Series is passed positionally; newer seaborn releases
# expect keyword arguments (x=/data=) -- confirm against the installed version.
g = sns.countplot(dataset["Cabin"], order=["A", "B", "C", "D", "E", "F", "G", "T", "X"])

# Survival bar plot per cabin letter, using the same ordering as above.
g = sns.factorplot(
    y="Survived", x="Cabin", data=dataset, kind="bar",
    order=["A", "B", "C", "D", "E", "F", "G", "T", "X"],
)
g = g.set_ylabels("Survival Probability")

# One-hot encode the cabin letter (columns prefixed "Cabin").
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix="Cabin")

dataset["Ticket"].head()

# Walk over the ticket values; the branch below handles tickets that are not
# made up purely of digits.
Ticket = []
for i in list(dataset.Ticket):
    if not i.isdigit():