Exemple #1
0
    def test_apply_with_new_dataframe(self):
        """Compare groupby-apply with pandas when the function returns a new frame.

        The comparison runs on a 5-row frame first, then on a 1500-row frame
        while 'compute.shortcut_limit' is set to 1000. The option is reset
        afterwards, whether or not the assertions pass.
        """
        timestamps = [0.0, 0.5, 1.0, 0.0, 0.5]
        car_ids = ['A', 'A', 'A', 'B', 'B']

        pdf = pd.DataFrame({"timestamp": timestamps, "car_id": car_ids})
        kdf = koalas.DataFrame(pdf)

        actual = kdf.groupby('car_id').apply(
            lambda _: pd.DataFrame({"column": [0.0]})).sort_index()
        expected = pdf.groupby('car_id').apply(
            lambda _: pd.DataFrame({"column": [0.0]})).sort_index()
        self.assert_eq(actual, expected)

        set_option('compute.shortcut_limit', 1000)
        try:
            # Original note: with 1000+ records only the schema is inferred.
            pdf = pd.DataFrame({
                "timestamp": timestamps * 300,
                "car_id": car_ids * 300,
            })
            kdf = koalas.DataFrame(pdf)

            actual = kdf.groupby('car_id').apply(
                lambda _: pd.DataFrame({"column": [0.0]})).sort_index()
            expected = pdf.groupby('car_id').apply(
                lambda _: pd.DataFrame({"column": [0.0]})).sort_index()
            self.assert_eq(actual, expected)
        finally:
            reset_option('compute.shortcut_limit')
Exemple #2
0
    def test_axis_on_dataframe(self):
        """Compare row-wise (axis=1) statistics between Koalas and pandas.

        The frame holds 1500 rows and 'compute.shortcut_limit' is set to 1000
        for the duration of the test, then reset.
        """
        # The row count is deliberately large: per the original note, data
        # smaller than 'compute.shortcut_limit' takes a shortcut that works on
        # the collected pandas dataframe directly. The limit is therefore
        # pinned to 1000 explicitly.
        set_option('compute.shortcut_limit', 1000)

        row_wise_stats = (
            'count', 'var', 'std', 'max', 'min', 'sum', 'kurtosis', 'skew', 'mean',
        )

        try:
            repeats = 300
            pdf = pd.DataFrame({
                'A': [1, -2, 3, -4, 5] * repeats,
                'B': [1., -2, 3, -4, 5] * repeats,
                'C': [-6., -7, -8, -9, 10] * repeats,
                'D': [True, False, True, False, False] * repeats,
            })
            kdf = ks.from_pandas(pdf)

            # Same statistics, in the same order, as separate assertions would run.
            for stat in row_wise_stats:
                self.assert_eq(getattr(kdf, stat)(axis=1), getattr(pdf, stat)(axis=1))
        finally:
            reset_option('compute.shortcut_limit')
Exemple #3
0
 def test_sampled_plot_with_ratio(self):
     """Check that SampledPlot keeps about half the rows at a 0.5 sample ratio.

     'plotting.sample_ratio' is put back to
     DataFramePlotTest.sample_ratio_default once the check is done.
     """
     n_rows = 2500
     set_option('plotting.sample_ratio', 0.5)
     try:
         pdf = pd.DataFrame(np.random.rand(n_rows, 4), columns=list('abcd'))
         kdf = koalas.from_pandas(pdf)
         sampled = SampledPlot().get_sampled(kdf)
         # Rounded to one decimal, so the random sample only has to be close to half.
         observed_ratio = round(len(sampled) / n_rows, 1)
         self.assertEqual(observed_ratio, 0.5)
     finally:
         set_option('plotting.sample_ratio', DataFramePlotTest.sample_ratio_default)
Exemple #4
0
    def test_transform(self):
        """Compare groupby-transform with pandas for flat and multi-index columns.

        The checks run on a 6-row frame, then on an 1800-row frame while
        'compute.shortcut_limit' is set to 1000 (reset afterwards). The large
        run also verifies that a non-callable argument raises TypeError.
        """
        column_order = ['a', 'b', 'c']
        values = {'a': [1, 2, 3, 4, 5, 6],
                  'b': [1, 1, 2, 3, 5, 8],
                  'c': [1, 4, 9, 16, 25, 36]}

        def check_flat(kdf, pdf):
            # Frame-level transforms grouped by one key, then by two keys.
            for by, func in (("b", lambda x: x + 1), (['a', 'b'], lambda x: x * x)):
                self.assert_eq(kdf.groupby(by).transform(func).sort_index(),
                               pdf.groupby(by).transform(func).sort_index())
            # Column-selected transform.
            self.assert_eq(kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(),
                           pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index())

        def check_multi(kdf, pdf):
            # multi-index columns
            columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')])
            pdf.columns = columns
            kdf.columns = columns

            single_key = ("x", "b")
            key_list = [('x', 'a'), ('x', 'b')]
            self.assert_eq(kdf.groupby(single_key).transform(lambda x: x + 1).sort_index(),
                           pdf.groupby(single_key).transform(lambda x: x + 1).sort_index())
            self.assert_eq(kdf.groupby(key_list).transform(lambda x: x * x).sort_index(),
                           pdf.groupby(key_list).transform(lambda x: x * x).sort_index())

        pdf = pd.DataFrame(values, columns=column_order)
        kdf = ks.from_pandas(pdf)
        check_flat(kdf, pdf)
        check_multi(kdf, pdf)

        set_option('compute.shortcut_limit', 1000)
        try:
            pdf = pd.DataFrame({name: column * 300 for name, column in values.items()},
                               columns=column_order)
            kdf = ks.from_pandas(pdf)
            check_flat(kdf, pdf)
            with self.assertRaisesRegex(TypeError, "<class 'int'> object is not callable"):
                kdf.groupby("b").transform(1)
            check_multi(kdf, pdf)
        finally:
            reset_option('compute.shortcut_limit')
Exemple #5
0
    def test_html_repr(self):
        """Check the HTML repr at, and one row past, the display limit.

        Finally, with 'display.max_rows' set to None, the HTML must match
        pandas again. The option is put back to ReprTests.max_display_count
        afterwards.
        """
        limit = ReprTests.max_display_count
        banner = "Showing only the first"

        # Exactly at the limit: no banner, and identical to the pandas HTML.
        kdf = ks.range(limit)
        self.assertNotIn(banner, kdf._repr_html_())
        self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())

        # One row over the limit: the banner must appear.
        kdf = ks.range(limit + 1)
        self.assertIn(banner, kdf._repr_html_())

        set_option("display.max_rows", None)
        try:
            kdf = ks.range(limit + 1)
            koalas_html = kdf._repr_html_()
            pandas_html = kdf.to_pandas()._repr_html_()
            self.assertEqual(koalas_html, pandas_html)
        finally:
            set_option("display.max_rows", ReprTests.max_display_count)
Exemple #6
0
    def test_repr_series(self):
        """Check the repr of the `id` series at, and one row past, the display limit.

        With 'display.max_rows' set to None the repr must match pandas even
        past the limit. The option is put back to ReprTests.max_display_count
        afterwards.
        """
        limit = ReprTests.max_display_count
        banner = "Showing only the first"

        # Exactly at the limit: no banner, same text as pandas.
        kser = ks.range(limit).id
        self.assertNotIn(banner, repr(kser))
        self.assert_eq(repr(kser), repr(kser.to_pandas()))

        # One row over the limit: the banner must appear.
        kser = ks.range(limit + 1).id
        self.assertIn(banner, repr(kser))

        set_option("display.max_rows", None)
        try:
            kser = ks.range(limit + 1).id
            koalas_text = repr(kser)
            pandas_text = repr(kser.to_pandas())
            self.assert_eq(koalas_text, pandas_text)
        finally:
            set_option("display.max_rows", ReprTests.max_display_count)
Exemple #7
0
    def test_repr_dataframe(self):
        """Check the DataFrame repr at, and one row past, the display limit.

        With 'display.max_rows' set to None the repr must match pandas even
        past the limit. The option is put back to ReprTests.max_display_count
        afterwards.
        """
        limit = ReprTests.max_display_count
        banner = "Showing only the first"

        # Exactly at the limit: no banner, same text as pandas.
        kdf = ks.range(limit)
        self.assertNotIn(banner, repr(kdf))
        self.assert_eq(repr(kdf), repr(kdf.to_pandas()))

        # One row over the limit: the banner must appear.
        kdf = ks.range(limit + 1)
        self.assertIn(banner, repr(kdf))

        set_option("display.max_rows", None)
        try:
            kdf = ks.range(limit + 1)
            koalas_text = repr(kdf)
            pandas_text = repr(kdf.to_pandas())
            self.assert_eq(koalas_text, pandas_text)
        finally:
            set_option("display.max_rows", ReprTests.max_display_count)
Exemple #8
0
    def test_apply(self):
        """Compare groupby-apply with pandas on a small and a large frame.

        The 1800-row frame is checked while 'compute.shortcut_limit' is set
        to 1000 (reset afterwards). That run also verifies that a non-callable
        argument raises TypeError.
        """
        column_order = ['a', 'b', 'c']
        values = {
            'a': [1, 2, 3, 4, 5, 6],
            'b': [1, 1, 2, 3, 5, 8],
            'c': [1, 4, 9, 16, 25, 36],
        }

        def compare(kdf, pdf):
            # Group by one key.
            actual = kdf.groupby("b").apply(lambda x: x + 1).sort_index()
            expected = pdf.groupby("b").apply(lambda x: x + 1).sort_index()
            self.assert_eq(actual, expected)

            # Group by two keys.
            actual = kdf.groupby(['a', 'b']).apply(lambda x: x * x).sort_index()
            expected = pdf.groupby(['a', 'b']).apply(lambda x: x * x).sort_index()
            self.assert_eq(actual, expected)

            # Column-selected apply.
            actual = kdf.groupby(['b'])['a'].apply(lambda x: x).sort_index()
            expected = pdf.groupby(['b'])['a'].apply(lambda x: x).sort_index()
            self.assert_eq(actual, expected)

        pdf = pd.DataFrame(values, columns=column_order)
        kdf = koalas.DataFrame(pdf)
        compare(kdf, pdf)

        # Per the original note: data smaller than 'compute.shortcut_limit'
        # takes a shortcut that uses the collected pandas dataframe directly,
        # so the limit is pinned to 1000 explicitly here.
        set_option('compute.shortcut_limit', 1000)
        try:
            pdf = pd.DataFrame(
                {name: column * 300 for name, column in values.items()},
                columns=column_order)
            kdf = koalas.DataFrame(pdf)
            compare(kdf, pdf)
            with self.assertRaisesRegex(
                    TypeError, "<class 'int'> object is not callable"):
                kdf.groupby("b").apply(1)
        finally:
            reset_option('compute.shortcut_limit')
Exemple #9
0
    def test_repr_indexes(self):
        """Check the index repr at, and one row past, the display limit.

        With 'display.max_rows' set to None the repr must match pandas even
        past the limit. The option is put back to ReprTests.max_display_count
        afterwards.
        """
        limit = ReprTests.max_display_count
        banner = "Showing only the first"

        # Exactly at the limit: no banner, same text as pandas.
        kidx = ks.range(limit).index
        self.assertNotIn(banner, repr(kidx))
        self.assert_eq(repr(kidx), repr(kidx.to_pandas()))

        # One row over the limit: the banner must appear.
        kidx = ks.range(limit + 1).index
        self.assertIn(banner, repr(kidx))

        set_option("display.max_rows", None)
        try:
            kidx = ks.range(limit + 1).index
            koalas_text = repr(kidx)
            pandas_text = repr(kidx.to_pandas())
            self.assert_eq(koalas_text, pandas_text)
        finally:
            set_option("display.max_rows", ReprTests.max_display_count)
Exemple #10
0
    def test_transform(self):
        """Compare groupby-transform with pandas on a small and a large frame.

        The 1800-row frame is checked while 'compute.shortcut_limit' is set
        to 1000 (reset afterwards). That run also verifies that a non-callable
        argument raises TypeError.
        """
        column_order = ['a', 'b', 'c']
        values = {
            'a': [1, 2, 3, 4, 5, 6],
            'b': [1, 1, 2, 3, 5, 8],
            'c': [1, 4, 9, 16, 25, 36],
        }
        # Each entry runs one groupby/transform on whichever frame it is given,
        # so the identical expression is applied to Koalas and pandas.
        scenarios = (
            lambda frame: frame.groupby("b").transform(lambda x: x + 1),
            lambda frame: frame.groupby(['a', 'b']).transform(lambda x: x * x),
            lambda frame: frame.groupby(['b'])['a'].transform(lambda x: x),
        )

        pdf = pd.DataFrame(values, columns=column_order)
        kdf = koalas.DataFrame(pdf)
        for run in scenarios:
            self.assert_eq(run(kdf).sort_index(), run(pdf).sort_index())

        set_option('compute.shortcut_limit', 1000)
        try:
            pdf = pd.DataFrame(
                {name: column * 300 for name, column in values.items()},
                columns=column_order)
            kdf = koalas.DataFrame(pdf)
            for run in scenarios:
                self.assert_eq(run(kdf).sort_index(), run(pdf).sort_index())
            with self.assertRaisesRegex(
                    TypeError, "<class 'int'> object is not callable"):
                kdf.groupby("b").transform(1)
        finally:
            reset_option('compute.shortcut_limit')
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then configure plotting for matplotlib.

     Selects "matplotlib" as the plotting backend (also on the pandas side
     when pandas is 0.25 or newer), sets 'plotting.max_rows' to 2000 and
     sets 'plotting.sample_ratio' to None.
     """
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super().setUpClass()
     # The pandas-side backend option is only set for pandas 0.25 and newer.
     if LooseVersion(pd.__version__) >= LooseVersion("0.25"):
         pd.set_option("plotting.backend", "matplotlib")
     set_option("plotting.backend", "matplotlib")
     set_option("plotting.max_rows", 2000)
     set_option("plotting.sample_ratio", None)
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then configure plotting for plotly.

     Selects "plotly" as the plotting backend for both pandas and the
     option store behind `set_option`, sets 'plotting.max_rows' to 1000 and
     sets 'plotting.sample_ratio' to None.
     """
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super().setUpClass()
     pd.set_option("plotting.backend", "plotly")
     set_option("plotting.backend", "plotly")
     set_option("plotting.max_rows", 1000)
     set_option("plotting.sample_ratio", None)
Exemple #13
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'plotting.max_rows' to 2000."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(DataFramePlotTest, cls).setUpClass()
     set_option('plotting.max_rows', 2000)
# Load the model logged under this run.
model = load_model("runs:/{run_id}/model".format(run_id=run_info.run_uuid))

# Prediction and score
df = ks.DataFrame(X_test)
df["prediction"] = model.predict(df)

stop = datetime.now()

# total_seconds() covers the whole elapsed time; timedelta.seconds alone
# silently drops whole days. int() keeps the printed value an integer.
print("Temps préparation et inférence (ML) : ", int((stop - start).total_seconds()), "s")

# %%
##### 7th change: the score therefore has to be recomputed by hand

from databricks.koalas.config import set_option, reset_option

# The concat and arithmetic below combine separately created Koalas objects,
# so this option is switched on first.
set_option("compute.ops_on_diff_frames", True)

# Score : The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum()

# Observed values as a one-column frame named 'Reel', placed beside the predictions.
reel = ks.Series(y_test).to_frame().rename(columns={0: 'Reel'})
result = ks.concat([df, reel], axis=1)

# u: residual sum of squares.
result['square_diff_true_pred'] = (result['Reel'] - result['prediction'])**2
u = result['square_diff_true_pred'].sum()
# v: total sum of squares around the mean of the observed values.
v = ((result['Reel'] - result['Reel'].mean())**2).sum()

score = (1 - u / v)
print(f"score: {score}")

# %% [markdown]
# ## Entraînement et inférence avec Pipeline
Exemple #15
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'compute.ops_on_diff_frames' to False."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(OpsOnDiffFramesDisabledTest, cls).setUpClass()
     set_option('compute.ops_on_diff_frames', False)
Exemple #16
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'compute.default_index_type' to 'distributed'."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(DistributedDefaultIndexTest, cls).setUpClass()
     set_option('compute.default_index_type', 'distributed')
Exemple #17
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'compute.default_index_type' to 'distributed-sequence'."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(DistributedOneByOneDefaultIndexTest, cls).setUpClass()
     set_option('compute.default_index_type', 'distributed-sequence')
Exemple #18
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'display.max_rows' to ReprTest.max_display_count."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super().setUpClass()
     set_option("display.max_rows", ReprTest.max_display_count)
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'compute.ops_on_diff_frames' to True."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super().setUpClass()
     set_option("compute.ops_on_diff_frames", True)
Exemple #20
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then configure the plotting options.

     Sets 'plotting.max_rows' to 2000 and 'plotting.sample_ratio' to None.
     """
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(DataFramePlotTest, cls).setUpClass()
     set_option("plotting.max_rows", 2000)
     set_option("plotting.sample_ratio", None)
Exemple #21
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'plotting.max_rows' to 1000."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(SeriesPlotTest, cls).setUpClass()
     set_option('plotting.max_rows', 1000)
Exemple #22
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then configure the plotting options.

     Sets 'plotting.max_rows' to 2000 and 'plotting.sample_ratio' to None.
     """
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super().setUpClass()
     set_option("plotting.max_rows", 2000)
     set_option("plotting.sample_ratio", None)
Exemple #23
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'plotting.max_rows' to 1000."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super().setUpClass()
     set_option("plotting.max_rows", 1000)
Exemple #24
0
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'display.max_rows' to ReprTests.max_display_count."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     # The parent setup is chained so that any class-level fixtures the base
     # class prepares are not skipped.
     super().setUpClass()
     set_option("display.max_rows", ReprTests.max_display_count)
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'compute.ops_on_diff_frames' to True."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(OpsOnDiffFramesGroupByTest, cls).setUpClass()
     set_option("compute.ops_on_diff_frames", True)
 @classmethod
 def setUpClass(cls):
     """Run the parent class setup, then set 'compute.default_index_type' to "sequence"."""
     # unittest invokes setUpClass on the class itself, so it must be a
     # classmethod; as a plain function the call fails for lack of `cls`.
     super(OneByOneDefaultIndexTest, cls).setUpClass()
     set_option("compute.default_index_type", "sequence")