Esempio n. 1
0
    def test_concat_column_axis(self):
        """Compare ``ks.concat(..., axis=1)`` with ``pd.concat(..., axis=1)``.

        Each (koalas, pandas) pair of inputs is concatenated with every
        combination of ``ignore_index`` and ``join``.  The two results are
        compared through the ``repr`` of the frames after sorting by all
        columns and dropping the index, so neither the row order nor the
        index values take part in the comparison.
        """
        pdf1 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
        pdf2 = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
        kdf1 = ks.from_pandas(pdf1)
        kdf2 = ks.from_pandas(pdf2)

        # Copies of the frames above that get two-level column labels.
        kdf3 = kdf1.copy()
        kdf4 = kdf2.copy()
        pdf3 = pdf1.copy()
        pdf4 = pdf2.copy()

        columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
        pdf3.columns = columns
        kdf3.columns = columns

        columns = pd.MultiIndex.from_tuples([("X", "C"), ("X", "D")])
        pdf4.columns = columns
        kdf4.columns = columns

        # Two frames with a different number of columns.
        pdf5 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
        pdf6 = pd.DataFrame({"C": [1, 2, 3]}, index=[1, 3, 5])
        kdf5 = ks.from_pandas(pdf5)
        kdf6 = ks.from_pandas(pdf6)

        ignore_indexes = [True, False]
        joins = ["inner", "outer"]

        # Each entry is (koalas inputs, equivalent pandas inputs).  The list
        # used to contain the Series/Series pair (A, C) twice; the duplicate
        # only re-ran identical sub-tests and has been dropped.
        objs = [
            ([kdf1.A, kdf2.C], [pdf1.A, pdf2.C]),
            ([kdf1, kdf2.C], [pdf1, pdf2.C]),
            ([kdf1.A, kdf2], [pdf1.A, pdf2]),
            ([kdf1.A, kdf1.A.rename("B")], [pdf1.A, pdf1.A.rename("B")]),
            ([kdf3[("X", "A")], kdf4[("X", "C")]], [pdf3[("X", "A")], pdf4[("X", "C")]]),
            ([kdf3, kdf4[("X", "C")]], [pdf3, pdf4[("X", "C")]]),
            ([kdf3[("X", "A")], kdf4], [pdf3[("X", "A")], pdf4]),
            ([kdf3, kdf4], [pdf3, pdf4]),
            ([kdf3[("X", "A")], kdf3[("X", "B")]], [pdf3[("X", "A")], pdf3[("X", "B")]],),
            (
                [kdf3[("X", "A")], kdf3[("X", "B")].rename("ABC")],
                [pdf3[("X", "A")], pdf3[("X", "B")].rename("ABC")],
            ),
            (
                [kdf3[("X", "A")].rename("ABC"), kdf3[("X", "B")]],
                [pdf3[("X", "A")].rename("ABC"), pdf3[("X", "B")]],
            ),
            ([kdf5, kdf6], [pdf5, pdf6]),
            ([kdf6, kdf5], [pdf6, pdf5]),
        ]

        for ignore_index, join in itertools.product(ignore_indexes, joins):
            for i, (kdfs, pdfs) in enumerate(objs):
                # `pair` gives the position in `objs`, which is easier to
                # locate in a failure report than the dumped frames alone.
                with self.subTest(ignore_index=ignore_index, join=join, objs=pdfs, pair=i):
                    actual = ks.concat(kdfs, axis=1, ignore_index=ignore_index, join=join)
                    expected = pd.concat(pdfs, axis=1, ignore_index=ignore_index, join=join)
                    self.assert_eq(
                        repr(actual.sort_values(list(actual.columns)).reset_index(drop=True)),
                        repr(expected.sort_values(list(expected.columns)).reset_index(drop=True)),
                    )
    def y(self, Model):
        """Build the target array ``y`` for scikit-learn estimators.

        ``Model.get_datasets_format()`` must return a 4-tuple; its second
        item lists the names of the attributes of ``self`` that make up the
        target.  Those attributes are concatenated column-wise and converted
        with ``to_numpy()``.

        Returns:
            The concatenated target as returned by ``to_numpy()``, or
            ``None`` when the model declares no target datasets.

        Note:
            The fourth item of the tuple (``preprocess_y``) is not applied
            here, as in the original implementation.
        """
        _, input_y_datasets, _, _ = Model.get_datasets_format()

        # No target declared: hand back None instead of passing None (or an
        # empty list) to concat, which would raise.
        if not input_y_datasets:
            return None

        datasets = [
            getattr(self, dataset_name) for dataset_name in input_y_datasets
        ]

        # Pick the backend from the data actually being concatenated (same
        # rule as `X`) rather than from a hard-coded `claim_count` attribute.
        if isinstance(datasets[0], (pd.DataFrame, pd.Series)):
            return pd.concat(datasets, axis=1).to_numpy()
        return kl.concat(datasets, axis=1).to_numpy()
def get_ad_dict():
    """Merge the train and test ad tables, rank them and write the result.

    Reads the two ``ad.csv`` files, concatenates them row-wise, drops
    duplicate rows, adds an ``ad_rn`` row number through a SQL window
    query, prints the per-column distinct counts and writes the result to
    ``../data/ad_info``.  Nothing is returned.
    """
    frames = [
        ks.read_csv("../data/train_preliminary/ad.csv"),
        ks.read_csv("../data/test/ad.csv"),
    ]
    deduplicated = ks.concat(frames, axis=0).drop_duplicates()

    # `{ad_info}` in the query is bound through the keyword argument below.
    query = '''
     select 
       creative_id,
       product_id,
       product_category,
       advertiser_id,
       industry,
       row_number()
       over (partition by product_id, product_category,advertiser_id,industry order by 1 desc) ad_rn
       from {ad_info}
    '''
    ranked = ks.sql(query, ad_info=deduplicated)

    print(ranked.nunique())
    ranked.to_csv('../data/ad_info', index=False, num_files=1)
    def X(self, Model):
        """Assemble the feature matrix ``X`` for scikit-learn estimators.

        The first item returned by ``Model.get_datasets_format()`` names the
        attributes of ``self`` to concatenate column-wise; the third item is
        an optional callable applied to the concatenated frame.

        Returns:
            ``preprocess_X(frame)`` when a preprocessor is given, otherwise
            ``frame.to_numpy()``.
        """
        input_X_datasets, _, preprocess_X, _ = Model.get_datasets_format()

        frames = []
        for dataset_name in input_X_datasets:
            frames.append(getattr(self, dataset_name))

        # pandas inputs go through pandas; anything else is handed to `kl`.
        if isinstance(frames[0], (pd.DataFrame, pd.Series)):
            merged = pd.concat(frames, axis=1)
        else:
            merged = kl.concat(frames, axis=1)

        if preprocess_X is not None:
            return preprocess_X(merged)
        return merged.to_numpy()
Esempio n. 5
0
    def test_concat(self):
        """Check the errors ``ks.concat`` raises for invalid arguments."""
        pandas_frame = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]})
        koalas_frame = ks.from_pandas(pandas_frame)

        # A bare frame instead of a list of objects.
        with self.assertRaisesRegex(TypeError, "first argument must be"):
            ks.concat(koalas_frame)

        # A list holding something that is not a frame.
        with self.assertRaisesRegex(TypeError, "cannot concatenate object"):
            ks.concat([koalas_frame, 1])

        # Frames whose indexes differ.
        extra_level = koalas_frame.set_index('B', append=True)
        with self.assertRaisesRegex(ValueError, "Index type and names should be same"):
            ks.concat([koalas_frame, extra_level])

        index_as_column = koalas_frame.reset_index()
        with self.assertRaisesRegex(ValueError, "Index type and names should be same"):
            ks.concat([koalas_frame, index_as_column])

        # Only None objects.
        with self.assertRaisesRegex(ValueError, "All objects passed"):
            ks.concat([None, None])

        # Column-wise concatenation is expected to be rejected here.
        with self.assertRaisesRegex(ValueError, 'axis should be either 0 or'):
            ks.concat([koalas_frame, koalas_frame], axis=1)
Esempio n. 6
0
    def test_concat_index_axis(self):
        """Compare row-wise ``ks.concat`` with ``pd.concat`` and check errors.

        Covers single-level and two-level column labels over combinations
        of ``ignore_index``, ``join`` and ``sort``, followed by the error
        messages expected for invalid inputs.
        """
        pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [6, 7, 8]})
        # TODO: pdf.columns.names = ["ABC"]
        kdf = ks.from_pandas(pdf)

        ignore_indexes = [True, False]
        joins = ["inner", "outer"]
        sorts = [True, False]

        # Each entry is (koalas inputs, equivalent pandas inputs).
        objs = [
            ([kdf, kdf], [pdf, pdf]),
            ([kdf, kdf.reset_index()], [pdf, pdf.reset_index()]),
            ([kdf.reset_index(), kdf], [pdf.reset_index(), pdf]),
            ([kdf, kdf[["C", "A"]]], [pdf, pdf[["C", "A"]]]),
            ([kdf[["C", "A"]], kdf], [pdf[["C", "A"]], pdf]),
            ([kdf, kdf["C"]], [pdf, pdf["C"]]),
            ([kdf["C"], kdf], [pdf["C"], pdf]),
            ([kdf["C"], kdf, kdf["A"]], [pdf["C"], pdf, pdf["A"]]),
            ([kdf, kdf["C"], kdf["A"]], [pdf, pdf["C"], pdf["A"]]),
        ]

        for ignore_index, join, sort in itertools.product(ignore_indexes, joins, sorts):
            options = dict(ignore_index=ignore_index, join=join, sort=sort)
            for i, (kdfs, pdfs) in enumerate(objs):
                with self.subTest(**options, pdfs=pdfs, pair=i):
                    self.assert_eq(
                        ks.concat(kdfs, **options),
                        pd.concat(pdfs, **options),
                        almost=(join == "outer"),
                    )

        with self.assertRaisesRegex(TypeError, "first argument must be"):
            ks.concat(kdf)
        with self.assertRaisesRegex(TypeError, "cannot concatenate object"):
            ks.concat([kdf, 1])

        kdf2 = kdf.set_index("B", append=True)
        with self.assertRaisesRegex(ValueError, "Index type and names should be same"):
            ks.concat([kdf, kdf2])

        with self.assertRaisesRegex(ValueError, "No objects to concatenate"):
            ks.concat([])

        with self.assertRaisesRegex(ValueError, "All objects passed"):
            ks.concat([None, None])

        pdf3 = pdf.copy()
        kdf3 = kdf.copy()

        columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])
        # TODO: columns.names = ["XYZ", "ABC"]
        pdf3.columns = columns
        kdf3.columns = columns

        # Same kind of pairs as above, now with two-level column labels.
        objs = [
            ([kdf3, kdf3], [pdf3, pdf3]),
            ([kdf3, kdf3.reset_index()], [pdf3, pdf3.reset_index()]),
            ([kdf3.reset_index(), kdf3], [pdf3.reset_index(), pdf3]),
            (
                [kdf3, kdf3[[("Y", "C"), ("X", "A")]]],
                [pdf3, pdf3[[("Y", "C"), ("X", "A")]]],
            ),
            (
                [kdf3[[("Y", "C"), ("X", "A")]], kdf3],
                [pdf3[[("Y", "C"), ("X", "A")]], pdf3],
            ),
        ]

        for ignore_index, sort in itertools.product(ignore_indexes, sorts):
            options = dict(ignore_index=ignore_index, join="outer", sort=sort)
            for i, (kdfs, pdfs) in enumerate(objs):
                with self.subTest(**options, pdfs=pdfs, pair=i):
                    self.assert_eq(
                        ks.concat(kdfs, **options),
                        pd.concat(pdfs, **options),
                    )

        # Skip tests for `join="inner" and sort=False` since pandas is flaky.
        for ignore_index in ignore_indexes:
            options = dict(ignore_index=ignore_index, join="inner", sort=True)
            for i, (kdfs, pdfs) in enumerate(objs):
                with self.subTest(**options, pdfs=pdfs, pair=i):
                    self.assert_eq(
                        ks.concat(kdfs, **options),
                        pd.concat(pdfs, **options),
                    )

        # Mixing single-level and two-level column labels is rejected.
        with self.assertRaisesRegex(
            ValueError, "MultiIndex columns should have the same levels"
        ):
            ks.concat([kdf, kdf3])
        with self.assertRaisesRegex(
            ValueError, "MultiIndex columns should have the same levels"
        ):
            ks.concat([kdf3[("Y", "C")], kdf3])

        pdf4 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [10, 20, 30]})
        kdf4 = ks.from_pandas(pdf4)

        # An unknown `join` value is rejected on either axis.
        with self.assertRaisesRegex(
            ValueError,
            r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        ):
            ks.concat([kdf, kdf4], join="")

        with self.assertRaisesRegex(
            ValueError,
            r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        ):
            ks.concat([kdf, kdf4], join="", axis=1)

        with self.assertRaisesRegex(
            ValueError,
            r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        ):
            ks.concat([kdf.A, kdf4.B], join="", axis=1)

        # Column-wise concatenation of two series that share a label.
        with self.assertRaisesRegex(
            ValueError,
            r"Labels have to be unique; however, got duplicated labels \['A'\].",
        ):
            ks.concat([kdf.A, kdf4.A], join="inner", axis=1)
Esempio n. 7
0
    def test_concat(self):
        """Compare ``ks.concat`` with ``pd.concat`` and check its errors.

        Exercises single-level and two-level column labels, then the error
        messages expected for invalid arguments.
        """
        pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]})
        kdf = ks.from_pandas(pdf)

        actual = ks.concat([kdf, kdf.reset_index()])
        expected = pd.concat([pdf, pdf.reset_index()])
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf, kdf[["A"]]], ignore_index=True)
        expected = pd.concat([pdf, pdf[["A"]]], ignore_index=True)
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf, kdf[["A"]]], join="inner")
        expected = pd.concat([pdf, pdf[["A"]]], join="inner")
        self.assert_eq(actual, expected)

        with self.assertRaisesRegex(TypeError, "first argument must be"):
            ks.concat(kdf)
        with self.assertRaisesRegex(TypeError, "cannot concatenate object"):
            ks.concat([kdf, 1])

        kdf2 = kdf.set_index("B", append=True)
        with self.assertRaisesRegex(ValueError, "Index type and names should be same"):
            ks.concat([kdf, kdf2])

        with self.assertRaisesRegex(ValueError, "No objects to concatenate"):
            ks.concat([])

        with self.assertRaisesRegex(ValueError, "All objects passed"):
            ks.concat([None, None])

        # Column-wise concatenation is expected to be reported as unsupported.
        with self.assertRaisesRegex(NotImplementedError, "axis should be either 0 or"):
            ks.concat([kdf, kdf], axis=1)

        # Repeat the comparisons with two-level column labels.
        pdf3 = pdf.copy()
        kdf3 = kdf.copy()

        columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
        pdf3.columns = columns
        kdf3.columns = columns

        actual = ks.concat([kdf3, kdf3.reset_index()])
        expected = pd.concat([pdf3, pdf3.reset_index()])
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf3, kdf3[[("X", "A")]]], ignore_index=True)
        expected = pd.concat([pdf3, pdf3[[("X", "A")]]], ignore_index=True)
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf3, kdf3[[("X", "A")]]], join="inner")
        expected = pd.concat([pdf3, pdf3[[("X", "A")]]], join="inner")
        self.assert_eq(actual, expected)

        # Mixing single-level and two-level column labels is rejected.
        with self.assertRaisesRegex(
            ValueError, "MultiIndex columns should have the same levels"
        ):
            ks.concat([kdf, kdf3])

        pdf4 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [10, 20, 30]})
        kdf4 = ks.from_pandas(pdf4)

        # An unknown `join` value is rejected.
        with self.assertRaisesRegex(
            ValueError,
            r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        ):
            ks.concat([kdf, kdf4], join="")
Esempio n. 8
0
    def test_concat_column_axis(self):
        """Compare column-wise ``ks.concat`` with ``pd.concat`` on series.

        The inputs carry named column levels.  Results are compared through
        the ``repr`` of the frames after sorting by every column and
        dropping the index.
        """
        pdf1 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
        pdf1.columns.names = ["AB"]
        pdf2 = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
        pdf2.columns.names = ["CD"]
        kdf1 = ks.from_pandas(pdf1)
        kdf2 = ks.from_pandas(pdf2)

        # Copies that receive two-level, named column labels.
        kdf3 = kdf1.copy()
        kdf4 = kdf2.copy()
        pdf3 = pdf1.copy()
        pdf4 = pdf2.copy()

        columns = pd.MultiIndex.from_tuples(
            [("X", "A"), ("X", "B")], names=["X", "AB"]
        )
        pdf3.columns = columns
        kdf3.columns = columns

        columns = pd.MultiIndex.from_tuples(
            [("X", "C"), ("X", "D")], names=["Y", "CD"]
        )
        pdf4.columns = columns
        kdf4.columns = columns

        ignore_indexes = [True, False]
        joins = ["inner", "outer"]

        # Each entry is (koalas inputs, equivalent pandas inputs).
        objs = [
            ([kdf1.A, kdf1.A.rename("B")], [pdf1.A, pdf1.A.rename("B")]),
            (
                [kdf3[("X", "A")], kdf3[("X", "B")]],
                [pdf3[("X", "A")], pdf3[("X", "B")]],
            ),
            (
                [kdf3[("X", "A")], kdf3[("X", "B")].rename("ABC")],
                [pdf3[("X", "A")], pdf3[("X", "B")].rename("ABC")],
            ),
            (
                [kdf3[("X", "A")].rename("ABC"), kdf3[("X", "B")]],
                [pdf3[("X", "A")].rename("ABC"), pdf3[("X", "B")]],
            ),
        ]

        for ignore_index, join in itertools.product(ignore_indexes, joins):
            options = dict(axis=1, ignore_index=ignore_index, join=join)
            for i, (kdfs, pdfs) in enumerate(objs):
                with self.subTest(ignore_index=ignore_index, join=join, pdfs=pdfs, pair=i):
                    actual = ks.concat(kdfs, **options)
                    expected = pd.concat(pdfs, **options)

                    # Normalise row order and index before comparing.
                    actual_sorted = actual.sort_values(list(actual.columns))
                    actual_text = repr(actual_sorted.reset_index(drop=True))
                    expected_sorted = expected.sort_values(list(expected.columns))
                    expected_text = repr(expected_sorted.reset_index(drop=True))

                    self.assert_eq(actual_text, expected_text)
Esempio n. 9
0
    def test_concat(self):
        """Compare ``ks.concat`` with ``pd.concat`` and check its errors.

        Exercises single-level and two-level column labels along with the
        error messages expected for invalid arguments.
        """
        pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]})
        kdf = ks.from_pandas(pdf)

        actual = ks.concat([kdf, kdf.reset_index()])
        expected = pd.concat([pdf, pdf.reset_index()])
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf, kdf[['A']]], ignore_index=True)
        expected = pd.concat([pdf, pdf[['A']]], ignore_index=True)
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf, kdf[['A']]], join="inner")
        expected = pd.concat([pdf, pdf[['A']]], join="inner")
        self.assert_eq(actual, expected)

        with self.assertRaisesRegex(TypeError, "first argument must be"):
            ks.concat(kdf)
        with self.assertRaisesRegex(TypeError, "cannot concatenate object"):
            ks.concat([kdf, 1])

        kdf2 = kdf.set_index('B', append=True)
        with self.assertRaisesRegex(ValueError, "Index type and names should be same"):
            ks.concat([kdf, kdf2])

        with self.assertRaisesRegex(ValueError, "No objects to concatenate"):
            ks.concat([])

        with self.assertRaisesRegex(ValueError, "All objects passed"):
            ks.concat([None, None])

        # Column-wise concatenation is expected to be rejected here.
        with self.assertRaisesRegex(ValueError, 'axis should be either 0 or'):
            ks.concat([kdf, kdf], axis=1)

        # Repeat the comparisons with two-level column labels.
        pdf3 = pdf.copy()
        kdf3 = kdf.copy()

        columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')])
        pdf3.columns = columns
        kdf3.columns = columns

        actual = ks.concat([kdf3, kdf3.reset_index()])
        expected = pd.concat([pdf3, pdf3.reset_index()])
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf3, kdf3[[('X', 'A')]]], ignore_index=True)
        expected = pd.concat([pdf3, pdf3[[('X', 'A')]]], ignore_index=True)
        self.assert_eq(actual, expected)

        actual = ks.concat([kdf3, kdf3[[('X', 'A')]]], join="inner")
        expected = pd.concat([pdf3, pdf3[[('X', 'A')]]], join="inner")
        self.assert_eq(actual, expected)

        # Mixing single-level and two-level column labels is rejected.
        with self.assertRaisesRegex(
            ValueError, "MultiIndex columns should have the same levels"
        ):
            ks.concat([kdf, kdf3])
Esempio n. 10
0
stop = datetime.now()

# `timedelta.seconds` is only the seconds component of the duration and wraps
# around every 24 hours; `total_seconds()` reports the whole elapsed time.
print("Temps préparation et inférence (ML) : ", int((stop - start).total_seconds()), "s")

# %%
##### 7th change: the score therefore has to be recomputed by hand

from databricks.koalas.config import set_option, reset_option

# Turn on the koalas option `compute.ops_on_diff_frames`.
# NOTE(review): presumably required by the concat of `df` and `reel` below,
# which come from different frames - confirm.
set_option("compute.ops_on_diff_frames", True)

# Score: the coefficient R^2 is defined as (1 - u/v), where u is the residual
# sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of
# squares ((y_true - y_true.mean()) ** 2).sum()

# Put the true values next to the predictions, in a column named 'Reel'.
reel = ks.Series(y_test).to_frame().rename(columns={0: 'Reel'})
result = ks.concat([df, reel], axis=1)

# u: residual sum of squares; v: total sum of squares.
result['square_diff_true_pred'] = (result['Reel'] - result['prediction'])**2
u = result['square_diff_true_pred'].sum()
v = ((result['Reel'] - result['Reel'].mean())**2).sum()

score = (1 - u / v)
print(f"score: {score}")

# %% [markdown]
# ## Entrainement et inférence avec Pipeline
# %% [markdown]
# Seuls les modèles entrainés et les prédictions peuvent être utilisés avec koalas

# %%