def test_concat_column_axis(self):
    """Compare ks.concat(axis=1) with pd.concat(axis=1) over Series/frame
    mixes, MultiIndex columns, and mismatched indexes, for every
    combination of ``ignore_index`` and ``join``."""
    pdf1 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
    pdf2 = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
    kdf1 = ks.from_pandas(pdf1)
    kdf2 = ks.from_pandas(pdf2)

    # MultiIndex-column variants of the same data.
    kdf3, kdf4 = kdf1.copy(), kdf2.copy()
    pdf3, pdf4 = pdf1.copy(), pdf2.copy()
    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
    pdf3.columns = columns
    kdf3.columns = columns
    columns = pd.MultiIndex.from_tuples([("X", "C"), ("X", "D")])
    pdf4.columns = columns
    kdf4.columns = columns

    # Frames of differing widths and partially overlapping indexes.
    pdf5 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
    pdf6 = pd.DataFrame({"C": [1, 2, 3]}, index=[1, 3, 5])
    kdf5 = ks.from_pandas(pdf5)
    kdf6 = ks.from_pandas(pdf6)

    # Each entry pairs the koalas inputs with the equivalent pandas inputs.
    objs = [
        ([kdf1.A, kdf2.C], [pdf1.A, pdf2.C]),
        ([kdf1, kdf2.C], [pdf1, pdf2.C]),
        ([kdf1.A, kdf2], [pdf1.A, pdf2]),
        ([kdf1.A, kdf2.C], [pdf1.A, pdf2.C]),
        ([kdf1.A, kdf1.A.rename("B")], [pdf1.A, pdf1.A.rename("B")]),
        ([kdf3[("X", "A")], kdf4[("X", "C")]],
         [pdf3[("X", "A")], pdf4[("X", "C")]]),
        ([kdf3, kdf4[("X", "C")]], [pdf3, pdf4[("X", "C")]]),
        ([kdf3[("X", "A")], kdf4], [pdf3[("X", "A")], pdf4]),
        ([kdf3, kdf4], [pdf3, pdf4]),
        ([kdf3[("X", "A")], kdf3[("X", "B")]],
         [pdf3[("X", "A")], pdf3[("X", "B")]]),
        ([kdf3[("X", "A")], kdf3[("X", "B")].rename("ABC")],
         [pdf3[("X", "A")], pdf3[("X", "B")].rename("ABC")]),
        ([kdf3[("X", "A")].rename("ABC"), kdf3[("X", "B")]],
         [pdf3[("X", "A")].rename("ABC"), pdf3[("X", "B")]]),
        ([kdf5, kdf6], [pdf5, pdf6]),
        ([kdf6, kdf5], [pdf6, pdf5]),
    ]

    for ignore_index, join in itertools.product([True, False],
                                                ["inner", "outer"]):
        for kdfs, pdfs in objs:
            with self.subTest(ignore_index=ignore_index, join=join,
                              objs=pdfs):
                actual = ks.concat(kdfs, axis=1,
                                   ignore_index=ignore_index, join=join)
                expected = pd.concat(pdfs, axis=1,
                                     ignore_index=ignore_index, join=join)
                # Row order is not guaranteed by koalas, so compare the
                # repr after sorting and dropping the index.
                self.assert_eq(
                    repr(actual.sort_values(
                        list(actual.columns)).reset_index(drop=True)),
                    repr(expected.sort_values(
                        list(expected.columns)).reset_index(drop=True)),
                )
def y(self, Model):
    """Assemble the target array ``y`` for scikit-learn estimators.

    Parameters
    ----------
    Model : object exposing ``get_datasets_format()``, which returns a
        4-tuple ``(input_X_datasets, input_y_datasets, preprocess_X,
        preprocess_y)``; only ``input_y_datasets`` (a list of attribute
        names on ``self``, or None) is used here.

    Returns
    -------
    numpy.ndarray, or None when the model declares no target datasets.
    """
    input_X_datasets, input_y_datasets, preprocess_X, preprocess_y = (
        Model.get_datasets_format())
    if input_y_datasets is None:
        # Bug fix: the original set `datasets = None` and fell through to
        # concat(None), which always raised TypeError. A model without
        # target datasets simply has no `y`.
        return None
    datasets = [
        getattr(self, dataset_name) for dataset_name in input_y_datasets
    ]
    # Dispatch on backend: plain pandas in-memory data vs. koalas (kl)
    # distributed frames — presumably `self.claim_count` mirrors the type
    # of the other datasets (TODO confirm against callers).
    if isinstance(self.claim_count, pd.DataFrame):
        return pd.concat(datasets, axis=1).to_numpy()
    else:
        return kl.concat(datasets, axis=1).to_numpy()
def get_ad_dict():
    """Merge the train/test ad metadata, de-duplicate it, rank rows that
    share the same ad-attribute combination, and persist the result as a
    single CSV file."""
    train_ad = ks.read_csv("../data/train_preliminary/ad.csv")
    test_ad = ks.read_csv("../data/test/ad.csv")

    ads = ks.concat([train_ad, test_ad], axis=0).drop_duplicates()

    # Number duplicated (product, category, advertiser, industry)
    # combinations so downstream code can pick one row per group.
    ad_dict_sql = '''
    select creative_id, product_id, product_category, advertiser_id, industry, row_number() over (partition by product_id, product_category,advertiser_id,industry order by 1 desc) ad_rn from {ad_info}
    '''
    ads = ks.sql(ad_dict_sql, ad_info=ads)
    print(ads.nunique())
    ads.to_csv('../data/ad_info', index=False, num_files=1)
def X(self, Model):
    """Assemble the feature matrix ``X`` for scikit-learn estimators.

    ``Model.get_datasets_format()`` returns ``(input_X_datasets,
    input_y_datasets, preprocess_X, preprocess_y)``; the named datasets
    are looked up as attributes on ``self`` and concatenated column-wise.
    Returns a numpy array, or the output of ``preprocess_X`` when one is
    supplied.
    """
    (input_X_datasets, input_y_datasets,
     preprocess_X, preprocess_y) = Model.get_datasets_format()
    datasets = [getattr(self, name) for name in input_X_datasets]
    # Dispatch on backend: pandas in-memory vs. koalas (kl) distributed.
    if isinstance(datasets[0], (pd.DataFrame, pd.Series)):
        df = pd.concat(datasets, axis=1)
    else:
        df = kl.concat(datasets, axis=1)
    return df.to_numpy() if preprocess_X is None else preprocess_X(df)
def test_concat(self):
    """Error conditions raised by ks.concat for invalid arguments."""
    pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]})
    kdf = ks.from_pandas(pdf)

    # Non-iterable and non-concatenable inputs.
    self.assertRaisesRegex(TypeError, "first argument must be",
                           lambda: ks.concat(kdf))
    self.assertRaisesRegex(TypeError, "cannot concatenate object",
                           lambda: ks.concat([kdf, 1]))

    # Mismatched index structure (extra level, then dropped level).
    kdf2 = kdf.set_index('B', append=True)
    self.assertRaisesRegex(ValueError, "Index type and names should be same",
                           lambda: ks.concat([kdf, kdf2]))
    kdf2 = kdf.reset_index()
    self.assertRaisesRegex(ValueError, "Index type and names should be same",
                           lambda: ks.concat([kdf, kdf2]))

    # All-None input and unsupported axis.
    self.assertRaisesRegex(ValueError, "All objects passed",
                           lambda: ks.concat([None, None]))
    self.assertRaisesRegex(ValueError, 'axis should be either 0 or',
                           lambda: ks.concat([kdf, kdf], axis=1))
def test_concat_index_axis(self):
    # Exhaustively compare ks.concat (axis=0, the default) against
    # pd.concat over the cross product of ignore_index/join/sort options,
    # then check the documented error cases, and repeat the comparisons
    # with MultiIndex columns.
    pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [6, 7, 8]})
    # TODO: pdf.columns.names = ["ABC"]
    kdf = ks.from_pandas(pdf)

    ignore_indexes = [True, False]
    joins = ["inner", "outer"]
    sorts = [True, False]

    # (koalas inputs, pandas inputs) pairs: frames, reset-index frames,
    # reordered columns, and frame/series mixes in both orders.
    objs = [
        ([kdf, kdf], [pdf, pdf]),
        ([kdf, kdf.reset_index()], [pdf, pdf.reset_index()]),
        ([kdf.reset_index(), kdf], [pdf.reset_index(), pdf]),
        ([kdf, kdf[["C", "A"]]], [pdf, pdf[["C", "A"]]]),
        ([kdf[["C", "A"]], kdf], [pdf[["C", "A"]], pdf]),
        ([kdf, kdf["C"]], [pdf, pdf["C"]]),
        ([kdf["C"], kdf], [pdf["C"], pdf]),
        ([kdf["C"], kdf, kdf["A"]], [pdf["C"], pdf, pdf["A"]]),
        ([kdf, kdf["C"], kdf["A"]], [pdf, pdf["C"], pdf["A"]]),
    ]

    for ignore_index, join, sort in itertools.product(
            ignore_indexes, joins, sorts):
        for i, (kdfs, pdfs) in enumerate(objs):
            with self.subTest(ignore_index=ignore_index, join=join,
                              sort=sort, pdfs=pdfs, pair=i):
                self.assert_eq(
                    ks.concat(kdfs, ignore_index=ignore_index, join=join,
                              sort=sort),
                    pd.concat(pdfs, ignore_index=ignore_index, join=join,
                              sort=sort),
                    # Relax the comparison for outer joins (row order).
                    almost=(join == "outer"),
                )

    # Invalid-argument error cases.
    self.assertRaisesRegex(TypeError, "first argument must be",
                           lambda: ks.concat(kdf))
    self.assertRaisesRegex(TypeError, "cannot concatenate object",
                           lambda: ks.concat([kdf, 1]))
    kdf2 = kdf.set_index("B", append=True)
    self.assertRaisesRegex(ValueError,
                           "Index type and names should be same",
                           lambda: ks.concat([kdf, kdf2]))
    self.assertRaisesRegex(ValueError, "No objects to concatenate",
                           lambda: ks.concat([]))
    self.assertRaisesRegex(ValueError, "All objects passed",
                           lambda: ks.concat([None, None]))

    # Same comparisons with MultiIndex columns.
    pdf3 = pdf.copy()
    kdf3 = kdf.copy()
    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"),
                                         ("Y", "C")])
    # TODO: colums.names = ["XYZ", "ABC"]
    pdf3.columns = columns
    kdf3.columns = columns
    objs = [
        ([kdf3, kdf3], [pdf3, pdf3]),
        ([kdf3, kdf3.reset_index()], [pdf3, pdf3.reset_index()]),
        ([kdf3.reset_index(), kdf3], [pdf3.reset_index(), pdf3]),
        ([kdf3, kdf3[[("Y", "C"), ("X", "A")]]],
         [pdf3, pdf3[[("Y", "C"), ("X", "A")]]]),
        ([kdf3[[("Y", "C"), ("X", "A")]], kdf3],
         [pdf3[[("Y", "C"), ("X", "A")]], pdf3]),
    ]

    for ignore_index, sort in itertools.product(ignore_indexes, sorts):
        for i, (kdfs, pdfs) in enumerate(objs):
            with self.subTest(ignore_index=ignore_index, join="outer",
                              sort=sort, pdfs=pdfs, pair=i):
                self.assert_eq(
                    ks.concat(kdfs, ignore_index=ignore_index,
                              join="outer", sort=sort),
                    pd.concat(pdfs, ignore_index=ignore_index,
                              join="outer", sort=sort),
                )

    # Skip tests for `join="inner" and sort=False` since pandas is flaky.
    for ignore_index in ignore_indexes:
        for i, (kdfs, pdfs) in enumerate(objs):
            with self.subTest(ignore_index=ignore_index, join="inner",
                              sort=True, pdfs=pdfs, pair=i):
                self.assert_eq(
                    ks.concat(kdfs, ignore_index=ignore_index,
                              join="inner", sort=True),
                    pd.concat(pdfs, ignore_index=ignore_index,
                              join="inner", sort=True),
                )

    # Mixing flat and MultiIndex columns must fail.
    self.assertRaisesRegex(
        ValueError,
        "MultiIndex columns should have the same levels",
        lambda: ks.concat([kdf, kdf3]),
    )
    self.assertRaisesRegex(
        ValueError,
        "MultiIndex columns should have the same levels",
        lambda: ks.concat([kdf3[("Y", "C")], kdf3]),
    )

    pdf4 = pd.DataFrame({
        "A": [0, 2, 4],
        "B": [1, 3, 5],
        "C": [10, 20, 30]
    })
    kdf4 = ks.from_pandas(pdf4)
    # An empty/invalid `join` argument is rejected on both axes.
    self.assertRaisesRegex(
        ValueError,
        r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        lambda: ks.concat([kdf, kdf4], join=""),
    )
    self.assertRaisesRegex(
        ValueError,
        r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        lambda: ks.concat([kdf, kdf4], join="", axis=1),
    )
    self.assertRaisesRegex(
        ValueError,
        r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        lambda: ks.concat([kdf.A, kdf4.B], join="", axis=1),
    )
    # Duplicated column labels are rejected for axis=1 concat.
    self.assertRaisesRegex(
        ValueError,
        r"Labels have to be unique; however, got duplicated labels \['A'\].",
        lambda: ks.concat([kdf.A, kdf4.A], join="inner", axis=1),
    )
def test_concat(self):
    """ks.concat along the index axis: happy paths against pandas, error
    cases, and the same checks with MultiIndex columns."""
    pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]})
    kdf = ks.from_pandas(pdf)

    # Happy paths mirror pandas exactly.
    self.assert_eq(ks.concat([kdf, kdf.reset_index()]),
                   pd.concat([pdf, pdf.reset_index()]))
    self.assert_eq(ks.concat([kdf, kdf[["A"]]], ignore_index=True),
                   pd.concat([pdf, pdf[["A"]]], ignore_index=True))
    self.assert_eq(ks.concat([kdf, kdf[["A"]]], join="inner"),
                   pd.concat([pdf, pdf[["A"]]], join="inner"))

    # Error cases.
    self.assertRaisesRegex(TypeError, "first argument must be",
                           lambda: ks.concat(kdf))
    self.assertRaisesRegex(TypeError, "cannot concatenate object",
                           lambda: ks.concat([kdf, 1]))
    kdf2 = kdf.set_index("B", append=True)
    self.assertRaisesRegex(ValueError, "Index type and names should be same",
                           lambda: ks.concat([kdf, kdf2]))
    self.assertRaisesRegex(ValueError, "No objects to concatenate",
                           lambda: ks.concat([]))
    self.assertRaisesRegex(ValueError, "All objects passed",
                           lambda: ks.concat([None, None]))
    self.assertRaisesRegex(NotImplementedError, "axis should be either 0 or",
                           lambda: ks.concat([kdf, kdf], axis=1))

    # Repeat the happy paths with MultiIndex columns.
    pdf3, kdf3 = pdf.copy(), kdf.copy()
    multi_columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
    pdf3.columns = multi_columns
    kdf3.columns = multi_columns
    self.assert_eq(ks.concat([kdf3, kdf3.reset_index()]),
                   pd.concat([pdf3, pdf3.reset_index()]))
    self.assert_eq(ks.concat([kdf3, kdf3[[("X", "A")]]], ignore_index=True),
                   pd.concat([pdf3, pdf3[[("X", "A")]]], ignore_index=True))
    self.assert_eq(ks.concat([kdf3, kdf3[[("X", "A")]]], join="inner"),
                   pd.concat([pdf3, pdf3[[("X", "A")]]], join="inner"))
    self.assertRaisesRegex(
        ValueError,
        "MultiIndex columns should have the same levels",
        lambda: ks.concat([kdf, kdf3]),
    )

    pdf4 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [10, 20, 30]})
    kdf4 = ks.from_pandas(pdf4)
    self.assertRaisesRegex(
        ValueError,
        r"Only can inner \(intersect\) or outer \(union\) join the other axis.",
        lambda: ks.concat([kdf, kdf4], join=""),
    )
def test_concat_column_axis(self):
    """ks.concat(axis=1) vs pd.concat(axis=1) for Series drawn from frames
    whose column indexes carry names, across ignore_index/join options."""
    pdf1 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
    pdf1.columns.names = ["AB"]
    pdf2 = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
    pdf2.columns.names = ["CD"]
    kdf1 = ks.from_pandas(pdf1)
    kdf2 = ks.from_pandas(pdf2)

    # Named-MultiIndex-column variants of the same data.
    kdf3, kdf4 = kdf1.copy(), kdf2.copy()
    pdf3, pdf4 = pdf1.copy(), pdf2.copy()
    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")],
                                        names=["X", "AB"])
    pdf3.columns = columns
    kdf3.columns = columns
    columns = pd.MultiIndex.from_tuples([("X", "C"), ("X", "D")],
                                        names=["Y", "CD"])
    pdf4.columns = columns
    kdf4.columns = columns

    # Each entry pairs koalas inputs with the equivalent pandas inputs.
    objs = [
        ([kdf1.A, kdf1.A.rename("B")], [pdf1.A, pdf1.A.rename("B")]),
        ([kdf3[("X", "A")], kdf3[("X", "B")]],
         [pdf3[("X", "A")], pdf3[("X", "B")]]),
        ([kdf3[("X", "A")], kdf3[("X", "B")].rename("ABC")],
         [pdf3[("X", "A")], pdf3[("X", "B")].rename("ABC")]),
        ([kdf3[("X", "A")].rename("ABC"), kdf3[("X", "B")]],
         [pdf3[("X", "A")].rename("ABC"), pdf3[("X", "B")]]),
    ]

    for ignore_index, join in itertools.product([True, False],
                                                ["inner", "outer"]):
        for i, (kdfs, pdfs) in enumerate(objs):
            with self.subTest(ignore_index=ignore_index, join=join,
                              pdfs=pdfs, pair=i):
                actual = ks.concat(kdfs, axis=1,
                                   ignore_index=ignore_index, join=join)
                expected = pd.concat(pdfs, axis=1,
                                     ignore_index=ignore_index, join=join)
                # Row order is not guaranteed, so compare the repr after
                # sorting and dropping the index.
                self.assert_eq(
                    repr(actual.sort_values(
                        list(actual.columns)).reset_index(drop=True)),
                    repr(expected.sort_values(
                        list(expected.columns)).reset_index(drop=True)),
                )
def test_concat(self):
    """ks.concat along the index axis against pandas, including error
    conditions and MultiIndex columns."""
    pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]})
    kdf = ks.from_pandas(pdf)

    # Happy paths mirror pandas exactly.
    self.assert_eq(ks.concat([kdf, kdf.reset_index()]),
                   pd.concat([pdf, pdf.reset_index()]))
    self.assert_eq(ks.concat([kdf, kdf[['A']]], ignore_index=True),
                   pd.concat([pdf, pdf[['A']]], ignore_index=True))
    self.assert_eq(ks.concat([kdf, kdf[['A']]], join="inner"),
                   pd.concat([pdf, pdf[['A']]], join="inner"))

    # Error cases.
    self.assertRaisesRegex(TypeError, "first argument must be",
                           lambda: ks.concat(kdf))
    self.assertRaisesRegex(TypeError, "cannot concatenate object",
                           lambda: ks.concat([kdf, 1]))
    kdf2 = kdf.set_index('B', append=True)
    self.assertRaisesRegex(ValueError, "Index type and names should be same",
                           lambda: ks.concat([kdf, kdf2]))
    self.assertRaisesRegex(ValueError, "No objects to concatenate",
                           lambda: ks.concat([]))
    self.assertRaisesRegex(ValueError, "All objects passed",
                           lambda: ks.concat([None, None]))
    self.assertRaisesRegex(ValueError, 'axis should be either 0 or',
                           lambda: ks.concat([kdf, kdf], axis=1))

    # Repeat with MultiIndex columns.
    pdf3, kdf3 = pdf.copy(), kdf.copy()
    multi_columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B')])
    pdf3.columns = multi_columns
    kdf3.columns = multi_columns
    self.assert_eq(ks.concat([kdf3, kdf3.reset_index()]),
                   pd.concat([pdf3, pdf3.reset_index()]))
    self.assert_eq(ks.concat([kdf3, kdf3[[('X', 'A')]]], ignore_index=True),
                   pd.concat([pdf3, pdf3[[('X', 'A')]]], ignore_index=True))
    self.assert_eq(ks.concat([kdf3, kdf3[[('X', 'A')]]], join="inner"),
                   pd.concat([pdf3, pdf3[[('X', 'A')]]], join="inner"))
    self.assertRaisesRegex(ValueError,
                           "MultiIndex columns should have the same levels",
                           lambda: ks.concat([kdf, kdf3]))
stop = datetime.now()
# Timing message is a runtime string (French): "ML preparation + inference time".
print("Temps préparation et inférence (ML) : ", (stop - start).seconds, "s")

# %%
##### 7th change: we therefore have to recompute the score ourselves
from databricks.koalas.config import set_option, reset_option

# Allow operations between koalas objects backed by different Spark frames
# (needed for the concat and arithmetic between `df` and `reel` below).
set_option("compute.ops_on_diff_frames", True)

# Score: the coefficient R^2 is defined as (1 - u/v), where u is the
# residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
# sum of squares ((y_true - y_true.mean()) ** 2).sum()
reel = ks.Series(y_test).to_frame().rename(columns={0: 'Reel'})
result = ks.concat([df, reel], axis=1)
result['square_diff_true_pred'] = (result['Reel'] - result['prediction'])**2
u = result['square_diff_true_pred'].sum()
v = ((result['Reel'] - result['Reel'].mean())**2).sum()
score = (1 - u / v)
print(f"score: {score}")

# %% [markdown]
# ## Training and inference with a Pipeline

# %% [markdown]
# Only fitted models and predictions can be used with koalas

# %%