# Module-level imports assumed by the tests below; the paths follow the Lale
# library layout but are reconstructed here rather than copied from the
# original module.
import logging
import time

import lale.pretty_print
from lale import wrap_imported_operators
from lale.datasets.data_schemas import get_table_name
from lale.datasets.multitable.fetch_datasets import fetch_imdb_dataset
from lale.expressions import count, it
from lale.lib.lale import Aggregate, ConcatFeatures, Filter, Join, Map, Relational, Scan
from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression
from lale.operators import make_pipeline_graph

logger = logging.getLogger(__name__)

try:  # Spark-dependent tests are skipped when pyspark is unavailable
    import pyspark  # noqa: F401

    spark_installed = True
except ImportError:
    spark_installed = False


def test_benchmark_pandas(self):
    imdb = fetch_imdb_dataset()

    # Join before filter: join the full tables, then filter the result.
    start_time = time.time()
    trainable = Join(
        pred=[it.movies_directors.movie_id == it.movies.id], join_type="inner"
    )
    transformed_df = trainable.transform(imdb)
    trainable = Filter(pred=[it["director_id"] == 8])
    filtered_df = trainable.transform(transformed_df)
    self.assertEqual(filtered_df.shape, (35, 6))
    join_first = time.time() - start_time
    logger.info(" Pandas: Join Before Filter --- {} seconds".format(join_first))

    # Filter before join: shrink movies_directors first, then join.
    movies_directors = imdb[4]
    self.assertEqual(get_table_name(movies_directors), "movies_directors")
    start_time = time.time()
    trainable = Filter(pred=[it["director_id"] == 8])
    filtered_df = trainable.transform(movies_directors)
    self.assertEqual(get_table_name(filtered_df), "movies_directors")
    imdb.pop(4)
    imdb.append(filtered_df)
    trainable = Join(
        pred=[it.movies_directors.movie_id == it.movies.id], join_type="inner"
    )
    transformed_df = trainable.transform(imdb)
    self.assertEqual(transformed_df.shape, (35, 6))
    filter_first = time.time() - start_time
    logger.info(" Pandas: Join After Filter --- {} seconds".format(filter_first))

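# Illustrative sketch (not part of the original suite): the optimization the
# benchmark above measures, shown with plain pandas on tiny made-up frames.
# Filtering before joining shrinks the join input while producing the same
# rows; all frame contents here are hypothetical.
def _filter_pushdown_sketch():
    import pandas as pd

    movies = pd.DataFrame({"id": [1, 2, 3], "title": ["A", "B", "C"]})
    movies_directors = pd.DataFrame(
        {"movie_id": [1, 2, 3], "director_id": [8, 8, 9]}
    )

    # Join first, then filter: the merge touches every row.
    join_first = movies_directors.merge(movies, left_on="movie_id", right_on="id")
    join_first = join_first[join_first["director_id"] == 8]

    # Filter first, then join: the merge only sees the surviving rows.
    filter_first = movies_directors[movies_directors["director_id"] == 8]
    filter_first = filter_first.merge(movies, left_on="movie_id", right_on="id")

    # Both orders yield the same result.
    assert join_first.reset_index(drop=True).equals(
        filter_first.reset_index(drop=True)
    )
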
def test_benchmark_join_before_filter_spark(self):
    if spark_installed:
        imdb = fetch_imdb_dataset("spark")
        start_time = time.time()
        trainable = Join(
            pred=[it.movies_directors.movie_id == it.movies.id], join_type="inner"
        )
        transformed_df = trainable.transform(imdb)
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(transformed_df)
        self.assertEqual(filtered_df.count(), 35)
        self.assertEqual(len(filtered_df.columns), 6)
        return time.time() - start_time

def test_join_pandas_imdb(self):
    imdb = fetch_imdb_dataset()
    trainable = Join(
        pred=[
            it.movies_directors.movie_id == it.movies_genres.movie_id,
            it.movies_genres.movie_id == it.movies.id,
            it.movies_directors.movie_id == it.roles.movie_id,
        ],
        join_type="left",
    )
    transformed_df = trainable.transform(imdb)
    self.assertEqual(transformed_df.shape, (6062848, 9))
    self.assertEqual(transformed_df["movie_id"][1], 281325)

def test_join_spark_imdb(self):
    if spark_installed:
        imdb = fetch_imdb_dataset("spark")
        trainable = Join(
            pred=[
                it.movies_directors.movie_id == it.movies_genres.movie_id,
                it.movies_genres.movie_id == it.movies.id,
                it.movies_directors.movie_id == it.roles.movie_id,
            ],
            join_type="left",
        )
        transformed_df = trainable.transform(imdb)
        self.assertEqual(transformed_df.count(), 6062848)
        self.assertEqual(len(transformed_df.columns), 9)

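# Rough intuition only (hypothetical, not the library's implementation): a
# multi-predicate left Join like the one in the two tests above behaves much
# like a chain of pandas left merges that follows the predicate list.
def _chained_left_join_sketch(movies_directors, movies_genres, movies, roles):
    # movies_directors.movie_id == movies_genres.movie_id
    joined = movies_directors.merge(movies_genres, on="movie_id", how="left")
    # movies_genres.movie_id == movies.id
    joined = joined.merge(movies, left_on="movie_id", right_on="id", how="left")
    # movies_directors.movie_id == roles.movie_id
    joined = joined.merge(roles, on="movie_id", how="left")
    return joined
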
def test_fit_error(self):
    relational = Relational(
        operator=(Scan(table=it.main) & Scan(table=it.delay))
        >> Join(
            pred=[
                it.main.TrainId == it.delay.TrainId,
                it.main["Arrival time"] >= it.delay.TimeStamp,
            ]
        )
        >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId)
    )
    # Wrapping X_train in an extra list is malformed input and must raise.
    with self.assertRaises(ValueError):
        _ = relational.fit([self.X_train], self.y_train)

def test_fit_transform(self):
    relational = Relational(
        operator=(Scan(table=it.main) & Scan(table=it.delay))
        >> Join(
            pred=[
                it.main.TrainId == it.delay.TrainId,
                it.main["Arrival time"] >= it.delay.TimeStamp,
            ]
        )
        >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId)
    )
    trained_relational = relational.fit(self.X_train, self.y_train)
    _ = trained_relational.transform(self.X_test)

def test_fit_transform_in_pipeline(self):
    relational = Relational(
        operator=(Scan(table=it.main) & Scan(table=it.delay))
        >> Join(
            pred=[
                it.main.TrainId == it.delay.TrainId,
                it.main["Arrival time"] >= it.delay.TimeStamp,
            ]
        )
        >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId)
    )
    pipeline = relational >> LogisticRegression()
    trained_pipeline = pipeline.fit(self.X_train, self.y_train)
    _ = trained_pipeline.predict(self.X_test)

def test_benchmark_join_after_filter_spark(self):
    if spark_installed:
        imdb = fetch_imdb_dataset("spark")
        # Filter movies_directors down to a single director before joining.
        movies_directors = imdb[4]
        self.assertEqual(get_table_name(movies_directors), "movies_directors")
        start_time = time.time()
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(movies_directors)
        self.assertEqual(get_table_name(filtered_df), "movies_directors")
        imdb.pop(4)
        imdb.append(filtered_df)
        trainable = Join(
            pred=[it.movies_directors.movie_id == it.movies.id], join_type="inner"
        )
        transformed_df = trainable.transform(imdb)
        self.assertEqual(transformed_df.count(), 35)
        self.assertEqual(len(transformed_df.columns), 6)
        return time.time() - start_time

def test_expression(self):
    from lale.expressions import it, mean
    from lale.lib.lale import Aggregate, Join, Scan

    scan1 = Scan(table=it["table1.csv"])
    scan2 = Scan(table=it["table2.csv"])
    join = Join(pred=(it["table1.csv"].k1 == it["table2.csv"].k2))
    aggregate = Aggregate(columns={"talk_time|mean": mean(it.talk_time)})
    pipeline = (scan1 & scan2) >> join >> aggregate
    expected = """from lale.lib.lale import Scan
from lale.expressions import it
from lale.lib.lale import Join
from lale.lib.lale import Aggregate
from lale.expressions import mean
import lale

lale.wrap_imported_operators()
scan_0 = Scan(table=it["table1.csv"])
scan_1 = Scan(table=it["table2.csv"])
join = Join(pred=(it["table1.csv"].k1 == it["table2.csv"].k2))
aggregate = Aggregate(columns={"talk_time|mean": mean(it.talk_time)})
pipeline = (scan_0 & scan_1) >> join >> aggregate"""
    self._roundtrip(expected, lale.pretty_print.to_string(pipeline))

def test_with_hyperopt2(self):
    from lale.expressions import (
        count,
        it,
        max,
        mean,
        min,
        string_indexer,
        sum,
        variance,
    )

    wrap_imported_operators()
    scan = Scan(table=it["main"])
    scan_0 = Scan(table=it["customers"])

    # Per-path sub-pipelines: join each auxiliary table to main, then
    # project or aggregate its columns.
    join = Join(
        pred=[
            (it["main"]["group_customer_id"] == it["customers"]["group_customer_id"])
        ]
    )
    map = Map(
        columns={
            "[main](group_customer_id)[customers]|number_children|identity": it["number_children"],
            "[main](group_customer_id)[customers]|name|identity": it["name"],
            "[main](group_customer_id)[customers]|income|identity": it["income"],
            "[main](group_customer_id)[customers]|address|identity": it["address"],
            "[main](group_customer_id)[customers]|age|identity": it["age"],
        },
        remainder="drop",
    )
    pipeline_4 = join >> map
    scan_1 = Scan(table=it["purchase"])
    join_0 = Join(
        pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
        join_limit=50.0,
    )
    aggregate = Aggregate(
        columns={
            "[main](group_id)[purchase]|price|variance": variance(it["price"]),
            "[main](group_id)[purchase]|time|sum": sum(it["time"]),
            "[main](group_id)[purchase]|time|mean": mean(it["time"]),
            "[main](group_id)[purchase]|time|min": min(it["time"]),
            "[main](group_id)[purchase]|price|sum": sum(it["price"]),
            "[main](group_id)[purchase]|price|count": count(it["price"]),
            "[main](group_id)[purchase]|price|mean": mean(it["price"]),
            "[main](group_id)[purchase]|price|min": min(it["price"]),
            "[main](group_id)[purchase]|price|max": max(it["price"]),
            "[main](group_id)[purchase]|time|max": max(it["time"]),
            "[main](group_id)[purchase]|time|variance": variance(it["time"]),
        },
        group_by=it["row_id"],
    )
    pipeline_5 = join_0 >> aggregate
    map_0 = Map(
        columns={
            "[main]|group_customer_id|identity": it["group_customer_id"],
            "[main]|transaction_id|identity": it["transaction_id"],
            "[main]|group_id|identity": it["group_id"],
            "[main]|comments|identity": it["comments"],
            "[main]|id|identity": it["id"],
            "prefix_0_id": it["prefix_0_id"],
            "next_purchase": it["next_purchase"],
            "[main]|time|identity": it["time"],
        },
        remainder="drop",
    )
    scan_2 = Scan(table=it["transactions"])
    scan_3 = Scan(table=it["products"])
    join_1 = Join(
        pred=[
            (it["main"]["transaction_id"] == it["transactions"]["transaction_id"]),
            (it["transactions"]["product_id"] == it["products"]["product_id"]),
        ]
    )
    map_1 = Map(
        columns={
            "[main](transaction_id)[transactions](product_id)[products]|price|identity": it["price"],
            "[main](transaction_id)[transactions](product_id)[products]|type|identity": it["type"],
        },
        remainder="drop",
    )
    pipeline_6 = join_1 >> map_1
    join_2 = Join(
        pred=[(it["main"]["transaction_id"] == it["transactions"]["transaction_id"])]
    )
    map_2 = Map(
        columns={
            "[main](transaction_id)[transactions]|description|identity": it["description"],
            "[main](transaction_id)[transactions]|product_id|identity": it["product_id"],
        },
        remainder="drop",
    )
    pipeline_7 = join_2 >> map_2
    map_3 = Map(
        columns=[
            string_indexer(it["[main]|comments|identity"]),
            string_indexer(
                it["[main](transaction_id)[transactions]|description|identity"]
            ),
            string_indexer(
                it["[main](transaction_id)[transactions](product_id)[products]|type|identity"]
            ),
            string_indexer(it["[main](group_customer_id)[customers]|name|identity"]),
            string_indexer(it["[main](group_customer_id)[customers]|address|identity"]),
        ]
    )
    pipeline_8 = ConcatFeatures() >> map_3

    # Assemble the per-table sub-pipelines into a dataflow graph.
    relational = Relational(
        operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        )
    )
    pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    from lale.lib.lale import Hyperopt

    opt = Hyperopt(estimator=pipeline, max_evals=2)
    opt.fit(X, y)

def test_init(self):
    _ = Join(pred=[it.main.train_id == it.info.TrainId], join_type="inner")

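# Hypothetical usage of the Join configured in test_init, on two tiny
# illustrative pandas tables; the data values are made up. This assumes
# `add_table_name` from Lale's data-schemas helpers, which attaches the
# table names that the `it.main` / `it.info` references resolve against.
def _join_init_usage_sketch():
    import pandas as pd

    from lale.datasets.data_schemas import add_table_name

    main = add_table_name(pd.DataFrame({"train_id": [1, 2]}), "main")
    info = add_table_name(pd.DataFrame({"TrainId": [1, 3], "Delay": [5, 0]}), "info")
    trainable = Join(pred=[it.main.train_id == it.info.TrainId], join_type="inner")
    # As in the tests above, Join.transform consumes a list of named tables.
    return trainable.transform([main, info])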