Example #1
    def test_benchmark_pandas(self):
        imdb = fetch_imdb_dataset()

        # Strategy 1: join the full tables first, then filter the joined result.
        start_time = time.time()
        trainable = Join(pred=[it.movies_directors.movie_id == it.movies.id],
                         join_type="inner")
        transformed_df = trainable.transform(imdb)
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(transformed_df)
        self.assertEqual(filtered_df.shape, (35, 6))
        join_first = time.time() - start_time
        logger.info(
            " Pandas: Join Before Filter --- {} seconds".format(join_first))

        # Strategy 2: filter movies_directors first, then join the smaller table.
        movies_directors = imdb[4]
        self.assertEqual(get_table_name(movies_directors), "movies_directors")
        start_time = time.time()
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(movies_directors)
        self.assertEqual(get_table_name(filtered_df), "movies_directors")
        imdb.pop(4)
        imdb.append(filtered_df)
        trainable = Join(pred=[it.movies_directors.movie_id == it.movies.id],
                         join_type="inner")
        transformed_df = trainable.transform(imdb)
        self.assertEqual(transformed_df.shape, (35, 6))
        filter_first = time.time() - start_time
        logger.info(
            " Pandas: Join After Filter --- {} seconds".format(filter_first))
Example #2
 def test_benchmark_join_before_filter_spark(self):
     if spark_installed:
         imdb = fetch_imdb_dataset("spark")
         start_time = time.time()
         trainable = Join(
             pred=[it.movies_directors.movie_id == it.movies.id],
             join_type="inner")
         transformed_df = trainable.transform(imdb)
         trainable = Filter(pred=[it["director_id"] == 8])
         filtered_df = trainable.transform(transformed_df)
         self.assertEqual(filtered_df.count(), 35)
         self.assertEqual(len(filtered_df.columns), 6)
         return time.time() - start_time
Example #3
 def test_join_pandas_imdb(self):
     imdb = fetch_imdb_dataset()
     trainable = Join(
         pred=[
             it.movies_directors.movie_id == it.movies_genres.movie_id,
             it.movies_genres.movie_id == it.movies.id,
             it.movies_directors.movie_id == it.roles.movie_id,
         ],
         join_type="left",
     )
     transformed_df = trainable.transform(imdb)
     self.assertEqual(transformed_df.shape, (6062848, 9))
     self.assertEqual(transformed_df["movie_id"][1], 281325)
Example #4
 def test_join_spark_imdb(self):
     if spark_installed:
         imdb = fetch_imdb_dataset("spark")
         trainable = Join(
             pred=[
                 it.movies_directors.movie_id == it.movies_genres.movie_id,
                 it.movies_genres.movie_id == it.movies.id,
                 it.movies_directors.movie_id == it.roles.movie_id,
             ],
             join_type="left",
         )
         transformed_df = trainable.transform(imdb)
         self.assertEqual(transformed_df.count(), 6062848)
         self.assertEqual(len(transformed_df.columns), 9)
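
The three join predicates above chain four tables. Roughly the same left-join chain in plain pandas, sketched with hypothetical miniature DataFrames (only the key columns referenced by the predicates are included):

import pandas as pd

movies_directors = pd.DataFrame({"movie_id": [1, 2], "director_id": [8, 9]})
movies_genres = pd.DataFrame({"movie_id": [1, 2], "genre": ["Drama", "Comedy"]})
movies = pd.DataFrame({"id": [1, 2], "name": ["A", "B"]})
roles = pd.DataFrame({"movie_id": [1, 1], "role": ["lead", "extra"]})

# One left join per predicate, in the same order as the pred list above.
out = (movies_directors
       .merge(movies_genres, on="movie_id", how="left")
       .merge(movies, left_on="movie_id", right_on="id", how="left")
       .merge(roles, on="movie_id", how="left"))
print(out.shape)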
Example #5
 def test_fit_error(self):
     relational = Relational(operator=(
         Scan(table=it.main) & Scan(table=it.delay)) >> Join(pred=[
             it.main.TrainId == it.delay.TrainId,
             it.main["Arrival time"] >= it.delay.TimeStamp,
         ]) >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId))
     with self.assertRaises(ValueError):
         _ = relational.fit([self.X_train], self.y_train)
Example #6
 def test_fit_transform(self):
     relational = Relational(operator=(
         Scan(table=it.main) & Scan(table=it.delay)) >> Join(pred=[
             it.main.TrainId == it.delay.TrainId,
             it.main["Arrival time"] >= it.delay.TimeStamp,
         ]) >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId))
     trained_relational = relational.fit(self.X_train, self.y_train)
     _ = trained_relational.transform(self.X_test)
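
The relational plan used in Examples #5 through #7 corresponds roughly to the pandas computation sketched below. The >= predicate makes this a non-equi join, which plain pandas expresses as an equi-join on TrainId followed by a filter; main and delay are hypothetical stand-ins with the referenced columns, not part of the original tests.

import pandas as pd

main = pd.DataFrame({
    "TrainId": [1, 1, 2],
    "Arrival time": [10, 20, 15],
    "MessageId": ["a", "b", "c"],
})
delay = pd.DataFrame({"TrainId": [1, 2], "TimeStamp": [12, 9], "Delay": [3, 1]})

# Equi-join on TrainId, then apply the non-equi condition as a filter.
joined = main.merge(delay, on="TrainId", how="inner")
joined = joined[joined["Arrival time"] >= joined["TimeStamp"]]

# Aggregate: count of Delay per MessageId.
print(joined.groupby("MessageId")["Delay"].count())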
Example #7
 def test_fit_transform_in_pipeline(self):
     relational = Relational(operator=(
         Scan(table=it.main) & Scan(table=it.delay)) >> Join(pred=[
             it.main.TrainId == it.delay.TrainId,
             it.main["Arrival time"] >= it.delay.TimeStamp,
         ]) >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId))
     pipeline = relational >> LogisticRegression()
     trained_pipeline = pipeline.fit(self.X_train, self.y_train)
     _ = trained_pipeline.predict(self.X_test)
Example #8
 def test_benchmark_join_after_filter_spark(self):
     if spark_installed:
         imdb = fetch_imdb_dataset("spark")
         movies_directors = imdb[4]
         self.assertEqual(get_table_name(movies_directors),
                          "movies_directors")
         start_time = time.time()
         trainable = Filter(pred=[it["director_id"] == 8])
         filtered_df = trainable.transform(movies_directors)
         self.assertEqual(get_table_name(filtered_df), "movies_directors")
         imdb.pop(4)
         imdb.append(filtered_df)
         trainable = Join(
             pred=[it.movies_directors.movie_id == it.movies.id],
             join_type="inner")
         transformed_df = trainable.transform(imdb)
         self.assertEqual(transformed_df.count(), 35)
         self.assertEqual(len(transformed_df.columns), 6)
         return time.time() - start_time
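
Examples #2 and #8 each return their elapsed time, so a small driver can compare the two Spark strategies directly. A minimal sketch, assuming both methods live on the same test class (named hypothetically here) and Spark is installed:

# Hypothetical driver; TestJoinBenchmarks is a placeholder for the real class.
suite = TestJoinBenchmarks()
join_first = suite.test_benchmark_join_before_filter_spark()
filter_first = suite.test_benchmark_join_after_filter_spark()
if join_first is not None and filter_first is not None:
    print("filter pushdown speedup: {:.2f}x".format(join_first / filter_first))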
Example #9
    def test_expression(self):
        from lale.expressions import it, mean
        from lale.lib.lale import Aggregate, Join, Scan

        scan1 = Scan(table=it["table1.csv"])
        scan2 = Scan(table=it["table2.csv"])
        join = Join(pred=(it["table1.csv"].k1 == it["table2.csv"].k2))
        aggregate = Aggregate(columns={"talk_time|mean": mean(it.talk_time)})
        pipeline = (scan1 & scan2) >> join >> aggregate
        expected = """from lale.lib.lale import Scan
from lale.expressions import it
from lale.lib.lale import Join
from lale.lib.lale import Aggregate
from lale.expressions import mean
import lale

lale.wrap_imported_operators()
scan_0 = Scan(table=it["table1.csv"])
scan_1 = Scan(table=it["table2.csv"])
join = Join(pred=(it["table1.csv"].k1 == it["table2.csv"].k2))
aggregate = Aggregate(columns={"talk_time|mean": mean(it.talk_time)})
pipeline = (scan_0 & scan_1) >> join >> aggregate"""
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
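
The string returned by lale.pretty_print.to_string is itself executable Python, which is what the roundtrip check above relies on. A minimal usage sketch, assuming pipeline is the operator built in this test:

import lale.pretty_print

printed = lale.pretty_print.to_string(pipeline)
print(printed)  # running this output (with lale installed) rebuilds an equivalent pipeline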
Example #10
    def test_with_hyperopt2(self):
        from lale.expressions import (
            count,
            it,
            max,
            mean,
            min,
            string_indexer,
            sum,
            variance,
        )

        wrap_imported_operators()
        scan = Scan(table=it["main"])
        scan_0 = Scan(table=it["customers"])
        join = Join(pred=[(it["main"]["group_customer_id"] == it["customers"]
                           ["group_customer_id"])])
        map = Map(
            columns={
                "[main](group_customer_id)[customers]|number_children|identity":
                it["number_children"],
                "[main](group_customer_id)[customers]|name|identity":
                it["name"],
                "[main](group_customer_id)[customers]|income|identity":
                it["income"],
                "[main](group_customer_id)[customers]|address|identity":
                it["address"],
                "[main](group_customer_id)[customers]|age|identity":
                it["age"],
            },
            remainder="drop",
        )
        pipeline_4 = join >> map
        scan_1 = Scan(table=it["purchase"])
        join_0 = Join(
            pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
            join_limit=50.0,
        )
        aggregate = Aggregate(
            columns={
                "[main](group_id)[purchase]|price|variance":
                variance(it["price"]),
                "[main](group_id)[purchase]|time|sum": sum(it["time"]),
                "[main](group_id)[purchase]|time|mean": mean(it["time"]),
                "[main](group_id)[purchase]|time|min": min(it["time"]),
                "[main](group_id)[purchase]|price|sum": sum(it["price"]),
                "[main](group_id)[purchase]|price|count": count(it["price"]),
                "[main](group_id)[purchase]|price|mean": mean(it["price"]),
                "[main](group_id)[purchase]|price|min": min(it["price"]),
                "[main](group_id)[purchase]|price|max": max(it["price"]),
                "[main](group_id)[purchase]|time|max": max(it["time"]),
                "[main](group_id)[purchase]|time|variance":
                variance(it["time"]),
            },
            group_by=it["row_id"],
        )
        pipeline_5 = join_0 >> aggregate
        map_0 = Map(
            columns={
                "[main]|group_customer_id|identity": it["group_customer_id"],
                "[main]|transaction_id|identity": it["transaction_id"],
                "[main]|group_id|identity": it["group_id"],
                "[main]|comments|identity": it["comments"],
                "[main]|id|identity": it["id"],
                "prefix_0_id": it["prefix_0_id"],
                "next_purchase": it["next_purchase"],
                "[main]|time|identity": it["time"],
            },
            remainder="drop",
        )
        scan_2 = Scan(table=it["transactions"])
        scan_3 = Scan(table=it["products"])
        join_1 = Join(pred=[
            (it["main"]["transaction_id"] == it["transactions"]
             ["transaction_id"]),
            (it["transactions"]["product_id"] == it["products"]["product_id"]),
        ])
        map_1 = Map(
            columns={
                "[main](transaction_id)[transactions](product_id)[products]|price|identity":
                it["price"],
                "[main](transaction_id)[transactions](product_id)[products]|type|identity":
                it["type"],
            },
            remainder="drop",
        )
        pipeline_6 = join_1 >> map_1
        join_2 = Join(pred=[(it["main"]["transaction_id"] == it["transactions"]
                             ["transaction_id"])])
        map_2 = Map(
            columns={
                "[main](transaction_id)[transactions]|description|identity":
                it["description"],
                "[main](transaction_id)[transactions]|product_id|identity":
                it["product_id"],
            },
            remainder="drop",
        )
        pipeline_7 = join_2 >> map_2
        map_3 = Map(columns=[
            string_indexer(it["[main]|comments|identity"]),
            string_indexer(
                it["[main](transaction_id)[transactions]|description|identity"]
            ),
            string_indexer(it[
                "[main](transaction_id)[transactions](product_id)[products]|type|identity"]
                           ),
            string_indexer(
                it["[main](group_customer_id)[customers]|name|identity"]),
            string_indexer(
                it["[main](group_customer_id)[customers]|address|identity"]),
        ])
        pipeline_8 = ConcatFeatures() >> map_3
        relational = Relational(operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        ))
        pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
        from sklearn.datasets import load_iris

        X, y = load_iris(return_X_y=True)
        from lale.lib.lale import Hyperopt

        opt = Hyperopt(estimator=pipeline, max_evals=2)
        opt.fit(X, y)
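
The | in (KNeighborsClassifier | LogisticRegression) declares an algorithm choice, and Hyperopt searches over that choice together with each operator's hyperparameters (here for only 2 evaluations). A minimal sketch of using the result, assuming the fit above succeeds:

trained = opt.fit(X, y)           # returns a trained pipeline wrapper
predictions = trained.predict(X)  # predicts with the best pipeline found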
Example #11
 def test_init(self):
     _ = Join(pred=[it.main.train_id == it.info.TrainId], join_type="inner")