Example #1
0
    def test_benchmark_pandas(self):
        """Compare join-then-filter with filter-then-join on the IMDB tables.

        Both orderings select ``director_id == 8`` and are expected to
        produce a result of shape ``(35, 6)``. The elapsed time of each
        ordering is logged at INFO level; the timings are not asserted on.
        """
        imdb = fetch_imdb_dataset()

        # Ordering 1: join the tables first, then filter the joined result.
        # perf_counter() is used for the interval because, unlike
        # time.time(), it is monotonic and not affected by system clock
        # adjustments while the benchmark is running.
        start_time = time.perf_counter()
        trainable = Join(pred=[it.movies_directors.movie_id == it.movies.id],
                         join_type="inner")
        transformed_df = trainable.transform(imdb)
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(transformed_df)
        self.assertEqual(filtered_df.shape, (35, 6))
        join_first = time.perf_counter() - start_time
        logger.info(
            " Pandas: Join Before Filter --- {} seconds".format(join_first))

        # Ordering 2: filter the movies_directors table first, then join.
        movies_directors = imdb[4]
        # Guard against the dataset layout changing: index 4 must be the
        # movies_directors table for the rest of this benchmark to be valid.
        self.assertEqual(get_table_name(movies_directors), "movies_directors")
        start_time = time.perf_counter()
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(movies_directors)
        # The join predicate below refers to the table by name, so the
        # filtered frame must still be named "movies_directors".
        self.assertEqual(get_table_name(filtered_df), "movies_directors")
        # Swap the unfiltered table for its filtered version (it moves to the
        # end of the list).
        imdb.pop(4)
        imdb.append(filtered_df)
        trainable = Join(pred=[it.movies_directors.movie_id == it.movies.id],
                         join_type="inner")
        transformed_df = trainable.transform(imdb)
        self.assertEqual(transformed_df.shape, (35, 6))
        filter_first = time.perf_counter() - start_time
        logger.info(
            " Pandas: Join After Filter --- {} seconds".format(filter_first))
Example #2
0
 def test_benchmark_join_before_filter_spark(self):
     """Time an inner join followed by a filter on the Spark IMDB tables.

     The filtered result is expected to have 35 rows and 6 columns.

     Returns:
         Elapsed seconds for the join-then-filter pipeline (the result
         checks are inside the timed region), or ``None`` when
         ``spark_installed`` is falsy and nothing was run.
     """
     if not spark_installed:
         # Nothing to measure without Spark; keep returning None so callers
         # that consume the timing see the same value as before.
         return None
     imdb = fetch_imdb_dataset("spark")
     # perf_counter() is monotonic and meant for measuring intervals;
     # time.time() can jump if the system clock is adjusted mid-run.
     start_time = time.perf_counter()
     trainable = Join(
         pred=[it.movies_directors.movie_id == it.movies.id],
         join_type="inner")
     transformed_df = trainable.transform(imdb)
     trainable = Filter(pred=[it["director_id"] == 8])
     filtered_df = trainable.transform(transformed_df)
     self.assertEqual(filtered_df.count(), 35)
     self.assertEqual(len(filtered_df.columns), 6)
     return time.perf_counter() - start_time
Example #3
0
 def test_join_pandas_imdb(self):
     """Left-join the IMDB tables on movie id and check the joined result.

     Verifies the overall shape of the joined frame and one sample
     ``movie_id`` value.
     """
     tables = fetch_imdb_dataset()
     # Chain movies_directors -> movies_genres -> movies, plus roles, all
     # keyed on the movie id.
     join_conditions = [
         it.movies_directors.movie_id == it.movies_genres.movie_id,
         it.movies_genres.movie_id == it.movies.id,
         it.movies_directors.movie_id == it.roles.movie_id,
     ]
     joined = Join(pred=join_conditions, join_type="left").transform(tables)
     self.assertEqual(joined.shape, (6062848, 9))
     self.assertEqual(joined["movie_id"][1], 281325)
Example #4
0
 def test_join_spark_imdb(self):
     """Left-join the Spark IMDB tables on movie id and check the result size.

     The joined result is expected to have 6062848 rows and 9 columns.
     The test is reported as skipped when ``spark_installed`` is falsy.
     """
     if not spark_installed:
         # Skip explicitly rather than passing vacuously, so the test report
         # shows that the Spark path was not exercised.
         self.skipTest("Spark is not installed")
     imdb = fetch_imdb_dataset("spark")
     trainable = Join(
         pred=[
             it.movies_directors.movie_id == it.movies_genres.movie_id,
             it.movies_genres.movie_id == it.movies.id,
             it.movies_directors.movie_id == it.roles.movie_id,
         ],
         join_type="left",
     )
     transformed_df = trainable.transform(imdb)
     self.assertEqual(transformed_df.count(), 6062848)
     self.assertEqual(len(transformed_df.columns), 9)