def test_benchmark_pandas(self):
    """Compare join-then-filter with filter-then-join timings.

    Runs the same inner join / ``director_id == 8`` filter in both orders
    on the dataset returned by ``fetch_imdb_dataset()``, asserts that each
    order yields a result of shape (35, 6), and logs the elapsed seconds
    of each order.
    """
    imdb = fetch_imdb_dataset()

    # Order 1: join the tables first, then filter the joined result.
    # perf_counter() is used because it is monotonic and meant for
    # measuring intervals, unlike the wall clock returned by time.time().
    start_time = time.perf_counter()
    trainable = Join(
        pred=[it.movies_directors.movie_id == it.movies.id],
        join_type="inner",
    )
    transformed_df = trainable.transform(imdb)
    trainable = Filter(pred=[it["director_id"] == 8])
    filtered_df = trainable.transform(transformed_df)
    self.assertEqual(filtered_df.shape, (35, 6))
    join_first = time.perf_counter() - start_time
    logger.info(
        " Pandas: Join Before Filter --- {} seconds".format(join_first)
    )

    # Order 2: filter the movies_directors table first, then join.
    # The table lookup and its name check stay outside the timed span.
    movies_directors = imdb[4]
    self.assertEqual(get_table_name(movies_directors), "movies_directors")
    start_time = time.perf_counter()
    trainable = Filter(pred=[it["director_id"] == 8])
    filtered_df = trainable.transform(movies_directors)
    self.assertEqual(get_table_name(filtered_df), "movies_directors")
    # Swap the unfiltered table for the filtered one before joining.
    imdb.pop(4)
    imdb.append(filtered_df)
    trainable = Join(
        pred=[it.movies_directors.movie_id == it.movies.id],
        join_type="inner",
    )
    transformed_df = trainable.transform(imdb)
    self.assertEqual(transformed_df.shape, (35, 6))
    filter_first = time.perf_counter() - start_time
    logger.info(
        " Pandas: Join After Filter --- {} seconds".format(filter_first)
    )
def test_benchmark_join_before_filter_spark(self):
    """Run join-then-filter on the "spark" IMDB dataset and time it.

    Returns:
        The elapsed seconds as a float, or None when ``spark_installed``
        is falsy (in which case nothing is executed). The timed span
        covers the join, the filter and the two result assertions.
    """
    if not spark_installed:
        return None

    imdb = fetch_imdb_dataset("spark")
    # perf_counter() is monotonic and intended for measuring intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    trainable = Join(
        pred=[it.movies_directors.movie_id == it.movies.id],
        join_type="inner",
    )
    transformed_df = trainable.transform(imdb)
    trainable = Filter(pred=[it["director_id"] == 8])
    filtered_df = trainable.transform(transformed_df)
    self.assertEqual(filtered_df.count(), 35)
    self.assertEqual(len(filtered_df.columns), 6)
    return time.perf_counter() - start_time
def test_join_pandas_imdb(self):
    """Left-join the IMDB tables and check the joined result.

    The join chains movies_directors, movies_genres, movies and roles on
    their movie id columns. The result is expected to have shape
    (6062848, 9) and ``movie_id`` 281325 at label 1.
    """
    tables = fetch_imdb_dataset()
    join_conditions = [
        it.movies_directors.movie_id == it.movies_genres.movie_id,
        it.movies_genres.movie_id == it.movies.id,
        it.movies_directors.movie_id == it.roles.movie_id,
    ]
    joiner = Join(pred=join_conditions, join_type="left")
    joined = joiner.transform(tables)

    expected_shape = (6062848, 9)
    self.assertEqual(joined.shape, expected_shape)
    self.assertEqual(joined["movie_id"][1], 281325)
def test_join_spark_imdb(self):
    """Left-join the "spark" IMDB tables and check the result size.

    Does nothing when ``spark_installed`` is falsy. Otherwise the joined
    result is expected to have 6062848 rows and 9 columns.
    """
    if not spark_installed:
        return

    tables = fetch_imdb_dataset("spark")
    join_conditions = [
        it.movies_directors.movie_id == it.movies_genres.movie_id,
        it.movies_genres.movie_id == it.movies.id,
        it.movies_directors.movie_id == it.roles.movie_id,
    ]
    joiner = Join(pred=join_conditions, join_type="left")
    joined = joiner.transform(tables)

    self.assertEqual(joined.count(), 6062848)
    self.assertEqual(len(joined.columns), 9)