def do_join(spark):
    """Join two generated DataFrames with a broadcast hint and cache the result.

    Both inputs come from ``create_df``. The right-hand side is given a
    ``broadcast`` hint and rows are matched on ``a == r_a`` using the
    ``join_type`` name taken from the surrounding scope.

    Returns the cached join result after ``count()`` has been run on it.
    """
    lhs, rhs = create_df(spark, data_gen, 500, 500)
    hinted_rhs = rhs.hint("broadcast")
    join_condition = lhs.a == rhs.r_a
    joined = lhs.join(hinted_rhs, join_condition, join_type)
    result = joined.cache()
    # Run an action on the cached frame before handing it back, so the
    # cache is populated rather than merely requested.
    result.count()
    return result
def do_join(spark):
    """Join two generated DataFrames on ``a == r_a`` and cache the result.

    Both inputs come from ``create_df``; the join kind is the ``join_type``
    name taken from the surrounding scope.

    Returns the cached join result after ``count()`` has populated the cache.
    """
    lhs, rhs = create_df(spark, data_gen, 500, 500)
    join_condition = lhs.a == rhs.r_a
    joined = lhs.join(rhs, join_condition, join_type)
    result = joined.cache()
    # populates cache
    result.count()
    return result
def do_join(spark):
    """Cache a join of two generated DataFrames, then filter the cached data.

    Both inputs come from ``create_df``; rows are matched on ``a == r_a``
    using the ``join_type`` name taken from the surrounding scope. The
    cache is populated with ``count()`` before the filter is applied.

    Returns the cached join result restricted to rows where ``a`` is not null.
    """
    lhs, rhs = create_df(spark, data, 500, 500)
    join_condition = lhs.a == rhs.r_a
    joined = lhs.join(rhs, join_condition, join_type)
    result = joined.cache()
    # populates the cache
    result.count()
    non_null_rows = result.filter("a is not null")
    return non_null_rows
def do_join(spark):
    """Cross join two generated DataFrames with a broadcast hint and cache it.

    Both inputs come from ``create_df``; the right-hand side is given a
    ``broadcast`` hint before the cross join.

    Returns the cached cross-join result after ``count()`` has been run on it.
    """
    lhs, rhs = create_df(spark, data_gen, 50, 25)
    hinted_rhs = rhs.hint("broadcast")
    product = lhs.crossJoin(hinted_rhs)
    result = product.cache()
    # Run an action on the cached frame before handing it back, so the
    # cache is populated rather than merely requested.
    result.count()
    return result