def trick3(self):
    """Time the toPandas() conversion of a 1M-row synthetic DataFrame.

    Builds a DataFrame with an id column plus seeded uniform and normal
    random columns, then profiles collecting it to a pandas DataFrame.
    """
    frame = self.session.range(0, 1000000).select(
        "id",
        F.rand(seed=10).alias("uniform"),
        F.randn(seed=27).alias("normal"))
    # less memory and faster speed
    TimeProfile.profile(lambda: frame.toPandas())()
    TimeProfile.print_prof_data(clear=True)
def trick1(self):
    """Compare a row-at-a-time Python UDF against a vectorized pandas UDF.

    Both UDFs add 1 to the 'uniform' column of the same 1M-row DataFrame;
    each variant is profiled via a count() that forces full evaluation.
    """
    frame = self.session.range(0, 1000000).select(
        "id",
        F.rand(seed=10).alias("uniform"),
        F.randn(seed=27).alias("normal"))

    # Plain UDF: rows cross the JVM/Python boundary one at a time.
    @F.udf('double')
    def plus_one(v):
        return v + 1

    TimeProfile.profile(
        lambda: frame.withColumn('v2', plus_one(frame.uniform)).count())()
    TimeProfile.print_prof_data(clear=True)

    # Pandas UDF: data is transferred in batches and processed vectorized.
    @F.pandas_udf('double', F.PandasUDFType.SCALAR)
    def pandas_plus_one(v):
        return v + 1

    TimeProfile.profile(
        lambda: frame.withColumn('v2', pandas_plus_one(frame.uniform)).count())()
    TimeProfile.print_prof_data(clear=True)
def broadcast(self):
    """Show the cost of shipping a large closure per job vs. broadcasting it.

    Runs three profiled passes over the same UDF-driven query:
      1. closure captures the ~800 MB dict directly (re-serialized per job),
      2. closure reads the dict through a SparkContext broadcast variable,
      3. the capturing closure again, for comparison.
    """
    document_df = self.session.read.csv(
        self.dataDir + "/data/FL_insurance_sample.csv",
        encoding="utf-8",
        header=True)
    # Large payload: 100 entries of 8 MB each (original note: "almost 100m").
    # dict comprehension instead of dict([...]) — avoids the throwaway list.
    huge_dic = {i: bytearray(1024 * 8 * 1024) for i in range(100)}

    def run():
        # The UDF closes over huge_dic, so the whole dict is serialized
        # with the task closure.
        def m(index):
            return huge_dic[index]
        newdf = document_df.select(F.udf(m)(F.lit(1)))
        # Plain loop: count() is called for its side effect only; the
        # original built and discarded a list of counts.
        for _ in range(10):
            newdf.count()

    TimeProfile.profile(run)()
    TimeProfile.print_prof_data(clear=True)

    huge_dic_br = self.sc.broadcast(huge_dic)

    def run2():
        # The UDF only captures the lightweight broadcast handle; the
        # payload is shipped to each executor once.
        def m(index):
            return huge_dic_br.value[index]
        newdf = document_df.select(F.udf(m)(F.lit(1)))
        for _ in range(10):
            newdf.count()

    TimeProfile.profile(run2)()
    TimeProfile.print_prof_data(clear=True)
    # Re-run the non-broadcast variant for a back-to-back comparison.
    TimeProfile.profile(run)()
    TimeProfile.print_prof_data(clear=True)
def cache_persist(self):
    """Time count() on a UDF-derived DataFrame: uncached, cached, unpersisted.

    Demonstrates that cache() speeds up repeated actions and that
    unpersist() returns the query to its original cost.
    """
    document_df = self.session.read.csv(
        self.dataDir + "/data/FL_insurance_sample.csv",
        encoding="utf-8",
        header=True)

    def concat(a, b):
        return str(a) + str(b)

    new_d_df = document_df.select(
        F.udf(concat)(F.col("policyID"), F.col("statecode")))

    # Baseline: no caching.
    TimeProfile.profile(lambda: new_d_df.count())()
    # NOTE(review): logger.warn is kept as-is — cannot tell from here
    # whether this is a stdlib logger (where warning() is preferred) or a
    # JVM-bridged one that only exposes warn(); confirm before renaming.
    logger.warn("normal:")
    TimeProfile.print_prof_data(clear=True)

    logger.warn("cached:")
    new_d_df.cache()
    # Materialize the cache before timing, so only cached reads are measured.
    new_d_df.count()
    TimeProfile.profile(lambda: new_d_df.count())()
    TimeProfile.print_prof_data(clear=True)

    logger.warn("unpersist:")
    new_d_df.unpersist()
    TimeProfile.profile(lambda: new_d_df.count())()
    TimeProfile.print_prof_data(clear=True)