Example #1
0
 def trick3(self):
     """Time converting a 1M-row DataFrame to pandas via ``toPandas``."""
     frame = self.session.range(0, 1000000).select(
         "id",
         F.rand(seed=10).alias("uniform"),
         F.randn(seed=27).alias("normal"))
     # Less memory use and faster conversion.
     TimeProfile.profile(lambda: frame.toPandas())()
     TimeProfile.print_prof_data(clear=True)
Example #2
0
    def trick1(self):
        """Compare a row-at-a-time UDF against a vectorized pandas UDF.

        Both UDFs add 1 to the ``uniform`` column of the same 1M-row
        DataFrame; each variant's ``count()`` is timed and printed.
        """
        frame = self.session.range(0, 1000000).select(
            "id",
            F.rand(seed=10).alias("uniform"),
            F.randn(seed=27).alias("normal"))

        @F.udf('double')
        def plus_one(v):
            # Row-at-a-time: called once per value.
            return v + 1

        TimeProfile.profile(
            lambda: frame.withColumn('v2', plus_one(frame.uniform)).count())()
        TimeProfile.print_prof_data(clear=True)

        @F.pandas_udf('double', F.PandasUDFType.SCALAR)
        def pandas_plus_one(v):
            # Vectorized: called once per batch with a pandas Series.
            return v + 1

        TimeProfile.profile(
            lambda: frame.withColumn('v2', pandas_plus_one(frame.uniform)).count())()
        TimeProfile.print_prof_data(clear=True)
Example #3
0
    def broadcast(self):
        """Show the cost of shipping a large closure versus broadcasting it.

        Runs the same UDF-backed ``count()`` loop three times: first with a
        large dict captured directly in the closure (serialized with every
        task), then through an explicit broadcast variable (shipped to each
        executor once), then the closure version again for comparison.
        """
        document_df = self.session.read.csv(self.dataDir +
                                            "/data/FL_insurance_sample.csv",
                                            encoding="utf-8",
                                            header=True)
        # 100 entries x 8 MiB each -> roughly 800 MB in total.
        huge_dic = {i: bytearray(1024 * 8 * 1024) for i in range(100)}

        def run():
            # `huge_dic` is captured by the closure, so it travels with
            # every serialized task.
            def m(index):
                return huge_dic[index]

            newdf = document_df.select(F.udf(m)(F.lit(1)))
            # Side-effect loop (was a throwaway list comprehension).
            for _ in range(10):
                newdf.count()

        TimeProfile.profile(run)()
        TimeProfile.print_prof_data(clear=True)

        huge_dic_br = self.sc.broadcast(huge_dic)

        def run2():
            # Broadcast variable: transferred once per executor, not per task.
            def m(index):
                return huge_dic_br.value[index]

            newdf = document_df.select(F.udf(m)(F.lit(1)))
            for _ in range(10):
                newdf.count()

        TimeProfile.profile(run2)()
        TimeProfile.print_prof_data(clear=True)

        TimeProfile.profile(run)()
        TimeProfile.print_prof_data(clear=True)
Example #4
0
    def cache_persist(self):
        """Time the same ``count()`` uncached, cached, and after unpersist.

        Builds a UDF-derived column, then measures the count three ways:
        baseline (full recomputation), after ``cache()`` with the cache
        pre-materialized, and again after ``unpersist()``.
        """
        document_df = self.session.read.csv(self.dataDir +
                                            "/data/FL_insurance_sample.csv",
                                            encoding="utf-8",
                                            header=True)

        def concat(a, b):
            # UDF body: stringify both column values and join them.
            return str(a) + str(b)

        new_d_df = document_df.select(
            F.udf(concat)(F.col("policyID"), F.col("statecode")))

        # Baseline: the whole plan (CSV read + UDF) is recomputed.
        TimeProfile.profile(lambda: new_d_df.count())()
        # `warning` replaces the deprecated `warn` alias of stdlib logging
        # (NOTE(review): assumes `logger` is a stdlib Logger — confirm).
        logger.warning("normal:")
        TimeProfile.print_prof_data(clear=True)

        logger.warning("cached:")
        new_d_df.cache()
        new_d_df.count()  # materialize the cache before timing
        TimeProfile.profile(lambda: new_d_df.count())()
        TimeProfile.print_prof_data(clear=True)

        logger.warning("unpersist:")
        new_d_df.unpersist()
        TimeProfile.profile(lambda: new_d_df.count())()
        TimeProfile.print_prof_data(clear=True)