Example #1
# module-level imports needed by this snippet
import logging

from pyspark import SparkContext, StorageLevel

 def _get_rdd(self, db, scan_fun):
     sc = SparkContext()
     # set job group
     sc.setJobGroup(self.job_id, self.job['type'])
     # Calculate number of partitions based on number of segments
     count = db.count_scan(self.get_langs(),
                           self.job['params'].get('filter'))
     num_partitions = self.calc_num_parititions(count)
     logging.warning("Scan size: {}, number of partitions: {}".format(
         count, num_partitions))  # Create RDD by parallelizing segments
     rdd = sc.parallelize(scan_fun, num_partitions)
     rdd.persist(StorageLevel.DISK_ONLY)
     return rdd
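The calc_num_parititions helper called above is not shown on this page. Below is a minimal sketch of what it might look like, assuming the goal is a roughly fixed number of segments per partition; the class name and target constant are illustrative, only the method name is taken from the call above:

import math

class ScanJob:  # hypothetical container class; the real class is not shown here
    SEGMENTS_PER_PARTITION = 1000  # illustrative target, not from the original code

    def calc_num_parititions(self, count):
        # At least one partition, even when the scan returns nothing
        return max(1, math.ceil(count / self.SEGMENTS_PER_PARTITION))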
Example #2
                df = hc.table(str(tablename))
                columns_list = df.columns
                for col in columns_list:
                    Profiler.run(df, col, f)
        except Exception as e:
            logging.error('ERROR found while processing the dataset: %s', e)


    def profiler2(self, table):
        from pyspark.sql.functions import col, count
        print("PROCESSING TABLE: " + str(table))
        table.persist()
        for coll in table.columns:
            print("Processing column " + coll + " in table " + str(table))
            # Top 10 most frequent values in the column
            table.groupBy(coll).agg(count(coll).alias('c')).orderBy(col('c').desc()).show(10, 1000)
            print("Finding max length for column: " + coll)
            # reduce() already returns a single value, so print it directly
            print(table.rdd.map(lambda x: len(str(x[coll]))).reduce(lambda x, y: x if x > y else y))
            print("Finding min length for column: " + coll)
            print(table.rdd.map(lambda x: len(str(x[coll]))).reduce(lambda x, y: x if x < y else y))
        table.unpersist()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # job_name, db and table are defined elsewhere in the original script
    conf = SparkConf().setAppName(job_name)
    sc = SparkContext(conf=conf)
    sc.setJobGroup(job_name, "PYTHON PROFILER")
    hc = SparkSession.builder.enableHiveSupport().getOrCreate()
    # hc = HiveContext(sc)
    prof = Profiler()
    prof.tableColumns(str(db), str(table))
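As a design note, the per-column length scan in profiler2 could also be expressed with DataFrame aggregations instead of an RDD reduce, which keeps the work in Spark SQL and avoids running a Python lambda per row. A rough sketch, not the original code; the session and table name below are illustrative:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.table("some_db.some_table")  # illustrative table name, not from the original

for c in df.columns:
    # min/max string length and distinct count for one column in a single aggregation
    df.agg(
        F.min(F.length(F.col(c).cast("string"))).alias("min_len"),
        F.max(F.length(F.col(c).cast("string"))).alias("max_len"),
        F.countDistinct(F.col(c)).alias("distinct_values"),
    ).show()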