from pyspark import SparkContext, StorageLevel
import logging


def _get_rdd(self, db, scan_fun):
    sc = SparkContext()
    # set job group so the job is labeled in the Spark UI
    sc.setJobGroup(self.job_id, self.job['type'])
    # Calculate number of partitions based on number of segments
    count = db.count_scan(self.get_langs(), self.job['params'].get('filter'))
    num_partitions = self.calc_num_partitions(count)
    logging.warning("Scan size: {}, number of partitions: {}".format(
        count, num_partitions))
    # Create RDD by parallelizing segments; parallelize expects a local
    # collection, so scan_fun should be the collection of segments to scan
    rdd = sc.parallelize(scan_fun, num_partitions)
    rdd.persist(StorageLevel.DISK_ONLY)
    return rdd
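The calc_num_partitions helper called above is not shown in the source. A minimal sketch of what it might look like, assuming a fixed target of segments per partition; SEGMENTS_PER_PARTITION and MAX_PARTITIONS are illustrative constants, not from the original:

SEGMENTS_PER_PARTITION = 10000  # assumed tuning constant
MAX_PARTITIONS = 1000  # assumed cap to limit scheduling overhead


def calc_num_partitions(self, count):
    # One partition per SEGMENTS_PER_PARTITION segments (ceiling division),
    # at least 1, capped at MAX_PARTITIONS
    return max(1, min(MAX_PARTITIONS, -(-count // SEGMENTS_PER_PARTITION)))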
import logging
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


class Profiler(object):

    def tableColumns(self, db, tablename):
        try:
            df = hc.table(str(tablename))
            columns_list = df.columns
            for col in columns_list:
                # Profiler.run and f are assumed to be defined elsewhere
                Profiler.run(df, col, f)
        except Exception as e:
            logging.error('ERROR found while processing the dataset: %s', e)

    def profiler2(self, table):
        from pyspark.sql.functions import count, col
        print("PROCESSING TABLE: " + str(table))
        table.persist()
        for coll in table.columns:
            print("Processing column " + coll + " in table " + str(table))
            # Top 10 most frequent values for the column
            table.groupBy(coll).agg(count(coll).alias('c')) \
                 .orderBy(col('c').desc()).show(10, 1000)
            print("Finding max length for column: " + coll)
            # reduce returns a single number, so print it directly
            max_len = table.rdd.map(lambda x: len(str(x[coll]))) \
                               .reduce(lambda x, y: x if x > y else y)
            print(max_len)
            print("Finding min length for column: " + coll)
            min_len = table.rdd.map(lambda x: len(str(x[coll]))) \
                               .reduce(lambda x, y: x if x < y else y)
            print(min_len)
        table.unpersist()


if __name__ == '__main__':
    # job_name, db and table were undefined in the original; reading them
    # from the command line here is an assumption about the intended usage
    job_name, db, table = sys.argv[1], sys.argv[2], sys.argv[3]
    conf = SparkConf().setAppName(job_name)
    sc = SparkContext(conf=conf)
    sc.setJobGroup(job_name, "PYTHON PROFILER")
    hc = SparkSession.builder.enableHiveSupport().getOrCreate()
    # hc = HiveContext(sc)
    prof = Profiler()
    prof.tableColumns(str(db), str(table))
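For reference, a hypothetical way to exercise profiler2 directly from an interactive PySpark session; the database and table names below are placeholders, not from the original:

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.table("default.my_table")  # placeholder Hive table name
Profiler().profiler2(df)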