Example #1
# module-level imports needed by this snippet
import logging

from pyspark import SparkContext, StorageLevel

 def _get_rdd(self, db, scan_fun):
     sc = SparkContext()
     # set job group
     sc.setJobGroup(self.job_id, self.job['type'])
     # Calculate number of partitions based on number of segments
     count = db.count_scan(self.get_langs(),
                           self.job['params'].get('filter'))
     num_partitions = self.calc_num_parititions(count)
     logging.warning("Scan size: {}, number of partitions: {}".format(
         count, num_partitions))  # Create RDD by parallelizing segments
     rdd = sc.parallelize(scan_fun, num_partitions)
     rdd.persist(StorageLevel.DISK_ONLY)
     return rdd
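The calc_num_parititions helper called above is not shown on this page. Below is a minimal sketch of what it might look like, assuming the goal is a roughly fixed number of segments per partition; the class name and target constant are illustrative, only the method name is taken from the call above:

import math

class ScanJob:  # hypothetical container class; the real class is not shown here
    SEGMENTS_PER_PARTITION = 1000  # illustrative target, not from the original code

    def calc_num_parititions(self, count):
        # At least one partition, even when the scan returns nothing
        return max(1, math.ceil(count / self.SEGMENTS_PER_PARTITION))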
Example #2
                df = hc.table(str(tablename))
                columns_list = df.columns
                for col in columns_list:
                    Profiler.run(df, col, f)
        except Exception as e:
            logging.error('ERROR found while processing the dataset: %s', e)


    def profiler2(self, table):
        from pyspark.sql.functions import col, count
        print("PROCESSING TABLE: " + str(table))
        table.persist()
        for coll in table.columns:
            print("Processing column " + coll + " in table " + str(table))
            # Top 10 most frequent values in the column
            table.groupBy(coll).agg(count(coll).alias('c')).orderBy(col('c').desc()).show(10, 1000)
            print("Finding max length for column: " + coll)
            # reduce() already returns a single value, so print it directly
            print(table.rdd.map(lambda x: len(str(x[coll]))).reduce(lambda x, y: x if x > y else y))
            print("Finding min length for column: " + coll)
            print(table.rdd.map(lambda x: len(str(x[coll]))).reduce(lambda x, y: x if x < y else y))
        table.unpersist()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # job_name, db and table are defined elsewhere in the original script
    conf = SparkConf().setAppName(job_name)
    sc = SparkContext(conf=conf)
    sc.setJobGroup(job_name, "PYTHON PROFILER")
    hc = SparkSession.builder.enableHiveSupport().getOrCreate()
    # hc = HiveContext(sc)
    prof = Profiler()
    prof.tableColumns(str(db), str(table))
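As a design note, the per-column length scan in profiler2 could also be expressed with DataFrame aggregations instead of an RDD reduce, which keeps the work in Spark SQL and avoids running a Python lambda per row. A rough sketch, not the original code; the session and table name below are illustrative:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.table("some_db.some_table")  # illustrative table name, not from the original

for c in df.columns:
    # min/max string length and distinct count for one column in a single aggregation
    df.agg(
        F.min(F.length(F.col(c).cast("string"))).alias("min_len"),
        F.max(F.length(F.col(c).cast("string"))).alias("max_len"),
        F.countDistinct(F.col(c)).alias("distinct_values"),
    ).show()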