from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext

from imutils import face_utils
from scipy.spatial import distance
import numpy as np
import imutils
import dlib
import cv2
import time

conf = SparkConf().setAppName("drowsy streaming").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file",
         "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")
sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool1")
ssc = StreamingContext(sc, 0.5)
sql_sc = SQLContext(sc)
input_topic = 'input'
output_topic = 'output1'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"
predictor_path = "/home/hduser/DrunkDetection/shape_predictor_68_face_landmarks.dat"


def my_decoder(s):
    # Identity decoder: pass the raw Kafka message bytes through unchanged.
    return s


def eye_aspect_ratio(eye):
    # Standard EAR: mean of the two vertical eye distances over the
    # horizontal distance between the eye corners.
    A = distance.euclidean(eye[1], eye[5])
    B = distance.euclidean(eye[2], eye[4])
    C = distance.euclidean(eye[0], eye[3])
    return (A + B) / (2.0 * C)
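
Below is a hedged sketch, not part of the original snippet, of how the pieces above could be wired together with the Spark 2.4 receiver-based Kafka API; the consumer group name and the per-frame dlib processing are assumptions.

from pyspark.streaming.kafka import KafkaUtils

# Assumed dlib setup; the snippet defines predictor_path but is cut off before using it.
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)

# Receiver-based stream: `brokers` above lists ZooKeeper nodes (port 2181),
# which is what createStream expects as its zkQuorum argument.
stream = KafkaUtils.createStream(ssc, brokers, "drowsy-consumer", {input_topic: 1},
                                 keyDecoder=my_decoder, valueDecoder=my_decoder)

# Placeholder action: the real job would decode each frame, detect the face,
# predict landmarks, and compute eye_aspect_ratio() for each eye.
stream.count().pprint()

ssc.start()
ssc.awaitTermination()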
Example #2
class TaskContextTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        self.sc = SparkContext('local[4, 2]', class_name)
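
    def tearDown(self):
        # Cleanup sketch: PySparkTestCase's default tearDown does essentially
        # this, stopping the context and restoring the sys.path saved in setUp.
        self.sc.stop()
        sys.path = self._old_sys_path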

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_resources(self):
        """Test the resources are empty by default."""
        rdd = self.sc.parallelize(range(10))
        resources1 = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
        # Test using the constructor directly rather than the get()
        resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0]
        self.assertEqual(len(resources1), 0)
        self.assertEqual(len(resources2), 0)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        # map() is lazy in Python 3, so iterate explicitly to run the assertions.
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]
        result = rdd.map(fail_on_first).collect()
        # The first partition should be re-submitted, while the other partitions should stay at attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        # filter()/map() are lazy in Python 3, so loop explicitly to run the assertions.
        for row in filter(lambda x: x[1] == 0, result):
            self.assertEqual(1, row[2])
        for row in filter(lambda x: x[1] != 0, result):
            self.assertEqual(0, row[2])
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertTrue(tc is None)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
            self.assertTrue(prop2 is None)
        finally:
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertTrue(max(times) - min(times) < 1)

    def test_all_gather(self):
        """
        Verify that BarrierTaskContext.allGather() performs global sync among all barrier tasks
        within a stage and passes messages properly.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            out = tc.allGather(str(tc.partitionId()))
            pids = [int(e) for e in out]
            return pids

        pids = rdd.barrier().mapPartitions(f).map(context_barrier).collect()[0]
        self.assertEqual(pids, [0, 1, 2, 3])

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = rdd.barrier().mapPartitions(f).map(lambda x: BarrierTaskContext.get()
                                                       .getTaskInfos()).collect()
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)

    def test_context_get(self):
        """
        Verify that TaskContext.get() works both in or not in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            taskContext = TaskContext.get()
            if isinstance(taskContext, BarrierTaskContext):
                yield taskContext.partitionId() + 1
            elif isinstance(taskContext, TaskContext):
                yield taskContext.partitionId() + 2
            else:
                yield -1

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [2, 3, 4, 5])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [1, 2, 3, 4])

    def test_barrier_context_get(self):
        """
        Verify that BarrierTaskContext.get() only works in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            try:
                taskContext = BarrierTaskContext.get()
            except Exception:
                yield -1
            else:
                yield taskContext.partitionId()

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [-1, -1, -1, -1])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [0, 1, 2, 3])
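
A minimal standalone sketch of the barrier pattern these tests exercise, outside the test harness (assumes a local master with at least four cores):

from pyspark import SparkContext
from pyspark.taskcontext import BarrierTaskContext

sc = SparkContext("local[4]", "barrier-demo")

def sync_partition(iterator):
    # Every task in the barrier stage blocks here until all of them arrive.
    tc = BarrierTaskContext.get()
    tc.barrier()
    yield (tc.partitionId(), sum(iterator))

print(sc.parallelize(range(8), 4).barrier().mapPartitions(sync_partition).collect())
sc.stop()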
Example #3
class TaskContextTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        self.sc = SparkContext('local[4, 2]', class_name)

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        # map() is lazy in Python 3, so iterate explicitly to run the assertions.
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]
        result = rdd.map(fail_on_first).collect()
        # The first partition should be re-submitted, while the other partitions should stay at attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        # filter()/map() are lazy in Python 3, so loop explicitly to run the assertions.
        for row in filter(lambda x: x[1] == 0, result):
            self.assertEqual(1, row[2])
        for row in filter(lambda x: x[1] != 0, result):
            self.assertEqual(0, row[2])
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertTrue(tc is None)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
            self.assertTrue(prop2 is None)
        finally:
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertTrue(max(times) - min(times) < 1)

    def test_barrier_with_python_worker_reuse(self):
        """
        Verify that BarrierTaskContext.barrier() works with a reused Python worker.
        """
        self.sc._conf.set("spark.python.worker.reuse", "true")
        rdd = self.sc.parallelize(range(4), 4)
        # Run a normal job first so that all the workers get started.
        result = rdd.map(lambda x: x ** 2).collect()
        self.assertEqual([0, 1, 4, 9], result)
        # Make sure `spark.python.worker.reuse=true` is what the context sees.
        self.assertEqual(self.sc._conf.get("spark.python.worker.reuse"), "true")

        # worker will be reused in this barrier job
        self.test_barrier()

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = rdd.barrier().mapPartitions(f).map(lambda x: BarrierTaskContext.get()
                                                       .getTaskInfos()).collect()
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)
Example #4
    prg = cl.Program(ctx, KERNEL_CODE).build()
    kernel = prg.Mul2
    mf = cl.mem_flags
    print("map" + str(data))

    np_data = []
    data_buf = []
    np_data.append(np.array(data[0]).astype(np.float32))
    data_buf.append(cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_data[0]))

    result = np.zeros((5,)).astype(np.float32)
    result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    kernel(queue, (5,), None, data_buf[0], result_buf)
    cl.enqueue_read_buffer(queue, result_buf, result).wait()
    return [result.astype(np.float32)]



if __name__ == "__main__":
    try:
        conf = SparkConf().setAppName("MyApp").setMaster("spark://10.20.21.194:7077")
        conf.set("spark.scheduler.allocation.file", "conf/fairscheduler.xml")
        sc = SparkContext(conf=conf)
        sc.setLocalProperty("spark.scheduler.pool", "pool1")
        distData = sc.parallelize([[[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]]]).map(map1).map(map2)
        result = distData.collect()
        for r in result:
            print(r[0].tolist())
    except:
        raise
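
The OpenCL snippet above is the body of a map function and relies on names defined elsewhere (KERNEL_CODE, ctx, queue, map1); here is a hedged sketch of what those might look like, with the kernel source being a guess that merely matches the Mul2 call:

import numpy as np
import pyopencl as cl
from pyspark import SparkConf, SparkContext

# Assumed kernel: doubles each element of the input buffer.
KERNEL_CODE = """
__kernel void Mul2(__global const float *a, __global float *out) {
    int gid = get_global_id(0);
    out[gid] = a[gid] * 2.0f;
}
"""

# Assumed OpenCL setup; the original presumably creates these once per worker.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


def map1(data):
    # Placeholder first stage (the real map1 is not shown): unwrap one nesting
    # level so the shapes line up with what map2 expects.
    return data[0]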
Example #5
class TaskContextTests(PySparkTestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        self.sc = SparkContext('local[4, 2]', class_name)

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(
            lambda x: TaskContext.get().attemptNumber()).collect()
        # map() is lazy in Python 3, so iterate explicitly to run the assertions.
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]

        result = rdd.map(fail_on_first).collect()
        # The first partition should be re-submitted, while the other partitions should stay at attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        # filter()/map() are lazy in Python 3, so loop explicitly to run the assertions.
        for row in filter(lambda x: x[1] == 0, result):
            self.assertEqual(1, row[2])
        for row in filter(lambda x: x[1] != 0, result):
            self.assertEqual(0, row[2])
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertTrue(tc is None)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(
                lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty(
                "otherkey")).collect()[0]
            self.assertTrue(prop2 is None)
        finally:
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertTrue(max(times) - min(times) < 1)

    def test_barrier_with_python_worker_reuse(self):
        """
        Verify that BarrierTaskContext.barrier() works with a reused Python worker.
        """
        self.sc._conf.set("spark.python.worker.reuse", "true")
        rdd = self.sc.parallelize(range(4), 4)
        # Run a normal job first so that all the workers get started.
        result = rdd.map(lambda x: x**2).collect()
        self.assertEqual([0, 1, 4, 9], result)
        # Make sure `spark.python.worker.reuse=true` is what the context sees.
        self.assertEqual(self.sc._conf.get("spark.python.worker.reuse"), "true")

        # worker will be reused in this barrier job
        self.test_barrier()

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = rdd.barrier().mapPartitions(f).map(
            lambda x: BarrierTaskContext.get().getTaskInfos()).collect()
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)
Example #6
def init_spark_context():
    # `conf` here is a project-specific configuration helper, not pyspark.conf.
    appConfig = conf.Config(exec_cores=8, cores_max=24, yarn_cores=2,
                            instances=6, queue='root.world-bdp-srm-analysts')
    sc = SparkContext(conf=appConfig.setSparkConf())
    # set resource pool
    sc.setLocalProperty("spark.scheduler.pool", "default")
    return sc
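
A hedged usage sketch for the helper above (assumes the project-specific `conf` module is importable and the named YARN queue exists):

sc = init_spark_context()
try:
    # Trivial job to confirm the context and the "default" fair-scheduler pool work.
    print(sc.parallelize(range(100)).sum())
finally:
    sc.stop()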