Example 1
 def test_partition_id(self):
     """Test the partition id."""
     rdd1 = self.sc.parallelize(range(10), 1)
     rdd2 = self.sc.parallelize(range(10), 2)
     pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
     pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
     self.assertEqual(0, pids1[0])
     self.assertEqual(0, pids1[9])
     self.assertEqual(0, pids2[0])
     self.assertEqual(1, pids2[9])
Example 2
 def test_stage_id(self):
     """Test the stage ids are available and incrementing as expected."""
     rdd = self.sc.parallelize(range(10))
     stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
     stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
     # Test using the constructor directly rather than the get()
     stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
     self.assertEqual(stage1 + 1, stage2)
     self.assertEqual(stage1 + 2, stage3)
     self.assertEqual(stage2 + 1, stage3)
Example 3
 def test_get_local_property(self):
     """Verify that local properties set on the driver are available in TaskContext."""
     key = "testkey"
     value = "testvalue"
     self.sc.setLocalProperty(key, value)
     try:
         rdd = self.sc.parallelize(range(1), 1)
         prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
         self.assertEqual(prop1, value)
         prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
         self.assertTrue(prop2 is None)
     finally:
         self.sc.setLocalProperty(key, None)
Example 4
        def fn(rows):
            import math
            import tensorflow as tf
            import tensorflow.keras.backend as K

            if GPU_INFERENCE_ENABLED:
                from pyspark import TaskContext
                config = tf.ConfigProto()
                config.gpu_options.allow_growth = True
                config.gpu_options.visible_device_list = TaskContext.get().resources()['gpu'].addresses[0]
                K.set_session(tf.Session(config=config))
            else:
                # Do not use GPUs for prediction, use single CPU core per task.
                config = tf.ConfigProto(device_count={'GPU': 0})
                config.inter_op_parallelism_threads = 1
                config.intra_op_parallelism_threads = 1
                K.set_session(tf.Session(config=config))

            # Restore from checkpoint.
            model = deserialize_model(model_bytes, tf.keras.models.load_model)

            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()
                # Convert from log domain to real Sales numbers.
                log_sales = model.predict_on_batch([[row[col]] for col in all_cols])[0]
                # Add 'Sales' column with prediction results.
                fields['Sales'] = math.exp(log_sales)
                yield Row(**fields)
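The snippet above pins each prediction task to the GPU address handed out by Spark's resource scheduler through the TF1 session config. As a rough, framework-agnostic sketch (the helper name pin_task_gpu is illustrative, not part of the example), the same idea can be expressed by exporting CUDA_VISIBLE_DEVICES before the ML library initializes:

import os
from pyspark import TaskContext

def pin_task_gpu():
    """Restrict this task to the GPU address assigned by Spark's resource scheduler."""
    ctx = TaskContext.get()
    resources = ctx.resources() if ctx else {}
    if 'gpu' in resources:
        os.environ['CUDA_VISIBLE_DEVICES'] = resources['gpu'].addresses[0]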
Example 5
    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        # map() is lazy in Python 3, so loop explicitly to actually run the assertions
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]
        result = rdd.map(fail_on_first).collect()
        # The first partition should be re-submitted (attempt 1), while the other partitions stay at attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        first_partition = [x for x in result if x[1] == 0]
        for x in first_partition:
            self.assertEqual(1, x[2])
        other_partitions = [x for x in result if x[1] != 0]
        for x in other_partitions:
            self.assertEqual(0, x[2])
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])
Example 6
    def _process_partition(messages):
        offset = offsets[TaskContext.get().partitionId()]
        result = defaultdict(float)

        for (_, message) in messages:
            price = sum(item['total_price_paid'] for item in message['items'])
            result[message['store_id']] += price

        engine = create_engine(url)

        # avoid transactional deadlock
        result = sorted(result.items())

        with engine.begin() as conn:
            for store_id, price in result:
                conn.execute(text(SALES_UPSERT_QUERY),
                             store_id=store_id,
                             date=timestr,
                             total_sales_price=price)

            conn.execute(text(OFFSET_UPSERT_QEURY),
                         topic=offset.topic,
                         partition=offset.partition,
                         offset=offset.untilOffset)

        return [len(result)]
Example 7
        def context(iterator):
            tp = TaskContext.get().partitionId()
            try:
                bp = BarrierTaskContext.get().partitionId()
            except Exception:
                bp = -1

            yield (tp, bp, os.getpid())
Example 8
 def f(iterator):
     taskContext = TaskContext.get()
     if isinstance(taskContext, BarrierTaskContext):
         yield taskContext.partitionId() + 1
     elif isinstance(taskContext, TaskContext):
         yield taskContext.partitionId() + 2
     else:
         yield -1
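A hypothetical driver-side usage of f (assuming a local SparkContext named sc; these names are not part of the original snippet): in a normal stage TaskContext.get() returns a plain TaskContext, while inside a barrier stage it returns a BarrierTaskContext, so the two runs below yield partitionId + 2 and partitionId + 1 respectively.

from pyspark import SparkContext

sc = SparkContext("local[2]", "context-type-demo")
rdd = sc.parallelize(range(4), 2)
print(rdd.mapPartitions(f).collect())            # plain TaskContext: partitionId + 2
print(rdd.barrier().mapPartitions(f).collect())  # BarrierTaskContext: partitionId + 1
sc.stop()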
Example 9
def _get_property_from_spark_context(key):
    try:
        from pyspark import TaskContext  # pylint: disable=import-error
        task_context = TaskContext.get()
        if task_context:
            return task_context.getLocalProperty(key)
    except Exception:  # pylint: disable=broad-except
        return None
Example 10
 def test_resources(self):
     """Test the resources are empty by default."""
     rdd = self.sc.parallelize(range(10))
     resources1 = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
     # Test using the constructor directly rather than the get()
     resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0]
     self.assertEqual(len(resources1), 0)
     self.assertEqual(len(resources2), 0)
Example 11
 def test_resources(self):
     """Test the resources are available."""
     rdd = self.sc.parallelize(range(10))
     resources = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
     self.assertEqual(len(resources), 1)
     self.assertTrue('gpu' in resources)
     self.assertEqual(resources['gpu'].name, 'gpu')
     self.assertEqual(resources['gpu'].addresses, ['0'])
Example 12
 def fail_on_first(x):
     """Fail on the first attempt so we get a positive attempt number"""
     tc = TaskContext.get()
     attempt_number = tc.attemptNumber()
     partition_id = tc.partitionId()
     attempt_id = tc.taskAttemptId()
     if attempt_number == 0 and partition_id == 0:
         raise Exception("Failing on first attempt")
     else:
         return [x, partition_id, attempt_number, attempt_id]
Example 13
def get_partition_attempt_id():
    """Returns partitionId and attemptNumber of the task context, when invoked
    on a spark executor.
    PartitionId is ID of the RDD partition that is computed by this task.
    The first task attempt will be assigned attemptNumber = 0, and subsequent
    attempts will have increasing attempt numbers.
    Returns:
        partitionId, attemptNumber -- [description]
    """
    task_context = TaskContext.get()
    return task_context.partitionId(), task_context.attemptNumber()
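A minimal usage sketch (the SparkContext setup and the small four-element RDD are assumptions for illustration, not part of the original helper): because TaskContext.get() returns None on the driver, the helper has to be invoked inside an executor-side closure such as mapPartitions.

from pyspark import SparkContext, TaskContext

sc = SparkContext("local[2]", "partition-attempt-demo")

def get_partition_attempt_id():
    task_context = TaskContext.get()
    return task_context.partitionId(), task_context.attemptNumber()

pairs = sc.parallelize(range(4), 2).mapPartitions(
    lambda _: [get_partition_attempt_id()]).collect()
print(pairs)  # e.g. [(0, 0), (1, 0)] on a successful first attempt
sc.stop()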
Example 14
def _transform_to_slices(rdds):
    taskcontext = TaskContext.get()
    partitionid = taskcontext.partitionId()
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    print("working with partition: ", partitionid, max_partition_num, num_rows)
    examples = []
    # fall back to single-row batches when batch_size is not configured
    effective_batch_size = batch_size if batch_size is not None else 1
    for start_ind in range(0, num_rows, effective_batch_size):  # for each batch
        if start_ind + effective_batch_size - 1 > num_rows:  # if we'd run out of rows
            csv_slice = csv.iloc[start_ind:]
            print("last Example has: ", len(csv_slice), partitionid)
            examples.append((csv_slice, len(csv_slice)))
            return examples
        else:
            csv_slice = csv.iloc[start_ind:start_ind + effective_batch_size]
        examples.append((csv_slice, len(csv_slice)))
    return examples
Example 15
 def test_cpus(self):
     """Test the cpus are available."""
     rdd = self.sc.parallelize(range(10))
     cpus = rdd.map(lambda x: TaskContext.get().cpus()).take(1)[0]
     self.assertEqual(cpus, 2)
Example 16
 def test_tc_on_driver(self):
     """Verify that getting the TaskContext on the driver returns None."""
     tc = TaskContext.get()
     self.assertTrue(tc is None)
Example 17
        def _get_gpus(cluster_spec=None):
            gpus = []
            is_k8s = 'SPARK_EXECUTOR_POD_IP' in os.environ

            # handle explicitly configured tf_args.num_gpus
            if 'num_gpus' in tf_args:
                requested_gpus = tf_args.num_gpus
                user_requested = True
            else:
                requested_gpus = 0
                user_requested = False

            # first, try Spark 3 resources API, returning all visible GPUs
            # note: num_gpus arg is only used (if supplied) to limit/truncate visible devices
            if _has_spark_resource_api():
                from pyspark import TaskContext
                context = TaskContext.get()
                if context:
                    resources = context.resources()
                    if resources and 'gpu' in resources:
                        # get all GPUs assigned by resource manager
                        gpus = context.resources()['gpu'].addresses
                        logger.info("Spark gpu resources: {}".format(gpus))
                        if user_requested:
                            if requested_gpus < len(gpus):
                                # override/truncate list, if explicitly configured
                                logger.warn(
                                    "Requested {} GPU(s), but {} available".
                                    format(requested_gpus, len(gpus)))
                                gpus = gpus[:requested_gpus]
                        else:
                            # implicitly requested by Spark 3
                            requested_gpus = len(gpus)

            # if not in K8s pod and GPUs available, just use original allocation code (defaulting to 1 GPU if available)
            # Note: for K8s, there is a bug with the Nvidia device_plugin which can show GPUs for non-GPU pods that are hosted on GPU nodes
            if not is_k8s and gpu_info.is_gpu_available() and not gpus:
                # default to one GPU if not specified explicitly
                requested_gpus = max(
                    1,
                    requested_gpus) if not user_requested else requested_gpus
                if requested_gpus > 0:
                    if cluster_spec:
                        # compute my index relative to other nodes on the same host (for GPU allocation)
                        my_addr = cluster_spec[job_name][task_index]
                        my_host = my_addr.split(':')[0]
                        flattened = [
                            v for sublist in cluster_spec.values()
                            for v in sublist
                        ]
                        local_peers = [
                            p for p in flattened if p.startswith(my_host)
                        ]
                        my_index = local_peers.index(my_addr)
                    else:
                        my_index = 0

                    # try to allocate a GPU
                    gpus = gpu_info.get_gpus(requested_gpus,
                                             my_index,
                                             format=gpu_info.AS_LIST)

            if user_requested and len(gpus) < requested_gpus:
                raise Exception(
                    "Unable to allocate {} GPU(s) from available GPUs: {}".
                    format(requested_gpus, gpus))

            gpus_to_use = ','.join(gpus)
            if gpus:
                logger.info(
                    "Requested {} GPU(s), setting CUDA_VISIBLE_DEVICES={}".
                    format(requested_gpus if user_requested else len(gpus),
                           gpus_to_use))
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use
Example 18
 def _get_resources(self):
     if LooseVersion(pyspark.__version__) >= LooseVersion('3.0.0'):
         from pyspark import TaskContext
         return TaskContext.get().resources()
     return dict()
Example 19
 def _get_spark_task_context_or_none():
     try:
         from pyspark import TaskContext  # pylint: disable=import-error
         return TaskContext.get()
     except ImportError:
         return None
Example 20
        print("My custom profiles for RDD:%s" % id)


conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext('local', 'test', conf=conf, profiler_cls=MyCustomProfiler)
sc.parallelize(range(1000)).map(lambda x: 2 * x).take(10)
sc.parallelize(range(1000)).count()
sc.show_profiles()
# My custom profiles for RDD:1
# My custom profiles for RDD:3
sc.stop()

print(
    "-----TaskContext-----RDDBarrier----BarrierTaskContext----BarrierTaskInfo--------------"
)
tc = TaskContext.get()  # Returns the currently active TaskContext; call it inside user functions to access context information about the running task. On the driver it returns None.
if tc:
    print(tc.attemptNumber())
    print(tc.getLocalProperty("key"))
    print(tc.partitionId())
    print(tc.resources())
    print(tc.stageId())
    print(tc.taskAttemptId())

# RDDBarrier (experimental) wraps an RDD in a barrier to enable barrier execution (barrier scheduling).
# Spark introduced the barrier scheduler to support deep learning workloads.
# b = rdd.barrier()  # equivalent to RDDBarrier(rdd); requires an active SparkContext and a defined rdd
# b.mapPartitions()
# b.mapPartitionsWithIndex()

# bt = BarrierTaskContext.get()
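The commented-out calls above only hint at the barrier API. A minimal, self-contained sketch of barrier execution (the local[2] master, the stage function name, and the small RDD are illustrative assumptions, not taken from the snippet) might look like this:

from pyspark import SparkContext, BarrierTaskContext

sc = SparkContext("local[2]", "barrier-demo")
rdd = sc.parallelize(range(4), 2)

def stage(iterator):
    ctx = BarrierTaskContext.get()   # only valid inside a barrier stage
    ctx.barrier()                    # global sync: wait for every task in the stage
    yield (ctx.partitionId(), list(iterator))

print(rdd.barrier().mapPartitions(stage).collect())
sc.stop()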