def task_info(*_):
    """Describe the Spark task in which this call is executed.

    Any positional arguments are accepted and ignored.

    :return: one-element list containing a string with the stage id and
        partition id reported by ``TaskContext`` and the local host name.
    :rtype: list
    """
    context = TaskContext()
    stage = context.stageId()
    part = context.partitionId()
    host = socket.gethostname()
    return [f"Stage: {stage}, Partition: {part}, Host: {host}"]
def process_spark_partitions(partition):
    """
    Run ``process_line_spark`` over every entry of one Spark partition.

    The taxonomy and the gensim model are fetched once per call. The word set
    handed to ``process_line_spark`` is the union of the model vocabulary keys
    and the keys of every domain dictionary in the taxonomy.

    :param partition: entries of the partition being processed
    :type partition: iterable
    :return: all records produced by ``process_line_spark``, in entry order
    :rtype: list
    """
    # Look the partition id up once; it is only used for the two log lines.
    partition_id = TaskContext().partitionId()
    logger.info(f"start_processing_partition partitionId={partition_id}")

    big_taxo = TaxonomyWrapper.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    # TODO: move the model loading below to process_partitions
    gensim_model = GensimMagic.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    de_vocab = gensim_model["vocab"]
    de_model = gensim_model["model"]

    # Grow one set in place instead of rebuilding it with union() per domain.
    words_list = set(de_vocab.keys())
    for _domain, domain_dict in big_taxo.items():
        words_list.update(domain_dict.keys())

    all_records = [
        record
        for entry in partition
        for record in process_line_spark(entry, big_taxo, de_model, de_vocab, words_list)
    ]
    logger.info(
        f"end_processing_partition partitionId={partition_id}. processed: {len(all_records)} records"
    )
    return all_records
def test_partition_id(self):
    """Check the partition ids seen by tasks of one- and two-partition RDDs."""

    def current_partition(_):
        return TaskContext.get().partitionId()

    single = self.sc.parallelize(range(10), 1)
    double = self.sc.parallelize(range(10), 2)
    single_ids = single.map(current_partition).collect()
    double_ids = double.map(current_partition).collect()

    # With one partition every element reports partition 0.
    self.assertEqual(0, single_ids[0])
    self.assertEqual(0, single_ids[9])
    # With two partitions the first element is in 0 and the last in 1.
    self.assertEqual(0, double_ids[0])
    self.assertEqual(1, double_ids[9])
def test_stage_id(self):
    """Check that stage ids are exposed and grow by one with each job."""

    def stage_via_get(_):
        return TaskContext.get().stageId()

    def stage_via_constructor(_):
        # Uses the constructor directly instead of get().
        return TaskContext().stageId()

    numbers = self.sc.parallelize(range(10))
    first = numbers.map(stage_via_get).take(1)[0]
    second = numbers.map(stage_via_get).take(1)[0]
    third = numbers.map(stage_via_constructor).take(1)[0]

    self.assertEqual(first + 1, second)
    self.assertEqual(first + 2, third)
    self.assertEqual(second + 1, third)
def test_get_local_property(self):
    """Check that driver-side local properties are readable from TaskContext."""
    key, value = "testkey", "testvalue"
    self.sc.setLocalProperty(key, value)
    try:
        rdd = self.sc.parallelize(range(1), 1)

        def read_property(name):
            # Fetch the property inside a task and bring it back to the driver.
            return rdd.map(lambda _: TaskContext.get().getLocalProperty(name)).collect()[0]

        self.assertEqual(read_property(key), value)
        # A key that was never set comes back as None.
        self.assertTrue(read_property("otherkey") is None)
    finally:
        # Clear the property again whether or not the assertions passed.
        self.sc.setLocalProperty(key, None)
def test_attempt_number(self):
    """Verify the attempt numbers are correctly reported.

    The per-element checks are written as explicit loops. In Python 3
    ``map`` and ``filter`` return lazy iterators, so assertions wrapped in an
    unconsumed ``map(...)`` call would never execute.
    """
    rdd = self.sc.parallelize(range(10))

    # Verify a simple job with no failures: every task is on attempt 0.
    attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
    for attempt in attempt_numbers:
        self.assertEqual(0, attempt)

    def fail_on_first(x):
        """Fail on the first attempt so we get a positive attempt number"""
        tc = TaskContext.get()
        attempt_number = tc.attemptNumber()
        partition_id = tc.partitionId()
        attempt_id = tc.taskAttemptId()
        if attempt_number == 0 and partition_id == 0:
            raise Exception("Failing on first attempt")
        return [x, partition_id, attempt_number, attempt_id]

    result = rdd.map(fail_on_first).collect()

    # The first partition should be re-submitted, so it reports attempt 1;
    # all other partitions should still be on attempt 0.
    # NOTE(review): the [9, 3, 0] expectation presumes the last element lands
    # in partition 3, i.e. a default parallelism of 4 -- confirm against the
    # test's SparkContext configuration.
    self.assertEqual([0, 0, 1], result[0][0:3])
    self.assertEqual([9, 3, 0], result[9][0:3])
    for record in result:
        expected_attempt = 1 if record[1] == 0 else 0
        self.assertEqual(expected_attempt, record[2])

    # The task attempt id should be different
    self.assertNotEqual(result[0][3], result[9][3])
def process_spark_partitions(partition):
    """
    Run ``process_line_spark`` over every entry of one Spark partition.

    :param partition: entries of the partition being processed
    :type partition: iterable
    :return: all records produced by ``process_line_spark``, in entry order
    :rtype: list
    """
    task = TaskContext()
    logger.info(f"start_processing_partitionId={task.partitionId()}")

    # Flatten the per-entry results into one list.
    collected = [record for entry in partition for record in process_line_spark(entry)]

    message = (
        f"end_processing_partition partitionId={task.partitionId()}. "
        f"processed: {len(collected)} records"
    )
    logger.info(message)
    return collected
def fail_on_first(x):
    """Raise on the first attempt of partition 0 so a retry gets a positive attempt number.

    :param x: element being mapped
    :return: ``[x, partition id, attempt number, task attempt id]``
    :raises Exception: when run as attempt 0 of partition 0
    """
    context = TaskContext.get()
    attempt = context.attemptNumber()
    partition = context.partitionId()
    task_attempt = context.taskAttemptId()

    is_first_try_of_first_partition = attempt == 0 and partition == 0
    if is_first_try_of_first_partition:
        raise Exception("Failing on first attempt")
    return [x, partition, attempt, task_attempt]
def save_spark_pandas_to_parquet(output, out_dir):
    """Write the rows of each dataset in ``output`` to a separate parquet file.

    For every distinct value ``ds`` of ``output.dataset`` the matching rows are
    saved to ``{out_dir}/{ds}/part_{partitionId}.parquet``. The partition id
    comes from the current Spark ``TaskContext``.

    :param output: table with a ``dataset`` column; presumably a pandas
        DataFrame, since ``unique``, ``shape`` and ``to_parquet`` are used
        (confirm against the caller)
    :param out_dir: base directory under which the per-dataset folders are
        created
    :return: None
    """
    from pyspark import TaskContext

    name = f"part_{TaskContext().partitionId()}"
    for ds in output.dataset.unique():
        df = output[output.dataset == ds]
        if df.shape[0] == 0:
            # Skip only this dataset. A value that does not compare equal to
            # itself (e.g. NaN) selects no rows, and returning here would
            # silently drop every remaining dataset.
            continue
        # mkdir is a helper defined outside this block.
        mkdir(f"{out_dir}/{ds}")
        path = f"{out_dir}/{ds}/{name}.parquet"
        df.to_parquet(path=path)
        print(f"Saved to {path}")
def test_tc_on_driver(self):
    """Check that asking for the TaskContext on the driver yields None."""
    self.assertTrue(TaskContext.get() is None)
from pyspark import SparkContext
from pyspark import TaskContext

if __name__ == '__main__':
    sc = SparkContext()
    try:
        rdd = sc.parallelize(["这", "是", "一", "首", "简", "单", "的", "小", "情", "歌"], 3)

        # Similar to map, but map works on each element while mapPartitions
        # works on each partition. The function passed to mapPartitions should
        # take an iterator and return an iterator.
        def f(elements):
            # Look up the TaskContext inside the task rather than capturing a
            # driver-side instance in the closure.
            yield "".join(elements) + str(TaskContext.get().partitionId())

        mapPartitions_rdd = rdd.mapPartitions(f)
        print(mapPartitions_rdd.collect())
        # ['这是一0', '首简单1', '的小情歌2']
    finally:
        # Release the SparkContext even if the job fails.
        sc.stop()