def test_set_if_missing(self):
    conf = SparkConf()
    conf.set(self.RANDOM_KEY, self.RANDOM_VALUE)
    conf.setIfMissing(self.RANDOM_KEY, self.RANDOM_VALUE2)
    self.assertEqual(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
    conf.setIfMissing(self.RANDOM_KEY2, self.RANDOM_VALUE2)
    self.assertEqual(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
def test_contains(self):
    conf = SparkConf()
    conf.setAll(
        pairs=[(self.RANDOM_KEY, self.RANDOM_VALUE),
               (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
    )
    self.assertTrue(conf.contains(self.RANDOM_KEY))
    self.assertTrue(conf.contains(self.RANDOM_KEY2))
def test_set_all(self):
    conf = SparkConf()
    conf.setAll(
        pairs=[(self.RANDOM_KEY, self.RANDOM_VALUE),
               (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
    )
    self.assertEqual(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
    self.assertEqual(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
def test_set_executor_env2(self):
    conf = SparkConf()
    conf.setExecutorEnv(
        key=self.RANDOM_KEY, value=self.RANDOM_VALUE,
        pairs=[(self.RANDOM_KEY2, self.RANDOM_VALUE2)]
    )
    self.assertEqual(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
    self.assertEqual(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
def test_named_properties(self):
    conf = SparkConf()
    conf.setMaster(self.RANDOM_VALUE)
    self.assertEqual(conf.get('master'), self.RANDOM_VALUE)
    conf.setAppName(self.RANDOM_VALUE)
    self.assertEqual(conf.get('appName'), self.RANDOM_VALUE)
    conf.setSparkHome(self.RANDOM_VALUE)
    self.assertEqual(conf.get('sparkHome'), self.RANDOM_VALUE)
def spark_ctx():
    """A simple spark context."""
    if IF_DUMMY_SPARK:
        from dummy_spark import SparkConf, SparkContext
        conf = SparkConf()
        ctx = SparkContext(master='', conf=conf)
    else:
        from pyspark import SparkConf, SparkContext
        conf = SparkConf().setMaster('local[2]').setAppName('drudge-unittest')
        ctx = SparkContext(conf=conf)
    return ctx
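# Hedged usage sketch (not from the original source, test name hypothetical):
# the helper above can be called directly to obtain a context for a quick
# smoke test, regardless of whether the dummy or the real pyspark backend is
# selected; parallelize/map/collect is supported by both.
def test_spark_ctx_smoke():
    ctx = spark_ctx()
    doubled = ctx.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()
    assert sorted(doubled) == [2, 4, 6]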
def test_minion_perform_deliver_success():
    workflow_id = '6666'
    app_id = '1000'
    job_id = '1'
    out_queue = 'queue_2000'

    sconf = SparkConf()
    sc = SparkContext(master='', conf=sconf)
    rdd = sc.parallelize(get_records())
    df0 = DataFrame(rdd=rdd)

    with mock.patch('redis.StrictRedis',
                    mock_strict_redis_client) as mocked_redis:
        redis_conn = mocked_redis()
        state_control = StateControlRedis(redis_conn)

        data = {
            'workflow_id': workflow_id,
            'app_id': app_id,
            'job_id': job_id,
            'type': 'deliver',
            'task_id': '033f-284ab-28987e',
            'port': 'port0',
            'output': out_queue,
            'workflow': ''
        }
        state_control.push_app_queue(app_id, json.dumps(data))

        minion = SparkMinion(redis_conn=redis_conn,
                             workflow_id=workflow_id,
                             app_id=app_id,
                             config=config)
        minion._emit_event = dummy_emit_event
        minion._state = {
            data['task_id']: {
                'port0': {'output': df0, 'sample': []},
                'time': 35.92
            }
        }
        minion._process_message()

        # Discard first status message
        state_control.pop_app_output_queue(app_id, False)

        msg = json.loads(state_control.pop_app_output_queue(app_id, False))
        assert msg['status'] == 'SUCCESS', 'Invalid status'
        assert msg['code'] == minion.MNN002[0], 'Invalid code'

        # CSV data
        csv_records = '\n'.join(
            map(dataframe_util.convert_to_csv, get_records()))
        result = json.loads(state_control.pop_queue(out_queue, False))
        assert result['sample'] == csv_records, 'Wrong CSV generated'
# -*- coding: utf-8 -*-
import os
import random

from dummy_spark import SparkContext, SparkConf
from dummy_spark.sql import SQLContext

__author__ = 'willmcginnis'

# make a spark conf
sconf = SparkConf()

# set some property (won't do anything)
sconf.set('spark.executor.extraClassPath', 'foo')

# use the spark conf to make a spark context
sc = SparkContext(master='', conf=sconf)

# set the log level (also doesn't do anything)
sc.setLogLevel('INFO')

# maybe make a useless sqlcontext (nothing implemented here yet)
sqlctx = SQLContext(sc)

# addPyFile just appends to the sys path
sc.addPyFile(os.path.dirname(__file__))

# do some hadoop configuration into the ether
sc._jsc.hadoopConfiguration().set('foo', 'bar')
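# Hedged continuation of the demo above (not in the original script): the
# dummy context supports the same in-memory RDD pipeline the test suite
# exercises, so a tiny parallelize/map/collect chain runs against it as-is.
squares = sc.parallelize(list(range(5))).map(lambda x: x ** 2).collect()
print(squares)  # [0, 1, 4, 9, 16]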
class RDDTests(unittest.TestCase):
    SPARK_CONTEXT = SparkContext(master='', conf=SparkConf())
    TEST_RANGES = [
        (0, 0, 1),
        (0, 10, 1),
        (0, 10, 2),
        (0, 100, 13),
        (0, 1000, 17),
        (0, 10000, 31),
    ]
    SAMPLE_FRACTION = 0.10
    SAMPLE_SEED = 1234

    def test_init(self):
        for start, stop, step in self.TEST_RANGES:
            l = list(range(start, stop, step))
            rdd = RDD(l, self.SPARK_CONTEXT)
            self.assertEqual(l, rdd.collect())
        s = set(range(100))
        rdd = RDD(s, self.SPARK_CONTEXT)
        self.assertEqual(sorted(list(s)), sorted(rdd.collect()))
        t = (1, 2, 3)
        with self.assertRaises(AttributeError):
            RDD(t, self.SPARK_CONTEXT)
        with self.assertRaises(AttributeError):
            RDD('', self.SPARK_CONTEXT)

    def test_ctx(self):
        rdd = RDD([], self.SPARK_CONTEXT)
        self.assertEqual(rdd.ctx, self.SPARK_CONTEXT)

    @staticmethod
    def square(x):
        return x ** 2

    def test_map(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = map(RDDTests.square, l1)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.map(RDDTests.square)
            self.assertEqual(rdd.collect(), list(l2))

    @staticmethod
    def triplicate(x):
        return [x, x, x]

    def test_flat_map(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = map(RDDTests.triplicate, l1)
            l3 = []
            for sl in l2:
                l3.extend(sl)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.flatMap(RDDTests.triplicate)
            self.assertEqual(rdd.collect(), list(l3))

    @staticmethod
    def is_square(x):
        return x == x ** 2

    def test_filter(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = filter(RDDTests.is_square, l1)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.filter(RDDTests.is_square)
            self.assertEqual(rdd.collect(), list(l2))

    @staticmethod
    def return_one(x):
        return x - x + 1

    def test_distinct(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            rdd = rdd.map(RDDTests.return_one)
            rdd = rdd.distinct()
            if len(l) > 0:
                self.assertEqual(rdd.collect(), [1])
            else:
                self.assertEqual(rdd.collect(), [])

    def test_sample_with_replacement(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample = rdd.sample(True, self.SAMPLE_FRACTION).collect()
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_with_replacement_with_seed(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample1 = rdd.sample(True, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            sample2 = rdd.sample(True, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            self.assertEqual(sorted(sample1), sorted(sample2))
            sample = sample1
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_without_replacement(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample = rdd.sample(False, self.SAMPLE_FRACTION).collect()
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            self.assertEqual(sorted(l), sorted(set(l)))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_without_replacement_with_seed(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample1 = rdd.sample(False, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            sample2 = rdd.sample(False, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            self.assertEqual(sorted(sample1), sorted(sample2))
            sample = sample1
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            self.assertEqual(sorted(l), sorted(set(l)))
            for item in sample:
                self.assertTrue(item in l)

    def test_union(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.union(rdd2)
                self.assertEqual(sorted(rdd.collect()),
                                 sorted(list(l1) + list(l2)))

    def test_intersection(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.intersection(rdd2)
                self.assertEqual(sorted(rdd.collect()),
                                 sorted([x for x in l1 if x in l2]))

    def test_group_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.groupByKey()
        r = rdd.collect()
        r = [(kv[0], list(kv[1])) for kv in r]
        self.assertEqual(sorted(r),
                         sorted([(1, [1]), (2, [1, 2]), (3, [1, 2, 3])]))

    def test_reduce_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        print(rdd)
        self.assertEqual(sorted(rdd.collect()),
                         sorted([(1, 1), (2, 3), (3, 6)]))

    def test_cartesian(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.cartesian(rdd2)
                r = rdd.collect()
                self.assertEqual(len(r), len(l1) * len(l2))
                for t, u in r:
                    self.assertTrue(t in l1)
                    self.assertTrue(u in l2)

    def test_cogroup(self):
        l1 = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        l2 = [(2, 10), (2, 20), (3, 10), (3, 20), (3, 30), (4, 40)]
        rdd1 = RDD(l1, self.SPARK_CONTEXT)
        rdd2 = RDD(l2, self.SPARK_CONTEXT)
        rdd = rdd1.cogroup(rdd2)
        l = rdd.collect()
        self.assertEqual(
            sorted(l),
            sorted([(1, [1], []), (2, [1, 2], [10, 20]),
                    (3, [1, 2, 3], [10, 20, 30]), (4, [], [40])]))

    def test_word_count_1(self):
        lines = [
            'grape banana apple',
        ]
        expected_output = [
            ('apple', 1),
            ('banana', 1),
            ('grape', 1),
        ]
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        output = rdd.collect()
        self.assertEqual(sorted(output), sorted(expected_output))

    def test_word_count_2(self):
        lines = [
            'apple',
            'apple banana',
            'apple banana',
            'apple banana grape',
        ]
        expected_output = [
            ('apple', 4),
            ('banana', 3),
            ('grape', 1),
        ]
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        output = rdd.collect()
        self.assertEqual(sorted(output), sorted(expected_output))

    def test_word_count_3(self):
        lines = [
            'apple',
            'apple banana',
            'apple banana',
            'apple banana grape',
            'banana grape',
            'banana',
        ]
        expected_output = [
            ('apple', 4),
            ('banana', 5),
            ('grape', 2),
        ]
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        output = rdd.collect()
        self.assertEqual(sorted(output), sorted(expected_output))

    def test_left_outer_join(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd1 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4])])
        rdd2 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4]),
                               ('B', [4, 5, 6])])
        out = rdd1.leftOuterJoin(rdd2).collect()
        print(out)
        self.assertEqual(len(out), 2)

    def test_keys(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        self.assertListEqual(rdd.keys().collect(), ['A', 'B', 'C'])

    def test_values(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        self.assertListEqual(rdd.values().collect(), [1, 2, 3])

    def test_combineByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([
            ('A', 1), ('B', 2), ('B', 3),
            ('C', 4), ('C', 5), ('A', 6),
        ])

        def create_combiner(a):
            return [a]

        def merge_value(a, b):
            a.append(b)
            return a

        def merge_combiners(a, b):
            a.extend(b)
            return a

        rdd = rdd.combineByKey(create_combiner, merge_value, merge_combiners)
        self.assertListEqual(
            rdd.collect(),
            [('A', [1, 6]), ('B', [2, 3]), ('C', [4, 5])],
        )

    def test_sortByKey_ascending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([
            ('e', 5), ('d', 4), ('c', 3), ('b', 2), ('a', 1),
        ]).sortByKey(ascending=True))
        self.assertListEqual(
            rdd.collect(),
            [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)],
        )

    def test_sortByKey_descending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([
            ('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5),
        ]).sortByKey(ascending=False))
        self.assertListEqual(
            rdd.collect(),
            [('e', 5), ('d', 4), ('c', 3), ('b', 2), ('a', 1)],
        )

    def test_sortBy_ascending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([5, 4, 3, 2, 1]).sortBy(lambda x: x,
                                                     ascending=True)
        self.assertListEqual(rdd.collect(), [1, 2, 3, 4, 5])

    def test_sortBy_descending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([1, 2, 3, 4, 5]).sortBy(lambda x: x,
                                                     ascending=False)
        self.assertListEqual(rdd.collect(), [5, 4, 3, 2, 1])

    def test_subtractByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd1 = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        rdd2 = sc.parallelize([('A', None), ('C', None)])
        self.assertListEqual(rdd1.subtractByKey(rdd2).collect(), [('B', 2)])

    def test_not_implemented_methods(self):
        # Every still-unimplemented RDD method should raise
        # NotImplementedError; the (method, args) table below preserves the
        # original call signatures while avoiding dozens of repeated blocks.
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([])
        calls = [
            (rdd._pickled, ()),
            (rdd.mapPartitionsWithIndex, (None, None)),
            (rdd._computeFractionForSampleSize, (None, None, None)),
            (rdd.pipe, (None, None)),
            (rdd.reduce, (None,)),
            (rdd.treeReduce, (None, None)),
            (rdd.fold, (None, None)),
            (rdd.aggregate, (None, None, None)),
            (rdd.treeAggregate, (None, None, None, None)),
            (rdd.stats, ()),
            (rdd.histogram, (None,)),
            (rdd.variance, ()),
            (rdd.stdev, ()),
            (rdd.sampleStdev, ()),
            (rdd.sampleVariance, ()),
            (rdd.countByValue, ()),
            (rdd.top, (None, None)),
            (rdd.takeOrdered, (None, None)),
            (rdd.saveAsNewAPIHadoopDataset, (None, None, None)),
            (rdd.saveAsNewAPIHadoopFile,
             (None, None, None, None, None, None, None)),
            (rdd.saveAsHadoopDataset, (None, None, None)),
            (rdd.saveAsHadoopFile,
             (None, None, None, None, None, None, None, None)),
            (rdd.saveAsSequenceFile, (None, None)),
            (rdd.saveAsPickleFile, (None, None)),
            (rdd.saveAsTextFile, (None, None)),
            (rdd.collectAsMap, ()),
            (rdd.reduceByKeyLocally, (None,)),
            (rdd.countByKey, ()),
            (rdd.join, (None, None)),
            (rdd.rightOuterJoin, (None, None)),
            (rdd.fullOuterJoin, (None, None)),
            (rdd.foldByKey, (None, None, None)),
            (rdd._can_spill, ()),
            (rdd._memory_limit, ()),
            (rdd.groupWith, (None, None)),
            (rdd.sampleByKey, (None, None, None)),
            (rdd.subtract, (None, None)),
            (rdd.coalesce, (None, None)),
            (rdd.toDebugString, ()),
            (rdd.getStorageLevel, ()),
            (rdd._to_java_object_rdd, ()),
        ]
        for method, args in calls:
            with self.assertRaises(NotImplementedError):
                method(*args)
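# Hedged addition (not in the original file): a conventional unittest entry
# point so the RDDTests suite above can be run directly or discovered with
# `python -m unittest`.
if __name__ == '__main__':
    unittest.main()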
def test_get_all(self):
    conf = SparkConf()
    pairs = [(self.RANDOM_KEY, self.RANDOM_VALUE),
             (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
    conf.setAll(pairs)
    self.assertEqual(sorted(conf.getAll()), sorted(pairs))
def test_to_debug_string(self):
    conf = SparkConf()
    self.assertEqual(conf.toDebugString(), SparkConf.DEBUG_STRING)
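# A minimal sketch (not part of the test suite) tying together the dummy
# SparkConf API surface the tests above exercise; set/get, setAll, and
# contains mirror their pyspark.SparkConf counterparts.
conf = SparkConf()
conf.set('spark.app.id', 'demo')
conf.setAll(pairs=[('k1', 'v1'), ('k2', 'v2')])
assert conf.get('spark.app.id') == 'demo'
assert conf.contains('k1') and conf.contains('k2')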