def test_multiprocessing():
    """Square numbers through a ``Context`` that is handed a 4-worker pool.

    ``dill`` is used for both serialization directions, presumably because
    the standard ``pickle`` module cannot serialize the lambda given to
    ``map``.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=dill.dumps, deserializer=dill.loads)
        my_rdd = c.parallelize([1, 3, 4])
        r = my_rdd.map(lambda x: x*x).collect()
        print(r)
        assert 16 in r
    finally:
        # Stop the workers even when the assertion fails so the test run
        # does not leave orphaned pool processes behind.
        p.terminate()
        p.join()
def test_first_mp():
    """``first()`` on a 3-partition RDD backed by a process pool returns 1.

    Data is serialized with ``cloudpickle`` and deserialized with the
    standard ``pickle`` module.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p,
                    serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
        print(my_rdd.first())
        assert my_rdd.first() == 1
    finally:
        # Stop the workers even when the assertion fails so the test run
        # does not leave orphaned pool processes behind.
        p.terminate()
        p.join()
def test_multiprocessing():
    """Square numbers through a ``Context`` that is handed a 4-worker pool.

    Data is serialized with ``cloudpickle`` and deserialized with the
    standard ``pickle`` module.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p,
                    serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 3, 4])
        r = my_rdd.map(lambda x: x*x).collect()
        print(r)
        assert 16 in r
    finally:
        # Stop the workers even when the assertion fails so the test run
        # does not leave orphaned pool processes behind.
        p.terminate()
        p.join()
def run_feature_extraction():
    """Command-line entry point: extract features for a directory of images.

    Parses ``--input_dir`` and ``--output``, loads the images through
    ``serialize_and_make_df``, writes the resulting frame to
    ``dataframe_csv_file.csv``, maps ``get_features`` over the records with a
    pysparkling ``Context`` and saves the ``dump``-ed results as a text file
    at the ``--output`` location. The elapsed wall-clock time in minutes is
    printed at the end.
    """
    start_time = time.time()

    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'
    parser.add_argument("--input_dir", help="input directory",
                        default=default_path)
    parser.add_argument("--output", help="output file",
                        default='image_features')
    args = parser.parse_args()

    # Per the original author's note, each record is meant to follow the
    # schema {"image_name": "", "bytes": "", "features": array[]}.
    image_dir_path = args.input_dir
    df, data_arr = serialize_and_make_df(image_dir_path)

    print(df.head())
    # info() writes its summary to stdout itself and returns None, so
    # wrapping it in print would emit a stray "None" line.
    # NOTE(review): assumes df is a pandas DataFrame - confirm.
    df.info()

    csv_df_file = 'dataframe_csv_file.csv'
    df.to_csv(csv_df_file, header=False, index=False)

    # pysparkling context; the original notes mention a pyspark SparkContext
    # (app name "HOG and GIST ETL") as the alternative.
    sc = Context()

    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)

    # Compute the features, then collapse to a single partition before
    # saving the output.
    rdd_features = rdd.map(get_features).coalesce(1)
    rdd_features.map(dump).saveAsTextFile(args.output)

    print("------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0))
class RDDTest(unittest.TestCase):
    """Checks for joins, sampling and by-key operations on RDDs."""

    def setUp(self):
        """Give every test its own default ``Context``."""
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Left outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.leftOuterJoin(bc).collect())
        abc_cd = sorted(abc.leftOuterJoin(cd).collect())
        cd_abc = sorted(cd.leftOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
        ])
        self.assertEqual(cd_abc, [
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testLeftOuterJoinDuplicate(self):
        """Left outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.leftOuterJoin(bc).collect())
        acc_ccd = sorted(acc.leftOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('a', ('xa', None)),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(acc_ccd, [('a', ('xa', None))] + c_pairs)

    def testRightOuterJoinSimple(self):
        """Right outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.rightOuterJoin(bc).collect())
        abc_cd = sorted(abc.rightOuterJoin(cd).collect())
        cd_abc = sorted(cd.rightOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])
        self.assertEqual(cd_abc, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
        ])

    def testRightOuterJoinDuplicate(self):
        """Right outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.rightOuterJoin(bc).collect())
        acc_ccd = sorted(acc.rightOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(acc_ccd, c_pairs + [('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Full outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.fullOuterJoin(bc).collect())
        abc_cd = sorted(abc.fullOuterJoin(cd).collect())
        cd_abc = sorted(cd.fullOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])
        self.assertEqual(cd_abc, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testFullOuterJoinDuplicate(self):
        """Full outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.fullOuterJoin(bc).collect())
        acc_ccd = sorted(acc.fullOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('a', ('xa', None)),
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(
            acc_ccd,
            [('a', ('xa', None))] + c_pairs + [('d', (None, 'zd'))])

    def test_cartesian(self):
        """``cartesian`` yields every (left, right) combination."""
        lows, highs = range(0, 2), range(3, 6)
        product = self.context.parallelize(lows).cartesian(
            self.context.parallelize(highs))
        result = sorted(product.collect())
        expected = sorted((lo, hi) for lo in lows for hi in highs)
        self.assertListEqual(result, expected)

    def test_sample(self):
        """Sampling 10% of 100 items with seed 81 keeps 6 to 14 of them."""
        rdd = self.context.parallelize(range(100), 4)
        sampled_count = rdd.sample(False, 0.1, 81).count()
        self.assertTrue(6 <= sampled_count <= 14)

    def test_sampleByKey(self):
        """Per-key sampling roughly honours each key's fraction."""
        fractions = {"a": 0.2, "b": 0.1}
        numbers = self.context.parallelize(range(0, 1000))
        keyed = self.context.parallelize(fractions.keys()).cartesian(numbers)
        sampled = keyed.sampleByKey(False, fractions, 2)
        sample = dict(sampled.groupByKey().collect())

        a_size_ok = 100 < len(sample["a"]) < 300
        self.assertTrue(a_size_ok and 50 < len(sample["b"]) < 150)
        # Sampled values must stay inside the original 0..999 range.
        for key in ("a", "b"):
            self.assertTrue(max(sample[key]) <= 999 and min(sample[key]) >= 0)

    def test_groupByKey(self):
        """``groupByKey`` must work without ordering the values."""
        class IncomparableValue:
            # Equality works, but any "<" comparison blows up.
            def __init__(self, value):
                self.value = value

            def __eq__(self, rhs):
                return self.value == rhs.value

            def __lt__(self, rhs):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValue(idx) for idx in range(len(keys))]
        pairs_rdd = self.context.parallelize(zip(keys, values))
        collected = pairs_rdd.groupByKey().collect()
        expected = tuple((key, values[key::3]) for key in range(3))
        grouped = {key: members for key, members in collected}

        for key, members in expected:
            self.assertIn(key, grouped)
            for member in members:
                self.assertIn(member, grouped[key])

    def test_reduceByKey(self):
        """``reduceByKey`` must work without ordering the values."""
        class IncomparableValueAddable:
            # Supports "==" and "+", but any "<" comparison blows up.
            def __init__(self, value):
                self.value = value

            def __eq__(self, rhs):
                return self.value == rhs.value

            def __add__(self, rhs):
                total = self.value + rhs.value
                return self.__class__(total)

            def __lt__(self, rhs):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValueAddable(idx) for idx in range(len(keys))]
        pairs_rdd = self.context.parallelize(zip(keys, values))
        collected = pairs_rdd.reduceByKey(add).collect()
        reduced = {key: total for key, total in collected}

        # Look results up by key so the check does not depend on order.
        for key, total in ((0, 3), (1, 5), (2, 7)):
            self.assertEqual(reduced[key], IncomparableValueAddable(total))

    def test_reduceByKey_with_numPartition(self):
        """Same as ``test_reduceByKey`` but with ``numPartitions=20``."""
        class IncomparableValueAddable:
            # Supports "==" and "+", but any "<" comparison blows up.
            def __init__(self, value):
                self.value = value

            def __eq__(self, rhs):
                return self.value == rhs.value

            def __add__(self, rhs):
                total = self.value + rhs.value
                return self.__class__(total)

            def __lt__(self, rhs):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValueAddable(idx) for idx in range(len(keys))]
        pairs_rdd = self.context.parallelize(zip(keys, values))
        collected = pairs_rdd.reduceByKey(add, numPartitions=20).collect()
        reduced = {key: total for key, total in collected}

        # Look results up by key so the check does not depend on order.
        for key, total in ((0, 3), (1, 5), (2, 7)):
            self.assertEqual(reduced[key], IncomparableValueAddable(total))
class RDDTest(unittest.TestCase):
    """Checks for joins, sampling and by-key operations on RDDs."""

    def setUp(self):
        """Give every test its own default ``Context``."""
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Left outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.leftOuterJoin(bc).collect())
        abc_cd = sorted(abc.leftOuterJoin(cd).collect())
        cd_abc = sorted(cd.leftOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
        ])
        self.assertEqual(cd_abc, [
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testLeftOuterJoinDuplicate(self):
        """Left outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.leftOuterJoin(bc).collect())
        acc_ccd = sorted(acc.leftOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('a', ('xa', None)),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(acc_ccd, [('a', ('xa', None))] + c_pairs)

    def testRightOuterJoinSimple(self):
        """Right outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.rightOuterJoin(bc).collect())
        abc_cd = sorted(abc.rightOuterJoin(cd).collect())
        cd_abc = sorted(cd.rightOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])
        self.assertEqual(cd_abc, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
        ])

    def testRightOuterJoinDuplicate(self):
        """Right outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.rightOuterJoin(bc).collect())
        acc_ccd = sorted(acc.rightOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(acc_ccd, c_pairs + [('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Full outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.fullOuterJoin(bc).collect())
        abc_cd = sorted(abc.fullOuterJoin(cd).collect())
        cd_abc = sorted(cd.fullOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])
        self.assertEqual(cd_abc, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testFullOuterJoinDuplicate(self):
        """Full outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.fullOuterJoin(bc).collect())
        acc_ccd = sorted(acc.fullOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('a', ('xa', None)),
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(
            acc_ccd,
            [('a', ('xa', None))] + c_pairs + [('d', (None, 'zd'))])

    def test_cartesian(self):
        """``cartesian`` yields every (left, right) combination."""
        lows, highs = range(0, 2), range(3, 6)
        product = self.context.parallelize(lows).cartesian(
            self.context.parallelize(highs))
        result = sorted(product.collect())
        expected = sorted((lo, hi) for lo in lows for hi in highs)
        self.assertListEqual(result, expected)

    def test_sample(self):
        """Sampling 10% of 100 items with seed 81 keeps 6 to 14 of them."""
        rdd = self.context.parallelize(range(100), 4)
        sampled_count = rdd.sample(False, 0.1, 81).count()
        self.assertTrue(6 <= sampled_count <= 14)

    def test_sampleByKey(self):
        """Per-key sampling roughly honours each key's fraction."""
        fractions = {"a": 0.2, "b": 0.1}
        numbers = self.context.parallelize(range(0, 1000))
        keyed = self.context.parallelize(fractions.keys()).cartesian(numbers)
        sampled = keyed.sampleByKey(False, fractions, 2)
        sample = dict(sampled.groupByKey().collect())

        a_size_ok = 100 < len(sample["a"]) < 300
        self.assertTrue(a_size_ok and 50 < len(sample["b"]) < 150)
        # Sampled values must stay inside the original 0..999 range.
        for key in ("a", "b"):
            self.assertTrue(max(sample[key]) <= 999 and min(sample[key]) >= 0)

    def test_groupByKey(self):
        """``groupByKey`` must work without ordering the values."""
        class IncomparableValue(object):
            # Equality works, but any "<" comparison blows up.
            def __init__(self, value):
                self.value = value

            def __eq__(self, rhs):
                return self.value == rhs.value

            def __lt__(self, rhs):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValue(idx) for idx in range(len(keys))]
        pairs_rdd = self.context.parallelize(zip(keys, values))
        collected = pairs_rdd.groupByKey().collect()
        expected = tuple((key, values[key::3]) for key in range(3))
        grouped = dict(collected)

        for key, members in expected:
            self.assertIn(key, grouped)
            for member in members:
                self.assertIn(member, grouped[key])

    def test_reduceByKey(self):
        """``reduceByKey`` must work without ordering the values."""
        class IncomparableValueAddable(object):
            # Supports "==" and "+", but any "<" comparison blows up.
            def __init__(self, value):
                self.value = value

            def __eq__(self, rhs):
                return self.value == rhs.value

            def __add__(self, rhs):
                total = self.value + rhs.value
                return self.__class__(total)

            def __lt__(self, rhs):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValueAddable(idx) for idx in range(len(keys))]
        pairs_rdd = self.context.parallelize(zip(keys, values))
        collected = pairs_rdd.reduceByKey(add).collect()
        reduced = dict(collected)

        # Look results up by key so the check does not depend on order.
        for key, total in ((0, 3), (1, 5), (2, 7)):
            self.assertEqual(reduced[key], IncomparableValueAddable(total))
def test_splitting(test_case, text, expected):
    """Check that ``split_words`` flat-maps ``text`` into ``expected``.

    ``test_case`` is only a label echoed to stdout to identify the run.
    """
    context = Context()
    print("Test case:", test_case)
    words = context.parallelize([text]).flatMap(split_words).collect()
    assert words == expected
def test_case_insensitivity():
    """``wordcount`` should count 'a' and 'A' as the same word."""
    lines = Context().parallelize(["a a A A"])
    counts = wordcount(lines).collect()
    assert counts == [('a', 4)]
def test_ordering_words():
    """``wordcount`` results should come back most-frequent first."""
    lines = Context().parallelize(["a b b c c c d d d d"])
    counts = wordcount(lines).collect()
    most_frequent_first = [('d', 4), ('c', 3), ('b', 2), ('a', 1)]
    assert counts == most_frequent_first
class RDDTest(unittest.TestCase):
    """Tests for the outer-join operations of RDDs.

    The duplicate-key variants are skipped as known failures.
    """

    def setUp(self):
        """Give every test its own default ``Context``."""
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Test the basic left outer join with simple key-value pairs."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])
        self.assertEqual(zx, [('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    @unittest.skip("Known failure")
    def testLeftOuterJoinDuplicate(self):
        """Test the left outer join with duplicate keys."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2'))])

    def testRightOuterJoinSimple(self):
        """Test the basic right outer join with simple key-value pairs."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])
        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    @unittest.skip("Known failure")
    def testRightOuterJoinDuplicate(self):
        """Test the right outer join with duplicate keys."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Test the basic full outer join with simple key-value pairs."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])
        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    @unittest.skip("Known failure")
    def testFullOuterJoinDuplicate(self):
        """Test the full outer join with duplicate keys."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        # This must be fullOuterJoin: the expectations below contain keys
        # present on only the left side ('a') as well as only the right side
        # ('b', 'd'), which no one-sided outer join can produce.
        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])
# NOTE(review): the next two statements look like the tail of a loop whose
# header precedes this chunk (``i``, ``count`` and the target of ``break`` are
# defined there) - confirm their indentation against the full file.
currentUser = i.username
if(count ==5): break

# Section: number of lang == 'pt' entries per hashtag.
print('#### quantidade de lang=pt por hashtag #####')
def inserirByTag(x):
    # Print the pair ``x`` and insert it as one row of ``resumebytag``
    # (x[0] -> hashtag column, x[1] -> count column).
    # NOTE(review): the uuid is a random integer in [10000, 30000), so two
    # rows can receive the same value - confirm this is acceptable.
    print(x)
    session.execute("insert into resumebytag (uuid, hashtag, count) values (%s, %s, %s)", (random.randrange(10000, 30000), x[0], x[1]))

# Keep only the records whose field 3 equals 'pt', then emit (field 2, 1)
# pairs. Presumably field 2 is the hashtag and field 3 the language - verify
# against how mostFollowersRDD is built.
testeRDD = filter(lambda x: x[3] == 'pt', mostFollowersRDD)
teste2RDD = map(lambda x: (x[2], 1), testeRDD)
sc = Context()
teste3RDD = sc.parallelize(teste2RDD)
# Add up the 1s per key to get a count for each key, then store each pair.
teste4RDD = teste3RDD.reduceByKey(lambda accum, n: accum + n)
teste4RDD.foreach(inserirByTag)

# Section: total posts per hour of the day.
print('#### total de postagens/hora do dia #####')
def inserirByDayHour(x):
    # Print the pair ``x`` and insert it as one row of ``resumebydayhour``
    # (x[0] -> dayhour column, x[1] -> count column); the uuid is again a
    # random integer in [10000, 30000).
    print(x)
    session.execute("insert into resumebydayhour (uuid, dayhour, count) values (%s, %s, %s)", (random.randrange(10000, 30000), x[0], x[1]))

teste5RDD = sc.parallelize(mostFollowersRDD)
def agruparDate(x):
    # Format x[1] as 'YYYY-MM-DD HH' (date plus hour).
    # NOTE(review): the rest of this function lies beyond this chunk.
    diaHora = '{:%Y-%m-%d %H}'.format(x[1])
def test_union():
    """``Context.union`` should yield the elements of both input RDDs."""
    context = Context()
    parts = [context.parallelize([word]) for word in ("Hello", "World")]
    merged = context.union(parts).collect()
    assert len(merged) == 2
    assert "Hello" in merged
    assert "World" in merged
class RDDTest(unittest.TestCase):
    """Checks for joins, cartesian products and sampling on RDDs."""

    def setUp(self):
        """Give every test its own default ``Context``."""
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Left outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.leftOuterJoin(bc).collect())
        abc_cd = sorted(abc.leftOuterJoin(cd).collect())
        cd_abc = sorted(cd.leftOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
        ])
        self.assertEqual(cd_abc, [
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testLeftOuterJoinDuplicate(self):
        """Left outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.leftOuterJoin(bc).collect())
        acc_ccd = sorted(acc.leftOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('a', ('xa', None)),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(acc_ccd, [('a', ('xa', None))] + c_pairs)

    def testRightOuterJoinSimple(self):
        """Right outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.rightOuterJoin(bc).collect())
        abc_cd = sorted(abc.rightOuterJoin(cd).collect())
        cd_abc = sorted(cd.rightOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])
        self.assertEqual(cd_abc, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
        ])

    def testRightOuterJoinDuplicate(self):
        """Right outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.rightOuterJoin(bc).collect())
        acc_ccd = sorted(acc.rightOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(acc_ccd, c_pairs + [('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Full outer join when each key occurs at most once per side."""
        make_rdd = self.context.parallelize
        abc = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        cd = make_rdd([('c', 'zc'), ('d', 'zd')])

        abc_bc = sorted(abc.fullOuterJoin(bc).collect())
        abc_cd = sorted(abc.fullOuterJoin(cd).collect())
        cd_abc = sorted(cd.fullOuterJoin(abc).collect())

        self.assertEqual(abc_bc, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])
        self.assertEqual(abc_cd, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])
        self.assertEqual(cd_abc, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testFullOuterJoinDuplicate(self):
        """Full outer join when a key is repeated on one or both sides."""
        make_rdd = self.context.parallelize
        acc = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        bc = make_rdd([('b', 'yb'), ('c', 'yc')])
        ccd = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        acc_bc = sorted(acc.fullOuterJoin(bc).collect())
        acc_ccd = sorted(acc.fullOuterJoin(ccd).collect())

        self.assertEqual(acc_bc, [
            ('a', ('xa', None)),
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])
        # Keys duplicated on both sides pair up as a cartesian product.
        c_pairs = [('c', (lhs, rhs))
                   for lhs in ('xc1', 'xc2')
                   for rhs in ('zc1', 'zc2')]
        self.assertEqual(
            acc_ccd,
            [('a', ('xa', None))] + c_pairs + [('d', (None, 'zd'))])

    def test_cartesian(self):
        """``cartesian`` yields every (left, right) combination."""
        lows, highs = range(0, 2), range(3, 6)
        product = self.context.parallelize(lows).cartesian(
            self.context.parallelize(highs))
        result = sorted(product.collect())
        expected = sorted((lo, hi) for lo in lows for hi in highs)
        self.assertListEqual(result, expected)

    def test_sample(self):
        """Sampling 10% of 100 items with seed 81 keeps 6 to 14 of them."""
        rdd = self.context.parallelize(range(100), 4)
        sampled_count = rdd.sample(False, 0.1, 81).count()
        self.assertTrue(6 <= sampled_count <= 14)

    def test_sampleByKey(self):
        """Per-key sampling roughly honours each key's fraction."""
        fractions = {"a": 0.2, "b": 0.1}
        numbers = self.context.parallelize(range(0, 1000))
        keyed = self.context.parallelize(fractions.keys()).cartesian(numbers)
        sampled = keyed.sampleByKey(False, fractions, 2)
        sample = dict(sampled.groupByKey().collect())

        a_size_ok = 100 < len(sample["a"]) < 300
        self.assertTrue(a_size_ok and 50 < len(sample["b"]) < 150)
        # Sampled values must stay inside the original 0..999 range.
        for key in ("a", "b"):
            self.assertTrue(max(sample[key]) <= 999 and min(sample[key]) >= 0)
def test_union():
    """``Context.union`` should yield the elements of both input RDDs."""
    context = Context()
    hello_rdd = context.parallelize(['Hello'])
    world_rdd = context.parallelize(['World'])
    merged = context.union([hello_rdd, world_rdd]).collect()
    assert len(merged) == 2
    assert 'Hello' in merged
    assert 'World' in merged