def test_multiprocessing():
    """Square the elements of an RDD on a 4-worker process pool using dill.

    The context ships work to the pool with ``dill.dumps`` / ``dill.loads``;
    the test passes when 16 (4 squared) is among the collected results.

    NOTE(review): a second function with this exact name is defined later in
    this file and replaces this one at import time, so this variant would
    not be collected as a test - confirm whether one of them should be renamed.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=dill.dumps, deserializer=dill.loads)
        my_rdd = c.parallelize([1, 3, 4])
        r = my_rdd.map(lambda x: x*x).collect()
        print(r)
        assert 16 in r
    finally:
        # Stop the worker processes even when the assertion fails; otherwise
        # every run leaves four idle workers behind.
        p.terminate()
        p.join()
def test_first_mp():
    """Check that ``first()`` yields the leading element on a process pool.

    The data is split into 3 partitions and the context serializes with
    ``cloudpickle.dumps`` and deserializes with ``pickle.loads``.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
        # Evaluate once and reuse the value for both the log line and the
        # assertion instead of running the job a second time.
        first = my_rdd.first()
        print(first)
        assert first == 1
    finally:
        # Stop the worker processes even when the assertion fails.
        p.terminate()
        p.join()
def test_multiprocessing():
    """Square the elements of an RDD on a 4-worker process pool.

    The context serializes with ``cloudpickle.dumps`` and deserializes with
    ``pickle.loads``; the test passes when 16 (4 squared) is among the
    collected results.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 3, 4])
        r = my_rdd.map(lambda x: x*x).collect()
        print(r)
        assert 16 in r
    finally:
        # Stop the worker processes even when the assertion fails; otherwise
        # every run leaves four idle workers behind.
        p.terminate()
        p.join()
Example #4
0
def run_feature_extraction():
    """Extract features for the images in a directory and save them as text.

    Command-line options (read from ``sys.argv`` by argparse):
        --input_dir  directory handed to ``serialize_and_make_df``
        --output     target passed to ``saveAsTextFile``
                     (default ``image_features``)

    Side effects: writes ``dataframe_csv_file.csv`` into the current working
    directory, writes the feature output, and prints a preview of the data
    frame plus the elapsed wall-clock time in minutes.
    """
    start_time = time.time()
    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    # NOTE(review): machine-specific default; callers on other hosts have to
    # pass --input_dir explicitly.
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'

    parser.add_argument("--input_dir",
                        help="input directory",
                        default=default_path)
    parser.add_argument("--output",
                        help="output file",
                        default='image_features')
    args = parser.parse_args()

    # Serialize the images. The intended record schema, per the original
    # notes, is {"image_name": "", "bytes": "", "features": "array[]"}
    # - TODO confirm against serialize_and_make_df.
    df, data_arr = serialize_and_make_df(args.input_dir)
    print(df.head())
    # Assumes df is a pandas DataFrame: info() prints its own report and
    # returns None, so wrapping it in print would emit a stray "None".
    df.info()

    # Keep a CSV copy of the data frame next to the feature output.
    csv_df_file = 'dataframe_csv_file.csv'
    df.to_csv(csv_df_file, header=False, index=False)

    # pysparkling context; the in-memory records are spread over 4 partitions.
    sc = Context()
    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)

    # Compute the features, then collapse to one partition before saving.
    rdd_features = rdd.map(get_features).coalesce(1)
    rdd_features.map(dump).saveAsTextFile(args.output)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0))
Example #5
0
class _IncomparableValue:
    """Value that supports ``==`` but raises as soon as it is ordered.

    Used to show that an RDD operation never needs to sort its values.
    """

    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        return self.value == other.value

    def __lt__(self, other):
        raise NotImplementedError("This object cannot be compared")


class _IncomparableValueAddable(_IncomparableValue):
    """Incomparable value that can additionally be combined with ``+``."""

    def __add__(self, other):
        return self.__class__(self.value + other.value)


class RDDTest(unittest.TestCase):
    """Tests for the resilient distributed databases"""

    def setUp(self):
        self.context = Context()

    # ------------------------------------------------------------------
    # Shared fixtures and checks
    # ------------------------------------------------------------------
    def _simple_rdds(self):
        """Return three key-value RDDs (x, y, z) without repeated keys."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])
        return x, y, z

    def _duplicate_rdds(self):
        """Return three key-value RDDs (x, y, z); x and z repeat key 'c'."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])
        return x, y, z

    def _check_reduce_by_key(self, **reduce_kwargs):
        """Sum incomparable values per key and verify the three totals.

        ``reduce_kwargs`` is forwarded to ``reduceByKey`` unchanged. The
        check will fail if the values of the RDD need to be compared.
        """
        keys = (0, 1, 2, 0, 1, 2)
        r = [_IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        reduced = dict(k_rdd.reduceByKey(add, **reduce_kwargs).collect())

        # Values 0..5 under keys 0,1,2,0,1,2 sum to 0+3, 1+4 and 2+5.
        # Keep this order-agnostic.
        for key, total in ((0, 3), (1, 5), (2, 7)):
            self.assertEqual(reduced[key], _IncomparableValueAddable(total))

    # ------------------------------------------------------------------
    # Joins
    # ------------------------------------------------------------------
    def testLeftOuterJoinSimple(self):
        """Test the basic left outer join with simple key-value pairs"""
        x, y, z = self._simple_rdds()

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)), ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])

        self.assertEqual(zx, [('c', ('zc', 'xc')), ('d', ('zd', None))])

    def testLeftOuterJoinDuplicate(self):
        """Test the left outer join with duplicate keys"""
        x, y, z = self._duplicate_rdds()

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)), ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')), ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2'))])

    def testRightOuterJoinSimple(self):
        """Test the basic right outer join with simple key-value pairs"""
        x, y, z = self._simple_rdds()

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')), ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('c', ('xc', 'zc')), ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')), ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    def testRightOuterJoinDuplicate(self):
        """Test the right outer join with duplicate keys"""
        x, y, z = self._duplicate_rdds()

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')), ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('c', ('xc1', 'zc1')), ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')), ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Test the basic full outer join with simple key-value pairs"""
        x, y, z = self._simple_rdds()

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)), ('b', ('xb', None)),
                              ('c', ('xc', 'zc')), ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')), ('b', (None, 'xb')),
                              ('c', ('zc', 'xc')), ('d', ('zd', None))])

    def testFullOuterJoinDuplicate(self):
        """Test the full outer join with duplicate keys"""
        x, y, z = self._duplicate_rdds()

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')), ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)), ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')), ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')), ('d', (None, 'zd'))])

    # ------------------------------------------------------------------
    # Cartesian product and sampling
    # ------------------------------------------------------------------
    def test_cartesian(self):
        """Every element of x must be paired with every element of y."""
        x = self.context.parallelize(range(0, 2))
        y = self.context.parallelize(range(3, 6))
        c = x.cartesian(y)
        result = sorted(c.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        """A 10% sample of 100 elements must contain between 6 and 14 items."""
        rdd = self.context.parallelize(range(100), 4)
        count = rdd.sample(False, 0.1, 81).count()
        # Separate bound checks report the observed count on failure.
        self.assertGreaterEqual(count, 6)
        self.assertLessEqual(count, 14)

    def test_sampleByKey(self):
        """Per-key sample sizes must fall in a band around their fraction."""
        fractions = {"a": 0.2, "b": 0.1}
        range_rdd = self.context.parallelize(range(0, 1000))
        rdd = self.context.parallelize(fractions.keys()).cartesian(range_rdd)
        sample = dict(
            rdd.sampleByKey(False, fractions, 2).groupByKey().collect())

        # (key, exclusive lower size bound, exclusive upper size bound)
        for key, low, high in (("a", 100, 300), ("b", 50, 150)):
            self.assertGreater(len(sample[key]), low)
            self.assertLess(len(sample[key]), high)

        # Sampled values must come from the original 0..999 range.
        for key in ("a", "b"):
            self.assertLessEqual(max(sample[key]), 999)
            self.assertGreaterEqual(min(sample[key]), 0)

    # ------------------------------------------------------------------
    # By-key aggregation
    # ------------------------------------------------------------------
    def test_groupByKey(self):
        """Grouping must work for values that cannot be ordered."""
        # This will fail if the values of the RDD need to be compared
        keys = (0, 1, 2, 0, 1, 2)
        r = [_IncomparableValue(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.groupByKey().collect()

        expected_group = ((0, r[0::3]), (1, r[1::3]), (2, r[2::3]))

        grouped_dict = dict(actual_group)

        for k, v in expected_group:
            self.assertIn(k, grouped_dict)

            for vv in v:
                self.assertIn(vv, grouped_dict[k])

    def test_reduceByKey(self):
        """Reducing must work for values that cannot be ordered."""
        self._check_reduce_by_key()

    def test_reduceByKey_with_numPartition(self):
        """Same reduction, with an explicit ``numPartitions`` of 20."""
        self._check_reduce_by_key(numPartitions=20)
Example #6
0
class RDDTest(unittest.TestCase):
    """Join, cartesian, sampling and by-key checks for RDDs built from a Context."""

    def setUp(self):
        # A fresh context for every test.
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Left outer join on unique keys: unmatched left keys pair with None."""
        rdd_x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        rdd_y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        rdd_z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        got_xy = sorted(rdd_x.leftOuterJoin(rdd_y).collect())
        got_xz = sorted(rdd_x.leftOuterJoin(rdd_z).collect())
        got_zx = sorted(rdd_z.leftOuterJoin(rdd_x).collect())

        want_xy = [('a', ('xa', None)), ('b', ('xb', 'yb')), ('c', ('xc', 'yc'))]
        want_xz = [('a', ('xa', None)), ('b', ('xb', None)), ('c', ('xc', 'zc'))]
        want_zx = [('c', ('zc', 'xc')), ('d', ('zd', None))]

        self.assertEqual(got_xy, want_xy)
        self.assertEqual(got_xz, want_xz)
        self.assertEqual(got_zx, want_zx)

    def testLeftOuterJoinDuplicate(self):
        """Left outer join when the same key occurs more than once."""
        rdd_x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        rdd_y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        rdd_z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        got_xy = sorted(rdd_x.leftOuterJoin(rdd_y).collect())
        got_xz = sorted(rdd_x.leftOuterJoin(rdd_z).collect())

        want_xy = [('a', ('xa', None)), ('c', ('xc1', 'yc')), ('c', ('xc2', 'yc'))]
        # Duplicates on both sides yield every combination of their values.
        want_xz = [
            ('a', ('xa', None)),
            ('c', ('xc1', 'zc1')),
            ('c', ('xc1', 'zc2')),
            ('c', ('xc2', 'zc1')),
            ('c', ('xc2', 'zc2')),
        ]

        self.assertEqual(got_xy, want_xy)
        self.assertEqual(got_xz, want_xz)

    def testRightOuterJoinSimple(self):
        """Right outer join on unique keys: unmatched right keys pair with None."""
        rdd_x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        rdd_y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        rdd_z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        got_xy = sorted(rdd_x.rightOuterJoin(rdd_y).collect())
        got_xz = sorted(rdd_x.rightOuterJoin(rdd_z).collect())
        got_zx = sorted(rdd_z.rightOuterJoin(rdd_x).collect())

        want_xy = [('b', ('xb', 'yb')), ('c', ('xc', 'yc'))]
        want_xz = [('c', ('xc', 'zc')), ('d', (None, 'zd'))]
        want_zx = [('a', (None, 'xa')), ('b', (None, 'xb')), ('c', ('zc', 'xc'))]

        self.assertEqual(got_xy, want_xy)
        self.assertEqual(got_xz, want_xz)
        self.assertEqual(got_zx, want_zx)

    def testRightOuterJoinDuplicate(self):
        """Right outer join when the same key occurs more than once."""
        rdd_x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        rdd_y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        rdd_z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        got_xy = sorted(rdd_x.rightOuterJoin(rdd_y).collect())
        got_xz = sorted(rdd_x.rightOuterJoin(rdd_z).collect())

        want_xy = [('b', (None, 'yb')), ('c', ('xc1', 'yc')), ('c', ('xc2', 'yc'))]
        # Duplicates on both sides yield every combination of their values.
        want_xz = [
            ('c', ('xc1', 'zc1')),
            ('c', ('xc1', 'zc2')),
            ('c', ('xc2', 'zc1')),
            ('c', ('xc2', 'zc2')),
            ('d', (None, 'zd')),
        ]

        self.assertEqual(got_xy, want_xy)
        self.assertEqual(got_xz, want_xz)

    def testFullOuterJoinSimple(self):
        """Full outer join on unique keys keeps the keys of both sides."""
        rdd_x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        rdd_y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        rdd_z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        got_xy = sorted(rdd_x.fullOuterJoin(rdd_y).collect())
        got_xz = sorted(rdd_x.fullOuterJoin(rdd_z).collect())
        got_zx = sorted(rdd_z.fullOuterJoin(rdd_x).collect())

        want_xy = [('a', ('xa', None)), ('b', ('xb', 'yb')), ('c', ('xc', 'yc'))]
        want_xz = [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ]
        want_zx = [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ]

        self.assertEqual(got_xy, want_xy)
        self.assertEqual(got_xz, want_xz)
        self.assertEqual(got_zx, want_zx)

    def testFullOuterJoinDuplicate(self):
        """Full outer join when the same key occurs more than once."""
        rdd_x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        rdd_y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        rdd_z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        got_xy = sorted(rdd_x.fullOuterJoin(rdd_y).collect())
        got_xz = sorted(rdd_x.fullOuterJoin(rdd_z).collect())

        want_xy = [
            ('a', ('xa', None)),
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ]
        # Duplicates on both sides yield every combination of their values.
        want_xz = [
            ('a', ('xa', None)),
            ('c', ('xc1', 'zc1')),
            ('c', ('xc1', 'zc2')),
            ('c', ('xc2', 'zc1')),
            ('c', ('xc2', 'zc2')),
            ('d', (None, 'zd')),
        ]

        self.assertEqual(got_xy, want_xy)
        self.assertEqual(got_xz, want_xz)

    def test_cartesian(self):
        """cartesian() pairs each element of one RDD with each of the other."""
        first = self.context.parallelize(range(0, 2))
        second = self.context.parallelize(range(3, 6))
        pairs = first.cartesian(second)
        result = sorted(pairs.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        """Sampling 10% of 100 elements (seed 81) yields 6 to 14 elements."""
        numbers = self.context.parallelize(range(100), 4)
        sampled = numbers.sample(False, 0.1, 81)
        self.assertTrue(6 <= sampled.count() <= 14)

    def test_sampleByKey(self):
        """sampleByKey() honours the per-key fractions within a tolerance band."""
        fractions = {"a": 0.2, "b": 0.1}
        numbers = self.context.parallelize(range(0, 1000))
        keyed = self.context.parallelize(fractions.keys()).cartesian(numbers)
        grouped = keyed.sampleByKey(False, fractions, 2).groupByKey()
        sample = dict(grouped.collect())

        sizes_ok = (100 < len(sample["a"]) < 300 and
                    50 < len(sample["b"]) < 150)
        self.assertTrue(sizes_ok)

        # Sampled values have to stay inside the source range 0..999.
        for key in ("a", "b"):
            values = sample[key]
            self.assertTrue(max(values) <= 999 and min(values) >= 0)

    def test_groupByKey(self):
        """groupByKey() must not need to order the values it groups."""
        class IncomparableValue(object):
            """Equal by value; any ``<`` comparison raises."""

            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValue(n) for n in range(len(keys))]

        keyed = self.context.parallelize(zip(keys, values))
        by_key = dict(keyed.groupByKey().collect())

        # Key k owns every third value starting at index k.
        expectations = (
            (0, values[0::3]),
            (1, values[1::3]),
            (2, values[2::3]),
        )

        for key, members in expectations:
            self.assertIn(key, by_key)

            for member in members:
                self.assertIn(member, by_key[key])

    def test_reduceByKey(self):
        """reduceByKey() must not need to order the values it reduces."""
        class IncomparableValueAddable(object):
            """Equal by value and addable; any ``<`` comparison raises."""

            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValueAddable(n) for n in range(len(keys))]

        keyed = self.context.parallelize(zip(keys, values))
        by_key = dict(keyed.reduceByKey(add).collect())

        expectations = (
            (0, IncomparableValueAddable(3)),
            (1, IncomparableValueAddable(5)),
            (2, IncomparableValueAddable(7)),
        )

        # Look results up by key so the check does not depend on output order.
        for key, total in expectations:
            self.assertEqual(by_key[key], total)
def test_splitting(test_case, text, expected):
    """Flat-map ``split_words`` over a single text and compare with ``expected``.

    ``test_case`` is only printed, to label the run in the captured output.
    """
    context = Context()
    print("Test case:", test_case)
    words = context.parallelize([text]).flatMap(split_words).collect()
    assert words == expected
def test_case_insensitivity():
    """Lower- and upper-case spellings of a word must be counted together."""
    dataset = Context().parallelize(["a a A A"])
    counts = wordcount(dataset).collect()
    assert counts == [('a', 4)]
def test_ordering_words():
    """Word counts must come back ordered from most to least frequent."""
    dataset = Context().parallelize(["a b b c c c d d d d"])
    counts = wordcount(dataset).collect()
    expected = [('d', 4), ('c', 3), ('b', 2), ('a', 1)]
    assert counts == expected
Example #10
0
class RDDTest(unittest.TestCase):
    """ Tests for the resilient distributed databases

    Each test builds small key-value RDDs from a fresh ``Context``, joins
    them, and compares the sorted result with a literal expected list.

    NOTE(review): the three duplicate-key tests are skipped as known
    failures; whether the join implementation still mishandles duplicate
    keys cannot be determined from this file.
    """

    def setUp(self):
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """ Test the basic left outer join with simple key-value pairs """
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])

        self.assertEqual(zx, [('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    @unittest.skip("Known failure")
    def testLeftOuterJoinDuplicate(self):
        """ Test the left outer join with duplicate keys """
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2'))])

    def testRightOuterJoinSimple(self):
        """ Test the basic right outer join with simple key-value pairs """
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    @unittest.skip("Known failure")
    def testRightOuterJoinDuplicate(self):
        """ Test the right outer join with duplicate keys """
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """ Test the basic full outer join with simple key-value pairs """
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    @unittest.skip("Known failure")
    def testFullOuterJoinDuplicate(self):
        """ Test the full outer join with duplicate keys """
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        # Must exercise fullOuterJoin: the expected lists below contain
        # left-only keys such as ('a', ('xa', None)), which a right outer
        # join never produces.
        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])
                currentUser = i.username
        if(count ==5):
                break

print('#### quantidade de lang=pt por hashtag #####')


def inserirByTag(x):
    """Print one (hashtag, count) pair and insert it into ``resumebytag``.

    x[0] is written to the ``hashtag`` column and x[1] to ``count``.
    """
    print(x)
    # NOTE(review): the uuid column gets a random integer from
    # [10000, 30000), so two rows can receive the same id - confirm that
    # this is acceptable for the table.
    row_id = random.randrange(10000, 30000)
    statement = "insert into resumebytag (uuid, hashtag, count) values (%s, %s, %s)"
    session.execute(statement, (row_id, x[0], x[1]))


def _is_lang_pt(row):
    """Tell whether field 3 of the row is the language code 'pt'."""
    return row[3] == 'pt'


def _tag_with_one(row):
    """Map a row to (field 2, 1) so that occurrences can be summed per key."""
    return (row[2], 1)


def _sum_counts(accum, n):
    """Add the next count to the running total."""
    return accum + n


# Despite their names, these two are plain filter/map results over
# mostFollowersRDD; only teste3RDD onwards are RDDs built by the context.
testeRDD = filter(_is_lang_pt, mostFollowersRDD)
teste2RDD = map(_tag_with_one, testeRDD)

sc = Context()

teste3RDD = sc.parallelize(teste2RDD)

# Total per key, then store every (key, total) pair.
teste4RDD = teste3RDD.reduceByKey(_sum_counts)
teste4RDD.foreach(inserirByTag)


print('#### total de postagens/hora do dia #####')


def inserirByDayHour(x):
    """Print one (dayhour, count) pair and insert it into ``resumebydayhour``.

    x[0] is written to the ``dayhour`` column and x[1] to ``count``.
    """
    print(x)
    # NOTE(review): the uuid column gets a random integer from
    # [10000, 30000), so two rows can receive the same id - confirm that
    # this is acceptable for the table.
    row_id = random.randrange(10000, 30000)
    statement = "insert into resumebydayhour (uuid, dayhour, count) values (%s, %s, %s)"
    session.execute(statement, (row_id, x[0], x[1]))


# Hand the unfiltered rows to the context for the per-hour report.
teste5RDD = sc.parallelize(mostFollowersRDD)

def agruparDate(x):
        # Formats x[1] with the pattern '%Y-%m-%d %H', i.e. a day-plus-hour
        # string (presumably x[1] is a datetime - verify against the caller).
        # NOTE(review): the result is only bound to a local and never
        # returned, so every call evaluates to None; the function body looks
        # truncated here - confirm what it is meant to return.
        diaHora = '{:%Y-%m-%d %H}'.format(x[1])
def test_union():
    """The context-level union of two one-element RDDs holds both elements."""
    context = Context()
    hello_rdd = context.parallelize(["Hello"])
    world_rdd = context.parallelize(["World"])
    merged = context.union([hello_rdd, world_rdd]).collect()
    # Checked one condition at a time, in the original order.
    assert len(merged) == 2
    assert "Hello" in merged
    assert "World" in merged
Example #13
0
class RDDTest(unittest.TestCase):
    """ Join, cartesian-product and sampling tests for RDDs """

    def setUp(self):
        # Each test starts from its own context.
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """ Left outer join of RDDs whose keys are unique """
        rdd_of = self.context.parallelize
        x = rdd_of([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = rdd_of([('b', 'yb'), ('c', 'yc')])
        z = rdd_of([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        # (actual, expected) pairs, checked in order.
        cases = (
            (xy, [('a', ('xa', None)), ('b', ('xb', 'yb')), ('c', ('xc', 'yc'))]),
            (xz, [('a', ('xa', None)), ('b', ('xb', None)), ('c', ('xc', 'zc'))]),
            (zx, [('c', ('zc', 'xc')), ('d', ('zd', None))]),
        )
        for actual, expected in cases:
            self.assertEqual(actual, expected)

    def testLeftOuterJoinDuplicate(self):
        """ Left outer join of RDDs that repeat a key """
        rdd_of = self.context.parallelize
        x = rdd_of([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = rdd_of([('b', 'yb'), ('c', 'yc')])
        z = rdd_of([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        cases = (
            (xy, [('a', ('xa', None)), ('c', ('xc1', 'yc')), ('c', ('xc2', 'yc'))]),
            # A key repeated on both sides produces all value combinations.
            (xz, [('a', ('xa', None)),
                  ('c', ('xc1', 'zc1')), ('c', ('xc1', 'zc2')),
                  ('c', ('xc2', 'zc1')), ('c', ('xc2', 'zc2'))]),
        )
        for actual, expected in cases:
            self.assertEqual(actual, expected)

    def testRightOuterJoinSimple(self):
        """ Right outer join of RDDs whose keys are unique """
        rdd_of = self.context.parallelize
        x = rdd_of([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = rdd_of([('b', 'yb'), ('c', 'yc')])
        z = rdd_of([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        cases = (
            (xy, [('b', ('xb', 'yb')), ('c', ('xc', 'yc'))]),
            (xz, [('c', ('xc', 'zc')), ('d', (None, 'zd'))]),
            (zx, [('a', (None, 'xa')), ('b', (None, 'xb')), ('c', ('zc', 'xc'))]),
        )
        for actual, expected in cases:
            self.assertEqual(actual, expected)

    def testRightOuterJoinDuplicate(self):
        """ Right outer join of RDDs that repeat a key """
        rdd_of = self.context.parallelize
        x = rdd_of([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = rdd_of([('b', 'yb'), ('c', 'yc')])
        z = rdd_of([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        cases = (
            (xy, [('b', (None, 'yb')), ('c', ('xc1', 'yc')), ('c', ('xc2', 'yc'))]),
            # A key repeated on both sides produces all value combinations.
            (xz, [('c', ('xc1', 'zc1')), ('c', ('xc1', 'zc2')),
                  ('c', ('xc2', 'zc1')), ('c', ('xc2', 'zc2')),
                  ('d', (None, 'zd'))]),
        )
        for actual, expected in cases:
            self.assertEqual(actual, expected)

    def testFullOuterJoinSimple(self):
        """ Full outer join of RDDs whose keys are unique """
        rdd_of = self.context.parallelize
        x = rdd_of([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = rdd_of([('b', 'yb'), ('c', 'yc')])
        z = rdd_of([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        cases = (
            (xy, [('a', ('xa', None)), ('b', ('xb', 'yb')), ('c', ('xc', 'yc'))]),
            (xz, [('a', ('xa', None)), ('b', ('xb', None)),
                  ('c', ('xc', 'zc')), ('d', (None, 'zd'))]),
            (zx, [('a', (None, 'xa')), ('b', (None, 'xb')),
                  ('c', ('zc', 'xc')), ('d', ('zd', None))]),
        )
        for actual, expected in cases:
            self.assertEqual(actual, expected)

    def testFullOuterJoinDuplicate(self):
        """ Full outer join of RDDs that repeat a key """
        rdd_of = self.context.parallelize
        x = rdd_of([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = rdd_of([('b', 'yb'), ('c', 'yc')])
        z = rdd_of([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        cases = (
            (xy, [('a', ('xa', None)), ('b', (None, 'yb')),
                  ('c', ('xc1', 'yc')), ('c', ('xc2', 'yc'))]),
            # A key repeated on both sides produces all value combinations.
            (xz, [('a', ('xa', None)),
                  ('c', ('xc1', 'zc1')), ('c', ('xc1', 'zc2')),
                  ('c', ('xc2', 'zc1')), ('c', ('xc2', 'zc2')),
                  ('d', (None, 'zd'))]),
        )
        for actual, expected in cases:
            self.assertEqual(actual, expected)

    def test_cartesian(self):
        """ cartesian() pairs every element of one RDD with every element of the other """
        rdd_of = self.context.parallelize
        product = rdd_of(range(0, 2)).cartesian(rdd_of(range(3, 6)))
        result = sorted(product.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        """ A 10% sample of 100 elements (seed 81) has 6 to 14 elements """
        sampled = self.context.parallelize(range(100), 4).sample(False, 0.1, 81)
        self.assertTrue(6 <= sampled.count() <= 14)

    def test_sampleByKey(self):
        """ sampleByKey() follows the per-key fractions within a tolerance band """
        rdd_of = self.context.parallelize
        fractions = {"a": 0.2, "b": 0.1}
        range_rdd = rdd_of(range(0, 1000))
        keyed = rdd_of(fractions.keys()).cartesian(range_rdd)
        sample = dict(keyed.sampleByKey(False, fractions, 2).groupByKey().collect())

        sizes_in_band = (100 < len(sample["a"]) < 300 and
                         50 < len(sample["b"]) < 150)
        self.assertTrue(sizes_in_band)

        # Every sampled value must come from the source range 0..999.
        for key in ("a", "b"):
            picked = sample[key]
            self.assertTrue(max(picked) <= 999 and min(picked) >= 0)
def test_union():
    """Union two single-element RDDs through the context and check both survive."""
    context = Context()
    parts = [context.parallelize(['Hello']), context.parallelize(['World'])]
    combined = context.union(parts).collect()
    # Same three conditions as a single compound assert, checked in order.
    assert len(combined) == 2
    assert 'Hello' in combined
    assert 'World' in combined