def test_auto_dtype(self):
    """Check the dtype a DictRDD reports when none is passed explicitly.

    An ndarray, a tuple and a list (40 entries each) are parallelized into
    4 slices and combined into a DictRDD, once without column names and
    once with ``columns=('x', 'y', 'z')``.  In both cases the first element
    must match the first 10 entries of every input and the reported dtype
    must be ``(np.ndarray, tuple, tuple)``.
    """
    x = np.arange(80).reshape((40, 2))
    y = tuple(range(40))
    z = list(range(40))
    x_rdd, y_rdd, z_rdd = (self.sc.parallelize(data, 4) for data in (x, y, z))

    expected = (
        np.arange(20).reshape(10, 2),
        tuple(range(10)),
        list(range(10)),
    )
    expected_dtype = (np.ndarray, tuple, tuple)

    # Each case: extra constructor kwargs and the keys the columns are
    # addressed by when checking dtypes (positions vs. explicit names).
    cases = (
        ({}, (0, 1, 2)),
        ({'columns': ('x', 'y', 'z')}, ('x', 'y', 'z')),
    )
    for extra_kwargs, keys in cases:
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], **extra_kwargs)
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, expected_dtype)
        assert_true(check_rdd_dtype(rdd, dict(zip(keys, expected_dtype))))
def test_auto_dtype(self):
    """Verify DictRDD's reported dtype when no dtype argument is given.

    Builds a DictRDD from three parallelized inputs (ndarray, tuple, list)
    with positional columns and then with named columns, checking the first
    element, the ``dtype`` attribute and ``check_rdd_dtype`` each time.
    """
    n_slices = 4
    x_rdd = self.sc.parallelize(np.arange(80).reshape((40, 2)), n_slices)
    y_rdd = self.sc.parallelize(tuple(range(40)), n_slices)
    z_rdd = self.sc.parallelize(list(range(40)), n_slices)

    first_expected = (
        np.arange(20).reshape(10, 2),
        tuple(range(10)),
        list(range(10)),
    )

    def check(dict_rdd, dtype_by_column):
        # Same three assertions for both the unnamed and the named variant.
        assert_tuple_equal(dict_rdd.first(), first_expected)
        assert_equal(dict_rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(check_rdd_dtype(dict_rdd, dtype_by_column))

    check(
        DictRDD([x_rdd, y_rdd, z_rdd]),
        {0: np.ndarray, 1: tuple, 2: tuple},
    )
    check(
        DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z')),
        {'x': np.ndarray, 'y': tuple, 'z': tuple},
    )
def test_creation_from_blocked_rdds(self):
    """Check that a DictRDD can be built from already-blocked RDDs.

    Two ArrayRDDs and one list-typed BlockRDD (40 entries each, parallelized
    into 4 slices) are combined without column names, with column names, and
    with an explicit per-column dtype.  The first element must always match
    the first 10 entries of every input, and with ``dtype=(None, None, list)``
    the third column of that element must be a ``list``.
    """
    x = np.arange(80).reshape((40, 2))
    y = np.arange(40)
    # Materialise the range: on Python 3 a bare ``range`` is a lazy sequence
    # object, not a list, and it never compares equal to a list.
    z = list(range(40))

    x_rdd = ArrayRDD(self.sc.parallelize(x, 4))
    y_rdd = ArrayRDD(self.sc.parallelize(y, 4))
    z_rdd = BlockRDD(self.sc.parallelize(z, 4), dtype=list)

    # The third expected column is a real list so that it matches the
    # ``list`` instance asserted on at the end of the test.
    expected = (
        np.arange(20).reshape(10, 2),
        np.arange(10),
        list(range(10)),
    )

    rdd = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(rdd.first(), expected)

    rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(rdd.first(), expected)

    rdd = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
    first = rdd.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[2], list)
def test_creation_from_zipped_rdd(self):
    """Check that a DictRDD can be built from a single zipped RDD.

    A 40x2 array and ``range(40)`` are parallelized into 4 slices and zipped;
    the zipped RDD is wrapped in a DictRDD without arguments, with column
    names, and with ``dtype=(np.ndarray, list)``.  In the last case the
    second column of the first element must be a ``list``.

    NOTE(review): ``expected`` holds a tuple in its second position while the
    last case asserts a list there, so this relies on ``assert_tuple_equal``
    accepting that pairing -- presumably an element-wise comparison; confirm.
    """
    x = np.arange(80).reshape((40, 2))
    y = range(40)
    zipped_rdd = self.sc.parallelize(x, 4).zip(self.sc.parallelize(y, 4))

    expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

    # Default construction and named columns share the same expectation.
    for extra_kwargs in ({}, {'columns': ('x', 'y')}):
        dict_rdd = DictRDD(zipped_rdd, **extra_kwargs)
        assert_tuple_equal(dict_rdd.first(), expected)

    # Explicit dtypes: the second column must come back as a list.
    typed_rdd = DictRDD(zipped_rdd, dtype=(np.ndarray, list))
    head = typed_rdd.first()
    assert_tuple_equal(head, expected)
    assert_is_instance(head[1], list)
def test_get_single_tuple(self):
    """Check integer indexing of a DictRDD built with ``bsize=5``.

    A 40x2 array and a 40-entry vector are parallelized into 2 slices,
    zipped and wrapped in a DictRDD.  Indexing with a single integer and
    taking ``first()`` must yield the matching 5-row block; negative
    indices must address the same blocks counted from the end.
    """
    features = np.arange(80).reshape((40, 2))
    labels = np.arange(40)
    zipped = self.sc.parallelize(features, 2).zip(
        self.sc.parallelize(labels, 2))
    z = DictRDD(zipped, bsize=5)

    # Block 0, reached through first() and through index 0 (looked up twice).
    expected = np.arange(0, 10).reshape((5, 2)), np.arange(5)
    fetched = [z.first()] + [z[idx].first() for idx in (0, 0)]
    for block in fetched:
        assert_tuple_equal(block, expected)

    # Block 3, also addressed as -5.
    expected = np.arange(30, 40).reshape((5, 2)), np.arange(15, 20)
    fetched = [z[idx].first() for idx in (3, 3, -5)]
    for block in fetched:
        assert_tuple_equal(block, expected)

    # Block 7, also addressed as -1.
    expected = np.arange(70, 80).reshape((5, 2)), np.arange(35, 40)
    fetched = [z[idx].first() for idx in (7, 7, -1)]
    for block in fetched:
        assert_tuple_equal(block, expected)
def test_creation_from_blocked_rdds(self):
    """Check DictRDD construction from ArrayRDD/BlockRDD inputs.

    Two ArrayRDDs and a list-typed BlockRDD are combined three ways: with
    defaults, with column names, and with ``dtype=(None, None, list)``.
    The first element must equal the first 10 entries of each input, and
    in the explicit-dtype case its third column must be a ``list``.
    """
    features = np.arange(80).reshape((40, 2))
    targets = np.arange(40)
    items = list(range(40))

    x_rdd = ArrayRDD(self.sc.parallelize(features, 4))
    y_rdd = ArrayRDD(self.sc.parallelize(targets, 4))
    z_rdd = BlockRDD(self.sc.parallelize(items, 4), dtype=list)

    expected = (
        np.arange(20).reshape(10, 2),
        np.arange(10),
        list(range(10)),
    )

    # Default and named-column construction must yield the same first element.
    for extra_kwargs in ({}, {'columns': ('x', 'y', 'z')}):
        dict_rdd = DictRDD([x_rdd, y_rdd, z_rdd], **extra_kwargs)
        assert_tuple_equal(dict_rdd.first(), expected)

    typed_rdd = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
    head = typed_rdd.first()
    assert_tuple_equal(head, expected)
    assert_is_instance(head[2], list)
def test_creation_from_rdds(self):
    """Check DictRDD construction from a list of parallelized collections.

    A 40x2 array, a 40-entry vector and a 40-entry list are parallelized
    into 4 slices each and combined with defaults, with column names, and
    with ``dtype=(np.ndarray, np.ndarray, list)``.  The first element must
    equal the first 10 entries of each input; with explicit dtypes its
    third column must be a ``list``.
    """
    n_slices = 4
    sources = (
        np.arange(80).reshape((40, 2)),
        np.arange(40),
        list(range(40)),
    )
    x_rdd, y_rdd, z_rdd = (
        self.sc.parallelize(source, n_slices) for source in sources
    )

    expected = (np.arange(20).reshape(10, 2), np.arange(10), list(range(10)))

    plain = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(plain.first(), expected)

    named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), expected)

    typed = DictRDD([x_rdd, y_rdd, z_rdd],
                    dtype=(np.ndarray, np.ndarray, list))
    head = typed.first()
    assert_tuple_equal(head, expected)
    assert_is_instance(head[2], list)