Example #1
    def test_beansdb(self):
        N = 100
        l = list(range(N))
        d = [(str(x).encode('utf-8'), x) for x in l]
        num_splits = 10
        rdd = self.sc.makeRDD(d, num_splits)
        with temppath('beansdb') as root:

            def newpath(c):
                return os.path.join(root, str(c))

            def check_rdd(_rdd, files, num_w, num_r):
                self.assertEqual(
                    files, ['%s/%03d.data' % (path, i) for i in range(num_w)])
                self.assertEqual(len(_rdd), num_r)
                self.assertEqual(_rdd.count(), N)
                self.assertEqual(
                    sorted(
                        _rdd.map(lambda k_v: (k_v[0], k_v[1][0])).collect()),
                    sorted(d))
                s = _rdd.map(lambda x: x[1][0]).reduce(lambda x, y: x + y)
                self.assertEqual(s, sum(l))

            path = newpath(0)
            files = rdd.saveAsBeansdb(path)
            rdd = self.sc.beansdb(path, depth=0, filter=lambda x: x != "")
            check_rdd(rdd, files, num_splits, num_splits)

            path = newpath(1)
            files = rdd.saveAsBeansdb(path, valueWithMeta=True)
            rdd = self.sc.beansdb(path,
                                  depth=0,
                                  fullscan=True,
                                  only_latest=True)
            num_splits_reduce = int(ceil(num_splits / 4))
            check_rdd(rdd, files, num_splits, num_splits_reduce)

            path = newpath(num_splits_reduce)
            files = rdd.map(lambda k_v1: (k_v1[0], k_v1[1][0])).saveAsBeansdb(
                path)
            rdd = self.sc.beansdb(path, raw=True, depth=0, fullscan=True)
            rdd = rdd.mapValue(lambda v: (restore_value(*v[0]), v[1], v[2]))
            check_rdd(rdd, files, num_splits_reduce, num_splits_reduce)
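Example #1 relies on a temppath helper from the test utilities, whose definition is not shown on this page. A minimal sketch of such a context manager (the cleanup behavior here is an assumption, not the actual helper) might look like:

    import contextlib
    import os
    import shutil
    import tempfile

    @contextlib.contextmanager
    def temppath(name):
        # Create an isolated scratch directory, yield its path,
        # and remove everything on exit.
        base = tempfile.mkdtemp()
        path = os.path.join(base, name)
        os.makedirs(path)
        try:
            yield path
        finally:
            shutil.rmtree(base)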
Example #2
    def test_beansdb(self):
        N = 100
        l = list(range(N))
        d = list(zip(map(str, l), l))
        num_splits = 10
        rdd = self.sc.makeRDD(d, num_splits)
        root = '/tmp/beansdb'

        def newpath(c):
            return os.path.join(root, str(c))

        def check_rdd(_rdd, files, num_w, num_r):
            self.assertEqual(
                files, ['%s/%03d.data' % (path, i) for i in range(num_w)])
            self.assertEqual(len(_rdd), num_r)
            self.assertEqual(_rdd.count(), N)
            self.assertEqual(
                sorted(_rdd.map(lambda k_v: (k_v[0], k_v[1][0])).collect()),
                sorted(d))
            s = _rdd.map(lambda x: x[1][0]).reduce(lambda x, y: x + y)
            self.assertEqual(s, sum(l))

        path = newpath(0)
        files = rdd.saveAsBeansdb(path)
        rdd = self.sc.beansdb(path, depth=0, filter=lambda x: x != "")
        check_rdd(rdd, files, num_splits, num_splits)

        path = newpath(1)
        files = rdd.saveAsBeansdb(path, valueWithMeta=True)
        rdd = self.sc.beansdb(path, depth=0, fullscan=True, only_latest=True)
        num_splits_reduce = int(ceil(num_splits / 4))
        check_rdd(rdd, files, num_splits, num_splits_reduce)

        path = newpath(num_splits_reduce)
        files = rdd.map(lambda k_v: (k_v[0], k_v[1][0])).saveAsBeansdb(path)
        rdd = self.sc.beansdb(path, raw=True, depth=0, fullscan=True)
        rdd = rdd.mapValue(lambda v: (restore_value(*v[0]), v[1], v[2]))
        check_rdd(rdd, files, num_splits_reduce, num_splits_reduce)

        shutil.rmtree(root)
Example #3
    def beansdb(self,
                path,
                depth=None,
                filter=None,
                fullscan=False,
                raw=False,
                only_latest=False):
        '''Read (Key, (VALUE, Version, Timestamp)) records from beansdb.

        Data structure:
            REC = (Key, TRIPLE)
            TRIPLE = (VALUE, Version, Timestamp)
            VALUE = RAW_VALUE | REAL_VALUE
            RAW_VALUE = (flag, BYTES_VALUE)

        Args:
            filter: predicate used to filter keys.
            depth: one of [None, 0, 1, 2]. e.g. depth=2 assumes a dir tree
                like 'path/[0-F]/[0-F]/%03d.data'.
                If depth is None, dpark will guess.
            fullscan: do NOT use the index files, which contain
                (key, pos_in_datafile) pairs.
                Prefer fullscan unless the filter selects only a small
                fraction of the keys.
                Effects of using the index:
                    inefficient random access;
                    one split (task) per file instead of per moosefs chunk.
                Ignored (forced to True) if filter is None.
            raw: VALUE = RAW_VALUE if raw else REAL_VALUE.
            only_latest: for each key, keep only the REC with the largest
                Timestamp. This appends a reduceByKey RDD.
                Needed because online beansdb data is log-structured.
        '''

        key_filter = filter  # rename to avoid shadowing the builtin filter()

        self.init()
        if key_filter is None:
            fullscan = True
        if isinstance(path, (tuple, list)):
            rdd = self.union([
                self.beansdb(p,
                             depth,
                             key_filter,
                             fullscan,
                             raw=True,
                             only_latest=False) for p in path
            ])
        else:
            path = os.path.realpath(path)
            assert os.path.exists(path), "%s does not exist" % path
            if os.path.isdir(path):
                subs = []
                if not depth:
                    subs = [
                        os.path.join(path, n) for n in os.listdir(path)
                        if n.endswith('.data')
                    ]
                if subs:
                    rdd = self.union([
                        BeansdbFileRDD(self, p, key_filter, fullscan, raw=True)
                        for p in subs
                    ])
                else:
                    subs = [os.path.join(path, '%x' % i) for i in range(16)]
                    rdd = self.union([
                        self.beansdb(p,
                                     depth and depth - 1,
                                     key_filter,
                                     fullscan,
                                     raw=True,
                                     only_latest=False) for p in subs
                        if os.path.exists(p)
                    ])
            else:
                rdd = BeansdbFileRDD(self, path, key_filter, fullscan, raw)

        # choose only latest version
        if only_latest:
            numSplits = min(int(ceil(len(rdd) / 4)), 800)
            rdd = rdd.reduceByKey(lambda v1, v2: v1 if v1[2] > v2[2] else v2,
                                  numSplits=numSplits)
        if not raw:
            rdd = rdd.mapValue(lambda v_ver_t: (restore_value(*v_ver_t[0]),
                                                v_ver_t[1], v_ver_t[2]))
        return rdd
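The docstring above describes each record as (Key, (VALUE, Version, Timestamp)). A hedged usage sketch of this reader (the context variable, path, and depth are illustrative assumptions, not from the source):

    from dpark import DparkContext

    dc = DparkContext()
    # Read a sharded beansdb tree, keeping only the newest record per key.
    rdd = dc.beansdb('/data/beansdb', depth=1, fullscan=True, only_latest=True)
    # Drop Version and Timestamp, keeping only the decoded VALUE.
    values = rdd.mapValue(lambda triple: triple[0])
    print(values.take(5))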
Example #4
    def beansdb(self, path, depth=None, filter=None,
                fullscan=False, raw=False, only_latest=False):
        '''Read (Key, (VALUE, Version, Timestamp)) records from beansdb.

        Data structure:
            REC = (Key, TRIPLE)
            TRIPLE = (VALUE, Version, Timestamp)
            VALUE = RAW_VALUE | REAL_VALUE
            RAW_VALUE = (flag, BYTES_VALUE)

        Args:
            filter: predicate used to filter keys.
            depth: one of [None, 0, 1, 2]. e.g. depth=2 assumes a dir tree
                like 'path/[0-F]/[0-F]/%03d.data'.
                If depth is None, dpark will guess.
            fullscan: do NOT use the index files, which contain
                (key, pos_in_datafile) pairs.
                Prefer fullscan unless the filter selects only a small
                fraction of the keys.
                Effects of using the index:
                    inefficient random access;
                    one split (task) per file instead of per moosefs chunk.
                Ignored (forced to True) if filter is None.
            raw: VALUE = RAW_VALUE if raw else REAL_VALUE.
            only_latest: for each key, keep only the REC with the largest
                Timestamp. This appends a reduceByKey RDD.
                Needed because online beansdb data is log-structured.
        '''

        key_filter = filter  # rename to avoid shadowing the builtin filter()

        self.init()
        if key_filter is None:
            fullscan = True
        if isinstance(path, (tuple, list)):
            return self.union([self.beansdb(p, depth, key_filter, fullscan,
                                            raw, only_latest)
                    for p in path])

        path = os.path.realpath(path)
        assert os.path.exists(path), "%s does not exist" % path
        if os.path.isdir(path):
            subs = []
            if not depth:
                subs = [os.path.join(path, n) for n in os.listdir(path)
                        if n.endswith('.data')]
            if subs:
                rdd = self.union([BeansdbFileRDD(self, p, key_filter,
                                                 fullscan, raw=True)
                        for p in subs])
            else:
                subs = [os.path.join(path, '%x' % i) for i in range(16)]
                rdd = self.union([self.beansdb(p, depth and depth-1, key_filter,
                                               fullscan, True, only_latest)
                        for p in subs if os.path.exists(p)])
                only_latest = False  # already applied by the recursive calls
        else:
            rdd = BeansdbFileRDD(self, path, key_filter, fullscan, raw=True)

        # choose only latest version
        if only_latest:
            rdd = rdd.reduceByKey(lambda v1, v2: v1 if v1[2] > v2[2] else v2,
                                  int(ceil(len(rdd) / 4)))
        if not raw:
            rdd = rdd.mapValue(lambda v_ver_t: (restore_value(*v_ver_t[0]),
                                                v_ver_t[1], v_ver_t[2]))
        return rdd
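For reference, the only_latest reduction above keeps, for each key, the triple with the largest Timestamp. A self-contained illustration on plain tuples (the sample data is made up):

    from functools import reduce

    def latest(v1, v2):
        # Timestamp is the third field of a (VALUE, Version, Timestamp) triple.
        return v1 if v1[2] > v2[2] else v2

    triples = [('old', 1, 100), ('new', 2, 200), ('mid', 1, 150)]
    assert reduce(latest, triples) == ('new', 2, 200)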