Example #1
    def test_beansdb(self):
        # Build N (bytes-key, int-value) pairs and distribute them over
        # num_splits partitions.
        N = 100
        l = list(range(N))
        d = [(str(x).encode('utf-8'), x) for x in l]
        num_splits = 10
        rdd = self.sc.makeRDD(d, num_splits)
        with temppath('beansdb') as root:
            def newpath(c):
                return os.path.join(root, str(c))

            def check_rdd(_rdd, files, num_w, num_r):
                # num_w: expected number of data files written;
                # num_r: expected number of splits when reading back.
                self.assertEqual(files,
                                 ['%s/%03d.data' % (path, i) for i in range(num_w)])
                self.assertEqual(len(_rdd), num_r)
                self.assertEqual(_rdd.count(), N)
                # Records come back as (key, (value, version, timestamp)).
                self.assertEqual(sorted(_rdd.map(lambda k_v: (k_v[0], k_v[1][0])).collect()), sorted(d))
                s = _rdd.map(lambda x: x[1][0]).reduce(lambda x, y: x + y)
                self.assertEqual(s, sum(l))

            # Round trip 1: write plain (key, value) pairs, read back with a
            # key filter.
            path = newpath(0)
            files = rdd.saveAsBeansdb(path)
            rdd = self.sc.beansdb(path, depth=0, filter=lambda x: x != "")
            check_rdd(rdd, files, num_splits, num_splits)

            # Round trip 2: the rdd now carries (value, version, timestamp)
            # triples, so write them with their metadata; only_latest appends
            # a reduceByKey, shrinking the split count on read.
            path = newpath(1)
            files = rdd.saveAsBeansdb(path, valueWithMeta=True)
            rdd = self.sc.beansdb(path, depth=0, fullscan=True, only_latest=True)
            num_splits_reduce = int(ceil(num_splits / 4))
            check_rdd(rdd, files, num_splits, num_splits_reduce)

            # Round trip 3: read raw (flag, bytes) values and decode them
            # manually with restore_value.
            path = newpath(num_splits_reduce)
            files = rdd.map(lambda k_v1: (k_v1[0], k_v1[1][0])).saveAsBeansdb(path)
            rdd = self.sc.beansdb(path, raw=True, depth=0, fullscan=True)
            rdd = rdd.mapValue(lambda v: (restore_value(*v[0]), v[1], v[2]))
            check_rdd(rdd, files, num_splits_reduce, num_splits_reduce)
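
The test above boils down to a save/read round trip. A minimal standalone sketch of the same pattern (hypothetical path; assumes a local dpark install exposing the DparkContext, makeRDD, saveAsBeansdb, and beansdb APIs used above):

# Minimal round-trip sketch, assuming the dpark API used in the example
# above; the target path is hypothetical.
from dpark import DparkContext

dc = DparkContext()
pairs = [(str(i).encode('utf-8'), i) for i in range(100)]
rdd = dc.makeRDD(pairs, 10)

# saveAsBeansdb writes one %03d.data file per split under the directory.
files = rdd.saveAsBeansdb('/tmp/beansdb-demo')

# depth=0: the .data files sit directly under the path.
back = dc.beansdb('/tmp/beansdb-demo', depth=0)

# Records come back as (key, (value, version, timestamp)).
assert sorted(back.map(lambda kv: (kv[0], kv[1][0])).collect()) == sorted(pairs)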
Example #2
    def beansdb(self,
                path,
                depth=None,
                filter=None,
                fullscan=False,
                raw=False,
                only_latest=False):
        """(Key, (VALUE, Version, Timestamp)) data in beansdb

        Data structure:
            REC = (Key, TRIPLE)
            TRIPLE = (VALUE, Version, Timestamp)
            VALUE = RAW_VALUE | REAL_VALUE
            RAW_VALUE = (flag, BYTES_VALUE)

        Args:
            path: beansdb data path
            filter: used to filter key
            depth: choice = [None, 0, 1, 2]. e.g. depth=2 assume dir tree like:
                    'path/[0-F]/[0-F]/%03d.data'
                If depth is None, dpark will guess.
            fullscan: NOT use index files, which contain (key, pos_in_datafile).
                pairs.
                Better use fullscan unless the filter selectivity is low.
                Effect of using index:
                    inefficient random access
                    one split(task) for each file instead of each moosefs chunk

                Omitted if filter is None.
            raw: VALUE = RAW_VALUE if raw else REAL_VALUE.
            only_latest: for each key, keeping the REC with the largest
                Timestamp. This will append a reduceByKey RDD.
                Need this because online beansdb data is log structured.
        """

        key_filter = filter

        self.init()
        if key_filter is None:
            fullscan = True
        if isinstance(path, (tuple, list)):
            rdd = self.union([
                self.beansdb(p,
                             depth,
                             key_filter,
                             fullscan,
                             raw=True,
                             only_latest=False) for p in path
            ])
        else:
            path = os.path.realpath(path)
            assert os.path.exists(path), "%s does not exist" % path
            if os.path.isdir(path):
                subs = []
                if not depth:
                    # Leaf directory: collect the .data files directly.
                    subs = [
                        os.path.join(path, n) for n in os.listdir(path)
                        if n.endswith('.data')
                    ]
                if subs:
                    rdd = self.union([
                        BeansdbFileRDD(self, p, key_filter, fullscan, raw=True)
                        for p in subs
                    ])
                else:
                    # Recurse into the 16 hex-named subdirectories ('0'..'f').
                    subs = [os.path.join(path, '%x' % i) for i in range(16)]
                    rdd = self.union([
                        self.beansdb(p,
                                     depth and depth - 1,
                                     key_filter,
                                     fullscan,
                                     raw=True,
                                     only_latest=False) for p in subs
                        if os.path.exists(p)
                    ])
            else:
                rdd = BeansdbFileRDD(self, path, key_filter, fullscan, raw)

        # Keep only the newest version of each key (largest timestamp).
        if only_latest:
            num_splits = min(int(ceil(len(rdd) / 4)), 800)
            rdd = rdd.reduceByKey(lambda v1, v2: v1 if v1[2] > v2[2] else v2,
                                  numSplits=num_splits)
        # Decode raw (flag, bytes) values into real values.
        if not raw:
            rdd = rdd.mapValue(lambda v_ver_t: (restore_value(*v_ver_t[0]),
                                                v_ver_t[1], v_ver_t[2]))
        return rdd
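
A hedged usage sketch of the reader (hypothetical data path and key prefix; assumes the DparkContext API shown in Example #1):

from dpark import DparkContext

dc = DparkContext()

# Hypothetical layout: /data/beansdb/[0-f]/[0-f]/%03d.data, i.e. depth=2.
# fullscan=True skips the index files; only_latest=True appends a
# reduceByKey so each key keeps only its newest record.
rdd = dc.beansdb('/data/beansdb',
                 depth=2,
                 filter=lambda key: key.startswith(b'user:'),
                 fullscan=True,
                 only_latest=True)

# Each record is (key, (value, version, timestamp)).
for key, (value, version, ts) in rdd.take(5):
    print(key, version, ts)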