def test_beansdb(self):
    N = 100
    l = list(range(N))
    # (key, value) pairs: keys are the utf-8 encoded string forms of 0..N-1.
    d = list(zip([str(x).encode('utf-8') for x in l], l))
    num_splits = 10
    rdd = self.sc.makeRDD(d, num_splits)
    with temppath('beansdb') as root:
        def newpath(c):
            return os.path.join(root, str(c))

        def check_rdd(_rdd, files, num_w, num_r):
            # num_w: number of .data files written; num_r: number of read splits.
            self.assertEqual(
                files,
                ['%s/%03d.data' % (path, i) for i in range(num_w)])
            self.assertEqual(len(_rdd), num_r)
            self.assertEqual(_rdd.count(), N)
            self.assertEqual(
                sorted(
                    _rdd.map(lambda k_v: (k_v[0], k_v[1][0])).collect()),
                sorted(d))
            s = _rdd.map(lambda x: x[1][0]).reduce(lambda x, y: x + y)
            self.assertEqual(s, sum(l))

        # Round trip 1: plain values, read back through the index files.
        path = newpath(0)
        files = rdd.saveAsBeansdb(path)
        rdd = self.sc.beansdb(path, depth=0, filter=lambda x: x != "")
        check_rdd(rdd, files, num_splits, num_splits)

        # Round trip 2: values with metadata, fullscan, one record per key.
        path = newpath(1)
        files = rdd.saveAsBeansdb(path, valueWithMeta=True)
        rdd = self.sc.beansdb(path, depth=0, fullscan=True, only_latest=True)
        num_splits_reduce = int(ceil(num_splits / 4))
        check_rdd(rdd, files, num_splits, num_splits_reduce)

        # Round trip 3: raw (flag, bytes) values, decoded by hand.
        path = newpath(num_splits_reduce)
        files = rdd.map(lambda k_v: (k_v[0], k_v[1][0])).saveAsBeansdb(path)
        rdd = self.sc.beansdb(path, raw=True, depth=0, fullscan=True)
        rdd = rdd.mapValue(lambda v: (restore_value(*v[0]), v[1], v[2]))
        check_rdd(rdd, files, num_splits_reduce, num_splits_reduce)
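# `temppath` is a helper from the test suite, not shown in this excerpt.
# A minimal sketch of such a context manager, assuming it only needs to
# hand out a scratch directory and remove it on exit (an older variant of
# this test used a fixed /tmp path plus shutil.rmtree). The body below is
# an assumption, not the suite's actual implementation:
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def temppath(prefix):
    # Fresh directory per run, so concurrent test runs cannot collide.
    root = tempfile.mkdtemp(prefix=prefix)
    try:
        yield root
    finally:
        # Clean up even when the test body raises.
        shutil.rmtree(root, ignore_errors=True)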
def beansdb(self, path, depth=None, filter=None, fullscan=False,
            raw=False, only_latest=False):
    '''(Key, (VALUE, Version, Timestamp)) data in beansdb

    Data structure:
        REC = (Key, TRIPLE)
        TRIPLE = (VALUE, Version, Timestamp)
        VALUE = RAW_VALUE | REAL_VALUE
        RAW_VALUE = (flag, BYTES_VALUE)

    Args:
        filter: used to filter keys.
        depth: one of [None, 0, 1, 2]. e.g. depth=2 assumes a dir tree
            like 'path/[0-F]/[0-F]/%03d.data'. If depth is None, dpark
            will guess.
        fullscan: do NOT use the index files, which contain
            (key, pos_in_datafile) pairs. Prefer fullscan unless the
            filter matches only a small fraction of keys.
            Effects of using the index:
                inefficient random access;
                one split (task) per file instead of per moosefs chunk.
            Ignored (fullscan is forced) if filter is None.
        raw: VALUE = RAW_VALUE if raw else REAL_VALUE.
        only_latest: for each key, keep the REC with the largest
            Timestamp. This appends a reduceByKey RDD. Needed because
            online beansdb data is log structured.
    '''
    key_filter = filter
    self.init()
    if key_filter is None:
        fullscan = True
    if isinstance(path, (tuple, list)):
        rdd = self.union([
            self.beansdb(p, depth, key_filter, fullscan,
                         raw=True, only_latest=False)
            for p in path
        ])
    else:
        path = os.path.realpath(path)
        assert os.path.exists(path), "%s does not exist" % path
        if os.path.isdir(path):
            subs = []
            if not depth:
                subs = [
                    os.path.join(path, n) for n in os.listdir(path)
                    if n.endswith('.data')
                ]
            if subs:
                rdd = self.union([
                    BeansdbFileRDD(self, p, key_filter, fullscan, raw=True)
                    for p in subs
                ])
            else:
                subs = [os.path.join(path, '%x' % i) for i in range(16)]
                rdd = self.union([
                    self.beansdb(p, depth and depth - 1, key_filter,
                                 fullscan, raw=True, only_latest=False)
                    for p in subs if os.path.exists(p)
                ])
        else:
            # Read raw triples here like the other branches; decoding
            # happens exactly once in the `if not raw` step below.
            rdd = BeansdbFileRDD(self, path, key_filter, fullscan, raw=True)

    # keep only the latest version of each key
    if only_latest:
        numSplits = min(int(ceil(len(rdd) / 4)), 800)
        rdd = rdd.reduceByKey(lambda v1, v2: v1 if v1[2] > v2[2] else v2,
                              numSplits=numSplits)
    if not raw:
        rdd = rdd.mapValue(
            lambda v_ver_t: (restore_value(*v_ver_t[0]),
                             v_ver_t[1], v_ver_t[2]))
    return rdd
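# Usage sketch for the reader above. The paths are hypothetical and `dc`
# stands in for a DparkContext instance; the argument combinations mirror
# the docstring and test_beansdb:
#
#   # flat directory of .data files; decode values; one record per key
#   rdd = dc.beansdb('/data/beansdb/0', depth=0, fullscan=True,
#                    only_latest=True)
#
#   # sharded tree like path/[0-F]/[0-F]/%03d.data
#   rdd = dc.beansdb('/data/beansdb', depth=2, fullscan=True)
#
#   # raw (flag, bytes) values, decoded by hand with restore_value
#   rdd = dc.beansdb('/data/beansdb/0', raw=True, depth=0, fullscan=True)
#   rdd = rdd.mapValue(lambda v: (restore_value(*v[0]), v[1], v[2]))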
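# The only_latest reduce above keeps, per key, the triple whose timestamp
# (index 2 of the TRIPLE) is largest. The same selection applied to plain
# Python data, with made-up records, purely for illustration:
def _pick_latest(records):
    latest = {}
    for key, triple in records:
        cur = latest.get(key)
        if cur is None or triple[2] > cur[2]:
            latest[key] = triple
    return latest

assert _pick_latest([
    (b'k1', ('old', 1, 100)),
    (b'k1', ('new', 2, 200)),
    (b'k2', ('only', 1, 50)),
]) == {b'k1': ('new', 2, 200), b'k2': ('only', 1, 50)}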