def stat_input_stream(fd, size, url, params):
    """Yield a single ``('', stats)`` pair describing the marble behind *url*.

    ``stats`` is a dict that maps ``'_'`` to the marble's row count and, for
    every field whose entry in ``MarbleStream.dbs`` carries a truthy
    sub-index DB, the field name to that DB's ``'ms_entries'`` stat divided
    by the row count.

    :param fd: not used by this function.
    :param size: not used by this function.
    :param url: input URL; split with ``disco.util.urlsplit`` and resolved
        to a local path with ``disco.util.localize``.
    :param params: object exposing ``_task.disco_data`` and
        ``_task.ddfs_data``, both forwarded to ``localize``.
    :raises Exception: any failure is printed and then re-raised unchanged.
    """
    from disco import util
    from hustle.core.marble import MarbleStream

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling stat_input_stream for %s. %s" % (url, e))
        # Bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it from this line on Python 2.
        raise

    otab = None
    try:
        fle = util.localize(rest,
                            disco_data=params._task.disco_data,
                            ddfs_data=params._task.ddfs_data)
        otab = MarbleStream(fle)
        rows = otab.number_rows
        # Float denominator so the ratio below is never an integer division.
        # NOTE(review): a marble with zero rows and at least one sub-index
        # would divide by zero here -- confirm whether that can happen.
        frows = float(rows)
        rval = {'_': rows}
        for field, (_, subindexdb, _, _, _) in otab.dbs.iteritems():
            if subindexdb:
                rval[field] = subindexdb.stat(otab.txn)['ms_entries'] / frows
        yield '', rval
    except Exception as e:
        print("Gibbers: %s" % e)
        raise
    finally:
        # The stream is only bound once MarbleStream() succeeded.
        if otab:
            otab.close()
def stat_input_stream(fd, size, url, params):
    """Produce per-field index statistics for one marble.

    The generator yields exactly one pair ``('', rval)``.  ``rval['_']`` is
    the stream's ``number_rows``; for each field in the stream's ``dbs``
    mapping that has a truthy sub-index DB, ``rval[field]`` is that DB's
    ``stat(txn)['ms_entries']`` divided by the row count (as a float).

    :param fd: ignored.
    :param size: ignored.
    :param url: location of the marble; passed through
        ``disco.util.urlsplit`` and ``disco.util.localize``.
    :param params: must provide ``_task.disco_data`` and
        ``_task.ddfs_data`` for ``localize``.
    :raises Exception: errors are printed, then propagated as-is.
    """
    from disco import util
    from hustle.core.marble import MarbleStream

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling stat_input_stream for %s. %s" % (url, e))
        # Re-raise without naming the exception so the traceback still
        # points at the real failure site on Python 2.
        raise

    otab = None
    try:
        fle = util.localize(rest,
                            disco_data=params._task.disco_data,
                            ddfs_data=params._task.ddfs_data)
        otab = MarbleStream(fle)
        rows = otab.number_rows
        # NOTE(review): rows == 0 together with any sub-index would raise
        # ZeroDivisionError in the loop below -- verify this cannot occur.
        frows = float(rows)
        rval = {
            '_': rows,
        }
        for field, (_, subindexdb, _, _, _) in otab.dbs.iteritems():
            if subindexdb:
                rval[field] = subindexdb.stat(otab.txn)['ms_entries'] / frows
        yield '', rval
    except Exception as e:
        print("Gibbers: %s" % e)
        raise
    finally:
        if otab:
            otab.close()
def test_marble_stream_get(self):
    """Check ``MarbleStream.get`` against the fixture albums.

    For every ``(date, path)`` in ``self.files`` a stream is opened and the
    albums in ``self.albums`` whose ``_PARTITIONS`` value equals ``date``
    are walked in order; row ids are counted from 1.  Every key/value of
    each album must equal ``stream.get(key, rowid)``.
    """
    for date, path in self.files.iteritems():
        stream = MarbleStream(path)
        try:
            rowid = 1
            for album in self.albums:
                if album[_PARTITIONS] != date:
                    continue
                for k, v in album.iteritems():
                    self.assertEqual(v, stream.get(k, rowid))
                rowid += 1
        finally:
            # Close even when an assertion fails so a failing test does not
            # leave the stream open for the tests that follow.
            stream.close()
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Yield ``(row, ())`` pairs for the rows of one marble selected by *wheres*.

    :param fd: not used here.
    :param size: not used here.
    :param url: marble location; split with ``disco.util.urlsplit`` and
        localized with ``disco.util.localize``.
    :param params: supplies ``_task.disco_data`` / ``_task.ddfs_data``.
    :param wheres: sequence of where clauses.  Only clauses whose ``_name``
        equals the marble's ``_name`` are used.  A clause that is exactly an
        ``Expr`` and not a partition expression is called with the stream to
        obtain the matching rows; any other clause selects every row.
    :param gen_where_index: when truthy, each row is prefixed with the index
        of the where clause that produced it.
    :param key_names: indexable by where index; each entry lists the columns
        to fetch, with ``None`` producing a ``None`` placeholder.
    :raises disco.util.DataError: when *url* cannot be split.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat

    no_value = ()

    try:
        scheme, netloc, rest = util.urlsplit(url)
    except Exception as e:
        msg = "Error handling hustle_input_stream for %s. %s" % (url, e)
        raise util.DataError(msg, url)

    task = params._task
    local_path = util.localize(rest,
                               disco_data=task.disco_data,
                               ddfs_data=task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(local_path)

        # where index -> (row-id collection, number of rows in it)
        selections = {}
        for position, clause in enumerate(wheres):
            # Clauses that target a different marble are ignored.
            if clause._name == otab.marble._name:
                if type(clause) is not Expr or clause.is_partition:
                    # The table itself or a partition expression: all rows.
                    selections[position] = (otab.iter_all(), otab.number_rows)
                else:
                    matched = clause(otab)
                    selections[position] = (matched, len(matched))

        for position, (row_ids, count) in selections.iteritems():
            columns = []
            if gen_where_index:
                columns.append(repeat(position, count))
            for col in key_names[position]:
                if col is None:
                    columns.append(repeat(None, count))
                else:
                    columns.append(otab.mget(col, row_ids))
            for row in izip(*columns):
                yield row, no_value
    finally:
        if otab:
            otab.close()
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Yield ``(record, ())`` pairs for the rows of one marble selected by *wheres*.

    :param fd: not used here.
    :param size: not used here.
    :param url: marble location; split with ``disco.util.urlsplit`` and
        localized with ``disco.util.localize``.
    :param params: supplies ``_task.disco_data`` / ``_task.ddfs_data``.
    :param wheres: sequence of where clauses.  Only clauses whose ``_name``
        equals the marble's ``_name`` are used.  A clause that is exactly an
        ``Expr`` and not a partition expression is called with the stream to
        obtain the matching row ids; any other clause selects every row.
    :param gen_where_index: when truthy, each record starts with the index
        of the where clause that produced it.
    :param key_names: indexable by where index; each entry lists the columns
        to read per row.  A falsy column yields ``None`` in its position.
    :raises Exception: a failure to split *url* is printed and re-raised.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream

    empty = ()
    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # Bare ``raise`` preserves the original traceback; ``raise e`` would
        # restart it from this line on Python 2.
        raise

    fle = util.localize(rest,
                        disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)
    otab = None
    try:
        otab = MarbleStream(fle)
        bitmaps = {}
        for index, where in enumerate(wheres):
            # Do not process where clauses that have nothing to do with
            # this marble.
            if where._name == otab.marble._name:
                if type(where) is Expr and not where.is_partition:
                    bitmaps[index] = where(otab)
                else:
                    # Either the table itself or a partition expression:
                    # both select the entire table.
                    bitmaps[index] = otab.iter_all()

        for index, bitmap in bitmaps.iteritems():
            prefix = [index] if gen_where_index else []
            for row_id in bitmap:
                record = [otab.get(col, row_id) if col else None
                          for col in key_names[index]]
                # Slice-insert the prefix in place; the original author
                # notes this is faster than ``prefix + record``.
                record[0:0] = prefix
                yield tuple(record), empty
    finally:
        if otab:
            otab.close()
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Yield ``(row, ())`` pairs for the rows of one marble selected by *wheres*.

    Column values are fetched per column with ``MarbleStream.mget`` and the
    per-column iterators are zipped into row tuples.

    :param fd: not used here.
    :param size: not used here.
    :param url: marble location; split with ``disco.util.urlsplit`` and
        localized with ``disco.util.localize``.
    :param params: supplies ``_task.disco_data`` / ``_task.ddfs_data``.
    :param wheres: sequence of where clauses.  Only clauses whose ``_name``
        equals the marble's ``_name`` are used.  A clause that is exactly an
        ``Expr`` and not a partition expression is called with the stream to
        obtain the matching rows; any other clause selects every row.
    :param gen_where_index: when truthy, each row is prefixed with the index
        of the where clause that produced it.
    :param key_names: indexable by where index; each entry lists the columns
        to fetch, with ``None`` producing a ``None`` placeholder.
    :raises Exception: a failure to split *url* is printed and re-raised.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat

    empty = ()
    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # Bare ``raise`` preserves the original traceback; ``raise e`` would
        # restart it from this line on Python 2.
        raise

    fle = util.localize(rest,
                        disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)
    otab = None
    try:
        otab = MarbleStream(fle)
        # where index -> (row-id collection, number of rows in it)
        bitmaps = {}
        for index, where in enumerate(wheres):
            # Do not process where clauses that have nothing to do with
            # this marble.
            if where._name == otab.marble._name:
                if type(where) is Expr and not where.is_partition:
                    bm = where(otab)
                    bitmaps[index] = (bm, len(bm))
                else:
                    # Either the table itself or a partition expression:
                    # both select the entire table.
                    bitmaps[index] = (otab.iter_all(), otab.number_rows)

        for index, (bitmap, blen) in bitmaps.iteritems():
            prefix_gen = [repeat(index, blen)] if gen_where_index else []
            row_iter = prefix_gen + [
                otab.mget(col, bitmap) if col is not None
                else repeat(None, blen)
                for col in key_names[index]
            ]
            for row in izip(*row_iter):
                yield row, empty
    finally:
        if otab:
            otab.close()
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Read one marble row by row and yield ``(record, ())`` pairs.

    :param fd: ignored.
    :param size: ignored.
    :param url: marble location, handled by ``disco.util.urlsplit`` and
        ``disco.util.localize``.
    :param params: must expose ``_task.disco_data`` and ``_task.ddfs_data``.
    :param wheres: where clauses; those whose ``_name`` differs from the
        marble's ``_name`` are skipped.  An exact ``Expr`` that is not a
        partition expression is called with the stream and its result is
        iterated for row ids; every other clause iterates
        ``MarbleStream.iter_all()``.
    :param gen_where_index: when truthy, the where-clause index is inserted
        at the front of each record.
    :param key_names: ``key_names[index]`` lists the columns read with
        ``MarbleStream.get`` for each row id; a falsy column gives ``None``.
    :raises Exception: a failure to split *url* is printed and re-raised.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream

    empty = ()
    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # Re-raise without naming the exception so the Python 2 traceback
        # still points at the real failure site.
        raise

    fle = util.localize(rest,
                        disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)
    otab = None
    try:
        otab = MarbleStream(fle)
        bitmaps = {}
        for index, where in enumerate(wheres):
            # Do not process where clauses that have nothing to do with
            # this marble.
            if where._name == otab.marble._name:
                if type(where) is Expr and not where.is_partition:
                    bitmaps[index] = where(otab)
                else:
                    # Either the table itself or a partition expression:
                    # both select the entire table.
                    bitmaps[index] = otab.iter_all()

        for index, bitmap in bitmaps.iteritems():
            prefix = [index] if gen_where_index else []
            for row_id in bitmap:
                record = [otab.get(col, row_id) if col else None
                          for col in key_names[index]]
                # In-place slice insert, kept from the original, which notes
                # it is faster than building ``prefix + record``.
                record[0:0] = prefix
                yield tuple(record), empty
    finally:
        if otab:
            otab.close()
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names, limit):
    """Yield ``(row, ())`` pairs for the rows of one marble selected by *wheres*.

    :param fd: not used here.
    :param size: not used here.
    :param url: marble location; split with ``disco.util.urlsplit`` and
        localized with ``disco.util.localize``.
    :param params: supplies ``_task.disco_data`` / ``_task.ddfs_data``.
    :param wheres: sequence of where clauses.  Only clauses whose ``_name``
        equals the marble's ``_name`` are used.  A clause that is exactly an
        ``Expr`` and not a partition expression is called with the stream to
        obtain the matching rows; any other clause selects every row.
    :param gen_where_index: when truthy, each row is prefixed with the index
        of the where clause that produced it.
    :param key_names: indexable by where index; each entry is a sequence of
        ``(column, column_fn)`` pairs.  A ``None`` column produces ``None``;
        a non-``None`` ``column_fn`` is mapped over the column's values.
    :param limit: maximum number of rows taken per where clause;
        ``sys.maxint`` means no limit.
    :raises disco.util.DataError: when *url* cannot be split.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat, islice, imap
    from sys import maxint
    from pyebset import BitSet

    no_value = ()

    try:
        scheme, netloc, rest = util.urlsplit(url)
    except Exception as e:
        msg = "Error handling hustle_input_stream for %s. %s" % (url, e)
        raise util.DataError(msg, url)

    task = params._task
    local_path = util.localize(rest,
                               disco_data=task.disco_data,
                               ddfs_data=task.ddfs_data)

    def first_rows(row_ids):
        # Copy at most `limit` ids into a fresh BitSet; return it with its size.
        capped = BitSet()
        for rid in islice(row_ids, 0, limit):
            capped.set(rid)
        return capped, len(capped)

    otab = None
    try:
        otab = MarbleStream(local_path)

        # where index -> (row-id collection, number of rows in it)
        selections = {}
        for position, clause in enumerate(wheres):
            # Clauses that target a different marble are ignored.
            if clause._name == otab.marble._name:
                # The table itself or a partition expression: all rows.
                whole_table = type(clause) is not Expr or clause.is_partition
                if limit != maxint:
                    source = otab.iter_all() if whole_table else clause(otab)
                    selections[position] = first_rows(source)
                elif whole_table:
                    selections[position] = (otab.iter_all(), otab.number_rows)
                else:
                    matched = clause(otab)
                    selections[position] = (matched, len(matched))

        for position, (row_ids, count) in selections.iteritems():
            columns = []
            if gen_where_index:
                columns.append(repeat(position, count))
            for col, column_fn in key_names[position]:
                if col is None:
                    columns.append(repeat(None, count))
                    continue
                values = otab.mget(col, row_ids)
                if column_fn is not None:
                    values = imap(column_fn, values)
                columns.append(values)
            for row in izip(*columns):
                yield row, no_value
    finally:
        if otab:
            otab.close()
def test_marble_stream_bit_ops(self):
    """Exercise the ``bit_*`` index lookups of ``MarbleStream``.

    Two partitions ("1992-10-03" and "1986-01-03") are opened in turn.  For
    each lookup the test asserts that every row id yielded by the returned
    bitset is contained in the expected set of ids.

    NOTE(review): these are containment checks only -- a lookup that returns
    fewer ids than expected (or none) still passes.  Confirm against the
    fixture data before tightening them to equality.
    """
    def within(bitset, allowed_ids):
        # Every id produced by `bitset` must be one of `allowed_ids`.
        bs = BitSet()
        for rid in allowed_ids:
            bs.set(rid)
        for i in bitset:
            self.assertTrue(i in bs)

    def check_name_index(stream, partition):
        # Albums of `partition` are expected at row ids 1, 2, ... in order.
        rowid = 1
        for album in self.albums:
            if album[_PARTITIONS] != partition:
                continue
            within(stream.bit_eq("name", album["name"]), [rowid])
            rowid += 1

    stream = MarbleStream(self.files["1992-10-03"])
    try:
        # test "name" index
        check_name_index(stream, "1992-10-03")
        # test "genre" index
        within(stream.bit_eq("genre", "R&R"), range(1, 5))
    finally:
        # Close even when an assertion fails so the stream is not leaked.
        stream.close()

    stream = MarbleStream(self.files["1986-01-03"])
    try:
        # test "name" index
        check_name_index(stream, "1986-01-03")
        # test "genre" index
        within(stream.bit_eq("genre", "SoundTrack"), range(1, 7))

        # test "rating" index: eq and not-eq
        within(stream.bit_eq("rating", 4), [4])
        within(stream.bit_eq("rating", 3), [6])
        within(stream.bit_eq("rating", 5), [1, 2, 3, 5])
        within(stream.bit_ne("rating", 5), [4, 6])
        within(stream.bit_ne("rating", 3), range(1, 6))
        within(stream.bit_ne("rating", 4), [1, 2, 3, 5, 6])

        # test "rating" index: ordering comparisons
        within(stream.bit_ge("rating", 3), range(1, 7))
        within(stream.bit_gt("rating", 3), range(1, 6))
        within(stream.bit_le("rating", 3), [6])
        within(stream.bit_lt("rating", 3), [])
        within(stream.bit_lt("rating", 5), [4, 6])
        within(stream.bit_le("rating", 5), range(1, 7))
        within(stream.bit_gt("rating", 5), [])
        within(stream.bit_ge("rating", 5), [1, 2, 3, 5])
        within(stream.bit_le("rating", 4), [4, 6])
        within(stream.bit_lt("rating", 4), [6])
        within(stream.bit_ge("rating", 4), range(1, 6))
        within(stream.bit_gt("rating", 4), [1, 2, 3, 5])
    finally:
        stream.close()
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names, limit):
    """Stream the selected rows of one marble as ``(row, ())`` pairs.

    :param fd: ignored.
    :param size: ignored.
    :param url: marble location, handled by ``disco.util.urlsplit`` and
        ``disco.util.localize``.
    :param params: must expose ``_task.disco_data`` and ``_task.ddfs_data``.
    :param wheres: where clauses; those whose ``_name`` differs from the
        marble's ``_name`` are skipped.  An exact ``Expr`` that is not a
        partition expression is called with the stream to get its rows;
        every other clause uses ``MarbleStream.iter_all()``.
    :param gen_where_index: when truthy, the where-clause index becomes the
        first element of each row.
    :param key_names: ``key_names[index]`` is a sequence of
        ``(column, column_fn)`` pairs; ``column`` of ``None`` yields ``None``
        and a non-``None`` ``column_fn`` is applied to each fetched value.
    :param limit: row cap applied to each where clause unless it equals
        ``sys.maxint``.
    :raises disco.util.DataError: when *url* cannot be split.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat, islice, imap
    from sys import maxint
    from pyebset import BitSet

    sentinel = ()

    try:
        scheme, netloc, rest = util.urlsplit(url)
    except Exception as e:
        msg = "Error handling hustle_input_stream for %s. %s" % (url, e)
        raise util.DataError(msg, url)

    marble_path = util.localize(rest,
                                disco_data=params._task.disco_data,
                                ddfs_data=params._task.ddfs_data)

    def take_limited(ids):
        # Collect the first `limit` ids into a new BitSet and pair it with
        # the number of ids it holds.
        picked = BitSet()
        for one_id in islice(ids, 0, limit):
            picked.set(one_id)
        return (picked, len(picked))

    otab = None
    try:
        otab = MarbleStream(marble_path)

        chosen = {}
        for idx, cond in enumerate(wheres):
            # Skip where clauses that belong to some other marble.
            if cond._name == otab.marble._name:
                if type(cond) is Expr and not cond.is_partition:
                    hits = cond(otab)
                    if limit != maxint:
                        chosen[idx] = take_limited(hits)
                    else:
                        chosen[idx] = (hits, len(hits))
                elif limit != maxint:
                    # Table or partition expression, capped at `limit` rows.
                    chosen[idx] = take_limited(otab.iter_all())
                else:
                    # Table or partition expression: the entire table.
                    chosen[idx] = (otab.iter_all(), otab.number_rows)

        for idx, (ids, how_many) in chosen.iteritems():
            if gen_where_index:
                feeds = [repeat(idx, how_many)]
            else:
                feeds = []
            for col, column_fn in key_names[idx]:
                if col is not None and column_fn is None:
                    feeds.append(otab.mget(col, ids))
                elif col is not None:
                    feeds.append(imap(column_fn, otab.mget(col, ids)))
                else:
                    feeds.append(repeat(None, how_many))
            for row in izip(*feeds):
                yield row, sentinel
    finally:
        if otab:
            otab.close()
def test_marble_stream_bit_ops(self):
    """Exercise the ``bit_*`` and ``bit_*_ex`` lookups of ``MarbleStream``.

    Two partitions ("1992-10-03" and "1986-01-03") are opened in turn.  For
    each lookup the test asserts that every row id yielded by the returned
    bitset is contained in the expected set of ids.

    NOTE(review): these are containment checks only -- a lookup that returns
    fewer ids than expected (or none) still passes.  Confirm against the
    fixture data before tightening them to equality.
    """
    def assert_within(bitset, allowed_ids, label=None):
        # Every id produced by `bitset` must be one of `allowed_ids`.
        bs = BitSet()
        for rid in allowed_ids:
            bs.set(rid)
        for i in bitset:
            self.assertTrue(i in bs, label)

    def check_name_index(stream, partition):
        # Albums of `partition` are expected at row ids 1, 2, ... in order.
        rowid = 1
        for album in self.albums:
            if album[_PARTITIONS] != partition:
                continue
            assert_within(stream.bit_eq("name", album["name"]), [rowid])
            rowid += 1

    stream = MarbleStream(self.files["1992-10-03"])
    try:
        # test "name" index
        check_name_index(stream, "1992-10-03")
        # test "genre" index
        assert_within(stream.bit_eq("genre", "R&R"), range(1, 5))
    finally:
        # Close even when an assertion fails so the stream is not leaked.
        stream.close()

    # (lookup method, argument, ids the result must stay within), in the
    # order the lookups are performed on the "rating" index.
    rating_cases = [
        # eq and not-eq
        ("bit_eq", 4, [4]),
        ("bit_eq", 3, [6]),
        ("bit_eq", 5, [1, 2, 3, 5]),
        ("bit_ne", 5, [4, 6]),
        ("bit_ne", 3, [1, 2, 3, 4, 5]),
        ("bit_ne", 4, [1, 2, 3, 5, 6]),
        # eq_ex and not_eq_ex
        ("bit_eq_ex", [3, 4], [4, 6]),
        ("bit_eq_ex", [5], [1, 2, 3, 5]),
        ("bit_ne_ex", [5], [4, 6]),
        ("bit_ne_ex", [3, 4], [1, 2, 3, 5]),
        # ordering comparisons
        ("bit_ge", 3, [1, 2, 3, 4, 5, 6]),
        ("bit_gt", 3, [1, 2, 3, 4, 5]),
        ("bit_le", 3, [6]),
        ("bit_lt", 3, []),
        ("bit_lt", 5, [4, 6]),
        ("bit_le", 5, [1, 2, 3, 4, 5, 6]),
        ("bit_gt", 5, []),
        ("bit_ge", 5, [1, 2, 3, 5]),
        ("bit_le", 4, [4, 6]),
        ("bit_lt", 4, [6]),
        ("bit_ge", 4, [1, 2, 3, 4, 5]),
        ("bit_gt", 4, [1, 2, 3, 5]),
    ]

    stream = MarbleStream(self.files["1986-01-03"])
    try:
        # test "name" index
        check_name_index(stream, "1986-01-03")
        # test "genre" index
        assert_within(stream.bit_eq("genre", "SoundTrack"), range(1, 7))
        # test "rating" index
        for op, arg, allowed in rating_cases:
            # The label identifies the failing case, since all cases share
            # one assertion line.
            label = "%s('rating', %r)" % (op, arg)
            assert_within(getattr(stream, op)("rating", arg), allowed, label)
    finally:
        stream.close()