Example #1
0
def stat_input_stream(fd, size, url, params):
    """Yield one ``('', stats)`` record describing the marble behind *url*.

    ``stats`` maps ``'_'`` to the marble's row count and, for every field
    that has an index sub-database, the field name to
    ``ms_entries / number_rows`` of that index.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble; the third component returned by
        ``disco.util.urlsplit`` is handed to ``disco.util.localize``
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :raises Exception: any failure is printed and then re-raised unchanged
    """
    from disco import util
    from hustle.core.marble import MarbleStream

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line on Python 2.
        raise

    otab = None
    try:
        fle = util.localize(rest, disco_data=params._task.disco_data,
                            ddfs_data=params._task.ddfs_data)
        otab = MarbleStream(fle)
        rows = otab.number_rows
        # NOTE(review): a marble with zero rows makes the division below
        # raise ZeroDivisionError -- confirm whether empty marbles can occur.
        frows = float(rows)
        rval = {'_': rows}
        for field, (subdb, subindexdb, _, column, _) in otab.dbs.iteritems():
            # Only fields that carry an index sub-database get a ratio.
            if subindexdb:
                rval[field] = subindexdb.stat(otab.txn)['ms_entries'] / frows
        yield '', rval
    except Exception as e:
        print("Gibbers: %s" % e)
        raise
    finally:
        # Release the marble even when the consumer stops early or we fail.
        if otab:
            otab.close()
Example #2
0
def stat_input_stream(fd, size, url, params):
    """Yield one ``('', stats)`` record describing the marble behind *url*.

    ``stats`` maps ``'_'`` to the marble's row count and, for every field
    that has an index sub-database, the field name to
    ``ms_entries / number_rows`` of that index.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble; the third component returned by
        ``disco.util.urlsplit`` is handed to ``disco.util.localize``
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :raises Exception: any failure is printed and then re-raised unchanged
    """
    from disco import util
    from hustle.core.marble import MarbleStream

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line on Python 2.
        raise

    otab = None
    try:
        fle = util.localize(rest,
                            disco_data=params._task.disco_data,
                            ddfs_data=params._task.ddfs_data)
        otab = MarbleStream(fle)
        rows = otab.number_rows
        # NOTE(review): a marble with zero rows makes the division below
        # raise ZeroDivisionError -- confirm whether empty marbles can occur.
        frows = float(rows)
        rval = {
            '_': rows,
        }
        for field, (subdb, subindexdb, _, column, _) in otab.dbs.iteritems():
            # Only fields that carry an index sub-database get a ratio.
            if subindexdb:
                rval[field] = subindexdb.stat(otab.txn)['ms_entries'] / frows
        yield '', rval
    except Exception as e:
        print("Gibbers: %s" % e)
        raise
    finally:
        # Release the marble even when the consumer stops early or we fail.
        if otab:
            otab.close()
Example #3
0
 def test_marble_stream_get(self):
     """Check that ``MarbleStream.get`` returns every stored album field.

     For each partition file in ``self.files`` the albums of that partition
     are walked in ``self.albums`` order, with row ids counted from 1, and
     every key/value of the album is compared with ``stream.get(key, rowid)``.
     """
     for date, path in self.files.iteritems():
         stream = MarbleStream(path)
         try:
             rowid = 1
             for album in self.albums:
                 # Only albums stored in this partition live in this marble.
                 if album[_PARTITIONS] != date:
                     continue
                 for k, v in album.iteritems():
                     self.assertEqual(v, stream.get(k, rowid))
                 rowid += 1
         finally:
             # Close the marble even when an assertion above fails.
             stream.close()
Example #4
0
 def test_marble_stream_get(self):
     """Check that ``MarbleStream.get`` returns every stored album field.

     For each partition file in ``self.files`` the albums of that partition
     are walked in ``self.albums`` order, with row ids counted from 1, and
     every key/value of the album is compared with ``stream.get(key, rowid)``.
     """
     for date, path in self.files.iteritems():
         stream = MarbleStream(path)
         try:
             rowid = 1
             for album in self.albums:
                 # Only albums stored in this partition live in this marble.
                 if album[_PARTITIONS] != date:
                     continue
                 for k, v in album.iteritems():
                     self.assertEqual(v, stream.get(k, rowid))
                 rowid += 1
         finally:
             # Close the marble even when an assertion above fails.
             stream.close()
Example #5
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Stream the rows of one marble that are selected by *wheres*.

    Every entry of *wheres* whose ``_name`` equals the marble's name selects
    rows: a non-partition ``Expr`` is called with the open marble to obtain
    the matching rows, anything else selects the whole table.  For each
    selected row a ``(row, ())`` pair is yielded; ``row`` holds one value per
    entry of ``key_names[index]`` (``None`` for a ``None`` column), preceded
    by the index of the where clause when *gen_where_index* is true.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble; the third component returned by
        ``disco.util.urlsplit`` is handed to ``disco.util.localize``
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :param wheres: sequence of where clauses (tables or expressions)
    :param gen_where_index: when true, prefix every row with the clause index
    :param key_names: per-clause sequences of column names, indexed like
        *wheres*
    :raises Exception: a failure to split *url* is printed and re-raised
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat
    empty = ()

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line on Python 2.
        raise

    fle = util.localize(rest, disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(fle)
        bitmaps = {}

        for index, where in enumerate(wheres):
            # do not process where clauses that have nothing to do with this marble
            if where._name == otab.marble._name:
                if type(where) is Expr and not where.is_partition:
                    bm = where(otab)
                    bitmaps[index] = (bm, len(bm))
                else:
                    # it is either the table itself, or a partition expression.
                    # Either way, returns the entire table
                    bitmaps[index] = (otab.iter_all(), otab.number_rows)

        for index, (bitmap, blen) in bitmaps.iteritems():
            prefix_gen = [repeat(index, blen)] if gen_where_index else []

            # One iterator per output column; zipping them yields the rows.
            row_iter = prefix_gen + \
                [otab.mget(col, bitmap) if col is not None else repeat(None, blen)
                 for col in key_names[index]]

            for row in izip(*row_iter):
                yield row, empty

    finally:
        # Release the marble even when the consumer stops early or we fail.
        if otab:
            otab.close()
Example #6
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Stream the rows of one marble that are selected by *wheres*.

    Every entry of *wheres* whose ``_name`` equals the marble's name selects
    rows: a non-partition ``Expr`` is called with the open marble to obtain
    the matching row ids, anything else selects the whole table.  For each
    selected row id a ``(record, ())`` pair is yielded; ``record`` is a tuple
    with one ``otab.get(col, row_id)`` value per entry of ``key_names[index]``
    (``None`` for a falsy column), preceded by the index of the where clause
    when *gen_where_index* is true.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble; the third component returned by
        ``disco.util.urlsplit`` is handed to ``disco.util.localize``
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :param wheres: sequence of where clauses (tables or expressions)
    :param gen_where_index: when true, prefix every record with the clause
        index
    :param key_names: per-clause sequences of column names, indexed like
        *wheres*
    :raises Exception: a failure to split *url* is printed and re-raised
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    empty = ()

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line on Python 2.
        raise

    fle = util.localize(rest, disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(fle)
        bitmaps = {}
        for index, where in enumerate(wheres):
            # do not process where clauses that have nothing to do with this marble
            if where._name == otab.marble._name:
                if type(where) is Expr and not where.is_partition:
                    bitmaps[index] = where(otab)
                else:
                    # it is either the table itself, or a partition expression.
                    # either way, return the entire table
                    bitmaps[index] = otab.iter_all()

        for index, bitmap in bitmaps.iteritems():
            prefix = [index] if gen_where_index else []
            for row_id in bitmap:
                record = [otab.get(col, row_id) if col else None
                          for col in key_names[index]]
                # this looks odd, but is faster than 'prefix + record'
                record[0:0] = prefix
                yield tuple(record), empty
    finally:
        # Release the marble even when the consumer stops early or we fail.
        if otab:
            otab.close()
Example #7
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index,
                        key_names):
    """Generate ``(row, ())`` pairs for the rows of one marble.

    A where clause takes part only when its ``_name`` matches the name of the
    opened marble.  A non-partition ``Expr`` is evaluated against the marble
    to find its rows; every other clause stands for the complete table.  Each
    produced ``row`` contains the values of the columns listed in
    ``key_names[index]`` (``None`` where the column is ``None``) and starts
    with the clause index when *gen_where_index* is true.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble file
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :param wheres: sequence of where clauses
    :param gen_where_index: whether rows are prefixed with the clause index
    :param key_names: column names per where clause
    :raises disco.util.DataError: when *url* cannot be split
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat
    empty = ()

    try:
        scheme, netloc, rest = util.urlsplit(url)
    except Exception as e:
        raise util.DataError(
            "Error handling hustle_input_stream for %s. %s" % (url, e), url)

    fle = util.localize(rest,
                        disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(fle)

        # clause position -> (selected rows, number of selected rows)
        selections = {}
        for position, clause in enumerate(wheres):
            applies = clause._name == otab.marble._name
            if not applies:
                # this clause belongs to some other marble
                continue
            if type(clause) is Expr and not clause.is_partition:
                matched = clause(otab)
                selections[position] = (matched, len(matched))
            else:
                # the table itself or a partition expression: take every row
                selections[position] = (otab.iter_all(), otab.number_rows)

        for position, (rows, count) in selections.iteritems():
            columns = []
            if gen_where_index:
                columns.append(repeat(position, count))
            for col in key_names[position]:
                if col is None:
                    columns.append(repeat(None, count))
                else:
                    columns.append(otab.mget(col, rows))

            # walking all column iterators in lockstep produces the rows
            for row in izip(*columns):
                yield row, empty
    finally:
        if otab:
            otab.close()
Example #8
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index,
                        key_names):
    """Stream the rows of one marble that are selected by *wheres*.

    Every entry of *wheres* whose ``_name`` equals the marble's name selects
    rows: a non-partition ``Expr`` is called with the open marble to obtain
    the matching row ids, anything else selects the whole table.  For each
    selected row id a ``(record, ())`` pair is yielded; ``record`` is a tuple
    with one ``otab.get(col, row_id)`` value per entry of ``key_names[index]``
    (``None`` for a falsy column), preceded by the index of the where clause
    when *gen_where_index* is true.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble; the third component returned by
        ``disco.util.urlsplit`` is handed to ``disco.util.localize``
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :param wheres: sequence of where clauses (tables or expressions)
    :param gen_where_index: when true, prefix every record with the clause
        index
    :param key_names: per-clause sequences of column names, indexed like
        *wheres*
    :raises Exception: a failure to split *url* is printed and re-raised
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    empty = ()

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line on Python 2.
        raise

    fle = util.localize(rest,
                        disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(fle)
        bitmaps = {}
        for index, where in enumerate(wheres):
            # do not process where clauses that have nothing to do with this marble
            if where._name == otab.marble._name:
                if type(where) is Expr and not where.is_partition:
                    bitmaps[index] = where(otab)
                else:
                    # it is either the table itself, or a partition expression.
                    # either way, return the entire table
                    bitmaps[index] = otab.iter_all()

        for index, bitmap in bitmaps.iteritems():
            prefix = [index] if gen_where_index else []
            for row_id in bitmap:
                record = [
                    otab.get(col, row_id) if col else None
                    for col in key_names[index]
                ]
                # this looks odd, but is faster than 'prefix + record'
                record[0:0] = prefix
                yield tuple(record), empty
    finally:
        # Release the marble even when the consumer stops early or we fail.
        if otab:
            otab.close()
Example #9
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index,
                        key_names, limit):
    """Generate ``(row, ())`` pairs for the rows of one marble, up to *limit*.

    A where clause takes part only when its ``_name`` matches the name of the
    opened marble.  A non-partition ``Expr`` is evaluated against the marble
    to find its rows; every other clause stands for the complete table.  When
    *limit* differs from ``sys.maxint`` only the first *limit* row ids of each
    clause are kept (copied into a fresh ``BitSet``).

    ``key_names[index]`` holds ``(column, column_fn)`` pairs.  A ``None``
    column yields ``None``; otherwise the column values are fetched with
    ``mget`` and, when ``column_fn`` is not ``None``, mapped through it.  Rows
    start with the clause index when *gen_where_index* is true.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble file
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :param wheres: sequence of where clauses
    :param gen_where_index: whether rows are prefixed with the clause index
    :param key_names: ``(column, column_fn)`` pairs per where clause
    :param limit: maximum rows per clause; ``sys.maxint`` means no limit
    :raises disco.util.DataError: when *url* cannot be split
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat, islice, imap
    from sys import maxint
    from pyebset import BitSet

    empty = ()

    try:
        scheme, netloc, rest = util.urlsplit(url)
    except Exception as e:
        raise util.DataError(
            "Error handling hustle_input_stream for %s. %s" % (url, e), url)

    fle = util.localize(rest,
                        disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(fle)

        # clause position -> (selected rows, number of selected rows)
        selections = {}
        for position, clause in enumerate(wheres):
            applies = clause._name == otab.marble._name
            if not applies:
                # this clause belongs to some other marble
                continue

            # anything that is not a plain filter expression (the table
            # itself, a partition expression) selects the entire table
            is_filter = type(clause) is Expr and not clause.is_partition
            candidates = clause(otab) if is_filter else otab.iter_all()

            if limit != maxint:
                capped = BitSet()
                for row_id in islice(candidates, 0, limit):
                    capped.set(row_id)
                selections[position] = (capped, len(capped))
            elif is_filter:
                selections[position] = (candidates, len(candidates))
            else:
                selections[position] = (candidates, otab.number_rows)

        for position, (rows, count) in selections.iteritems():
            columns = []
            if gen_where_index:
                columns.append(repeat(position, count))

            for col, column_fn in key_names[position]:
                if col is None:
                    columns.append(repeat(None, count))
                    continue
                values = otab.mget(col, rows)
                if column_fn is not None:
                    values = imap(column_fn, values)
                columns.append(values)

            # walking all column iterators in lockstep produces the rows
            for row in izip(*columns):
                yield row, empty
    finally:
        if otab:
            otab.close()
Example #10
0
    def test_marble_stream_bit_ops(self):
        """Exercise the ``bit_*`` index lookups of ``MarbleStream``.

        Runs ``bit_eq`` on the "name" and "genre" indexes of two partitions
        and the full set of comparison lookups on the "rating" index of the
        second one.

        NOTE: every check only asserts that the returned row ids lie inside
        the expected rows; an expected row that is missing from the result is
        not detected.
        """
        def assert_rows_within(label, bitset, allowed_rows):
            # Every row id produced by ``bitset`` must be in ``allowed_rows``.
            allowed = BitSet()
            for row in allowed_rows:
                allowed.set(row)
            for row in bitset:
                self.assertTrue(row in allowed,
                                "%s returned unexpected row %r" % (label, row))

        def check_name_index(stream, partition):
            # Albums of a partition are matched to row ids by walking
            # ``self.albums`` in order, counting from 1.
            rowid = 1
            for album in self.albums:
                if album[_PARTITIONS] != partition:
                    continue
                assert_rows_within('bit_eq("name", %r)' % (album["name"],),
                                   stream.bit_eq("name", album["name"]),
                                   [rowid])
                rowid += 1

        stream = MarbleStream(self.files["1992-10-03"])
        try:
            check_name_index(stream, "1992-10-03")
            assert_rows_within('bit_eq("genre", "R&R")',
                               stream.bit_eq("genre", "R&R"),
                               [1, 2, 3, 4])
        finally:
            # Close the marble even when an assertion above fails.
            stream.close()

        # (lookup method, argument, rows the result may contain)
        rating_cases = [
            # eq and not-eq
            ("bit_eq", 4, [4]),
            ("bit_eq", 3, [6]),
            ("bit_eq", 5, [1, 2, 3, 5]),
            ("bit_ne", 5, [4, 6]),
            ("bit_ne", 3, [1, 2, 3, 4, 5]),
            ("bit_ne", 4, [1, 2, 3, 5, 6]),
            # range comparisons: gt / ge / lt / le
            ("bit_ge", 3, [1, 2, 3, 4, 5, 6]),
            ("bit_gt", 3, [1, 2, 3, 4, 5]),
            ("bit_le", 3, [6]),
            ("bit_lt", 3, []),
            ("bit_lt", 5, [4, 6]),
            ("bit_le", 5, [1, 2, 3, 4, 5, 6]),
            ("bit_gt", 5, []),
            ("bit_ge", 5, [1, 2, 3, 5]),
            ("bit_le", 4, [4, 6]),
            ("bit_lt", 4, [6]),
            ("bit_ge", 4, [1, 2, 3, 4, 5]),
            ("bit_gt", 4, [1, 2, 3, 5]),
        ]

        stream = MarbleStream(self.files["1986-01-03"])
        try:
            check_name_index(stream, "1986-01-03")
            assert_rows_within('bit_eq("genre", "SoundTrack")',
                               stream.bit_eq("genre", "SoundTrack"),
                               [1, 2, 3, 4, 5, 6])
            for method, value, allowed_rows in rating_cases:
                lookup = getattr(stream, method)
                assert_rows_within('%s("rating", %r)' % (method, value),
                                   lookup("rating", value),
                                   allowed_rows)
        finally:
            # Close the marble even when an assertion above fails.
            stream.close()
Example #11
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names, limit):
    """Generate ``(row, ())`` pairs for the rows of one marble, up to *limit*.

    A where clause takes part only when its ``_name`` matches the name of the
    opened marble.  A non-partition ``Expr`` is evaluated against the marble
    to find its rows; every other clause stands for the complete table.  When
    *limit* differs from ``sys.maxint`` only the first *limit* row ids of each
    clause are kept (copied into a fresh ``BitSet``).

    ``key_names[index]`` holds ``(column, column_fn)`` pairs.  A ``None``
    column yields ``None``; otherwise the column values are fetched with
    ``mget`` and, when ``column_fn`` is not ``None``, mapped through it.  Rows
    start with the clause index when *gen_where_index* is true.

    :param fd: not used by this function
    :param size: not used by this function
    :param url: URL of the marble file
    :param params: object whose ``_task`` provides ``disco_data`` and
        ``ddfs_data``
    :param wheres: sequence of where clauses
    :param gen_where_index: whether rows are prefixed with the clause index
    :param key_names: ``(column, column_fn)`` pairs per where clause
    :param limit: maximum rows per clause; ``sys.maxint`` means no limit
    :raises disco.util.DataError: when *url* cannot be split
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat, islice, imap
    from sys import maxint
    from pyebset import BitSet

    empty = ()

    try:
        scheme, netloc, rest = util.urlsplit(url)
    except Exception as e:
        raise util.DataError(
            "Error handling hustle_input_stream for %s. %s" % (url, e), url)

    fle = util.localize(rest, disco_data=params._task.disco_data,
                        ddfs_data=params._task.ddfs_data)

    otab = None
    try:
        otab = MarbleStream(fle)

        # clause position -> (selected rows, number of selected rows)
        selections = {}
        for position, clause in enumerate(wheres):
            applies = clause._name == otab.marble._name
            if not applies:
                # this clause belongs to some other marble
                continue

            # anything that is not a plain filter expression (the table
            # itself, a partition expression) selects the entire table
            is_filter = type(clause) is Expr and not clause.is_partition
            candidates = clause(otab) if is_filter else otab.iter_all()

            if limit != maxint:
                capped = BitSet()
                for row_id in islice(candidates, 0, limit):
                    capped.set(row_id)
                selections[position] = (capped, len(capped))
            elif is_filter:
                selections[position] = (candidates, len(candidates))
            else:
                selections[position] = (candidates, otab.number_rows)

        for position, (rows, count) in selections.iteritems():
            columns = []
            if gen_where_index:
                columns.append(repeat(position, count))

            for col, column_fn in key_names[position]:
                if col is None:
                    columns.append(repeat(None, count))
                    continue
                values = otab.mget(col, rows)
                if column_fn is not None:
                    values = imap(column_fn, values)
                columns.append(values)

            # walking all column iterators in lockstep produces the rows
            for row in izip(*columns):
                yield row, empty
    finally:
        if otab:
            otab.close()
Example #12
0
    def test_marble_stream_bit_ops(self):
        """Exercise the ``bit_*`` index lookups of ``MarbleStream``.

        Runs ``bit_eq`` on the "name" and "genre" indexes of two partitions
        and the full set of comparison lookups (including the multi-value
        ``bit_eq_ex`` / ``bit_ne_ex``) on the "rating" index of the second one.

        NOTE: every check only asserts that the returned row ids lie inside
        the expected rows; an expected row that is missing from the result is
        not detected.
        """
        def assert_rows_within(label, bitset, allowed_rows):
            # Every row id produced by ``bitset`` must be in ``allowed_rows``.
            allowed = BitSet()
            for row in allowed_rows:
                allowed.set(row)
            for row in bitset:
                self.assertTrue(row in allowed,
                                "%s returned unexpected row %r" % (label, row))

        def check_name_index(stream, partition):
            # Albums of a partition are matched to row ids by walking
            # ``self.albums`` in order, counting from 1.
            rowid = 1
            for album in self.albums:
                if album[_PARTITIONS] != partition:
                    continue
                assert_rows_within('bit_eq("name", %r)' % (album["name"],),
                                   stream.bit_eq("name", album["name"]),
                                   [rowid])
                rowid += 1

        stream = MarbleStream(self.files["1992-10-03"])
        try:
            check_name_index(stream, "1992-10-03")
            assert_rows_within('bit_eq("genre", "R&R")',
                               stream.bit_eq("genre", "R&R"),
                               [1, 2, 3, 4])
        finally:
            # Close the marble even when an assertion above fails.
            stream.close()

        # (lookup method, argument, rows the result may contain)
        rating_cases = [
            # eq and not-eq
            ("bit_eq", 4, [4]),
            ("bit_eq", 3, [6]),
            ("bit_eq", 5, [1, 2, 3, 5]),
            ("bit_ne", 5, [4, 6]),
            ("bit_ne", 3, [1, 2, 3, 4, 5]),
            ("bit_ne", 4, [1, 2, 3, 5, 6]),
            # eq_ex and ne_ex take a list of values
            ("bit_eq_ex", [3, 4], [4, 6]),
            ("bit_eq_ex", [5], [1, 2, 3, 5]),
            ("bit_ne_ex", [5], [4, 6]),
            ("bit_ne_ex", [3, 4], [1, 2, 3, 5]),
            # range comparisons: gt / ge / lt / le
            ("bit_ge", 3, [1, 2, 3, 4, 5, 6]),
            ("bit_gt", 3, [1, 2, 3, 4, 5]),
            ("bit_le", 3, [6]),
            ("bit_lt", 3, []),
            ("bit_lt", 5, [4, 6]),
            ("bit_le", 5, [1, 2, 3, 4, 5, 6]),
            ("bit_gt", 5, []),
            ("bit_ge", 5, [1, 2, 3, 5]),
            ("bit_le", 4, [4, 6]),
            ("bit_lt", 4, [6]),
            ("bit_ge", 4, [1, 2, 3, 4, 5]),
            ("bit_gt", 4, [1, 2, 3, 5]),
        ]

        stream = MarbleStream(self.files["1986-01-03"])
        try:
            check_name_index(stream, "1986-01-03")
            assert_rows_within('bit_eq("genre", "SoundTrack")',
                               stream.bit_eq("genre", "SoundTrack"),
                               [1, 2, 3, 4, 5, 6])
            for method, value, allowed_rows in rating_cases:
                lookup = getattr(stream, method)
                assert_rows_within('%s("rating", %r)' % (method, value),
                                   lookup("rating", value),
                                   allowed_rows)
        finally:
            # Close the marble even when an assertion above fails.
            stream.close()