Ejemplo n.º 1
0
class Meta(object):
    """
    Stores metadata about labeled and intermediary datasets, including:

    - Dataset label
    - Function to apply
    - Buffer size

    It also is used for dataset removal.

    """
    def __init__(self, cache):
        # All metadata lives in one cache hash under this fixed label.
        self.label = "okmeta"
        self.cache = cache
        self.logger = Logger(self.label)
        # BUG FIX: createIntermediary() uses self.profiler, but it was
        # never initialized, raising AttributeError at runtime.
        self.profiler = Profiler()

    def register(self, dsLabel, obj):
        """Store pickled metadata `obj` for dataset `dsLabel`."""
        self.logger.debug("Registering '%s'" % dsLabel)
        self.cache.hset(self.label, dsLabel, pickle_dumps(obj))

    def get(self, dsLabel):
        """Return the unpickled metadata object for `dsLabel`."""
        self.logger.debug("Getting '%s'" % dsLabel)
        return pickle.loads(self.cache.get(self.label, dsLabel))

    def createIntermediary(self, ds):
        """
        Create intermediary label for dataset `ds`, using a cache counter
        to keep the label unique; record the cache timing.
        """
        # BUG FIX: original logged the undefined name `dsLabel` (NameError).
        self.logger.debug("Creating intermediary for '%s'" % ds.label)
        prefix = "%s_intermediary_" % ds.label

        timer = Timer()

        self.currentDsLabel = prefix + str(self.cache.incr(prefix))

        self.profiler.add("masterCache", timer.since())

        return self.currentDsLabel

    def remove(self, dsLabel):
        """Remove metadata and every cache key belonging to `dsLabel`."""
        self.logger.debug("Removing '%s'" % dsLabel)
        self.cache.hdel(self.label, dsLabel)

        # BUG FIX: original deleted `dsLabel` on each iteration instead of
        # the matched key `k`, leaving the matching keys behind.
        for k in self.cache.keys(dsLabel + "*"):
            self.cache.delete(k)

    def rename(self, label, newLabel):
        """Copy metadata from `label` to `newLabel` (old entry is kept)."""
        self.cache.hset(self.label, newLabel,
                        self.cache.get(self.label, label))
Ejemplo n.º 2
0
    def __init__(self, config, cache, bufferSize):
        """
        Initialize master state and bind the three zmq sockets:
        PUSH to workers (sender), PULL from workers (sink), and REP
        for client requests (server).
        """
        self.cache = cache
        self.config = config
        self.logger = Logger("master")
        self.meta = Meta(self.cache)
        self.dataSets = {}

        self.profiler = Profiler()

        # BUG FIX: removed the dead raw_input/input compatibility shim.
        # Assigning `raw_input` made it function-local, so the shim never
        # affected callers, and the name was never used here anyway.
        # Also removed the unused `localTimer`.

        # zmq init
        zmqTimer = Timer()
        context = zmq.Context()

        # Sender: fans work out to workers.
        self.sender = context.socket(zmq.PUSH)
        self.sender.bind("tcp://*:" +
                         str(self.config["cluster"]["master"]["port"]))
        self.logger.debug("Initialized sender socket")

        # Sink: collects worker result acknowledgements.
        self.sink = context.socket(zmq.PULL)
        self.sink.bind("tcp://*:" +
                       str(self.config["cluster"]["return"]["port"]))
        self.logger.debug("Initialized sink socket")

        # Server: request/reply channel for clients.
        self.server = context.socket(zmq.REP)
        self.server.bind("tcp://*:" + str(self.config["server"]["port"]))
        self.logger.debug("Initialized server socket")

        self.profiler.add("localZmq", zmqTimer.since())
Ejemplo n.º 3
0
#!/usr/bin/env python

from okdataset.clist import ChainableList
from okdataset.context import Context
from okdataset.logger import Logger

logger = Logger("map from existing example")

context = Context()
logger.info("Building dataset from existing")
ds = context.dataSet(label="big list", fromExisting=True)

logger.info("Calling map")
# map() returns the dataset itself, so both operations can be queued as a
# single chain before compute() executes them.
ds.map(lambda x: x * 2).map(lambda x: x + 3)
ds.compute()
logger.info("All done!")
Ejemplo n.º 4
0
    def __init__(self, cache, config, clist=None, label=None, fromExisting=False, bufferSize=None):
        """
        Either wrap an existing cached dataset (fromExisting=True, label
        required) or split `clist` into fixed-size buffers and push them
        to the cache under this dataset's label.
        """
        self.cache = cache
        self.config = config
        self.meta = Meta(self.cache)
        self.label = label if label else "okds_%s" % uuid.uuid1()
        self.opsList = []

        localTimer = Timer()

        self.profiler = Profiler()

        if clist is None and not fromExisting:
            raise ValueError("Must provide either clist or fromExisting")

        if clist is not None and fromExisting:
            raise ValueError("Cannot provide both clist and fromExisting")

        if fromExisting and bufferSize is not None:
            raise ValueError("Cannot specify bufferSize for existing dataset")

        if fromExisting and label is None:
            raise ValueError("Must specify label for existing dataset")

        if clist is not None:
            self.dsLen = len(clist)

        if bufferSize is not None:
            self.bufferSize = bufferSize
        else:
            self.bufferSize = self.config["cache"]["io"]["bufferSize"]

        self.logger = Logger("dataset '" + self.label + "'")

        # Current working dataset label.  This will change as new
        # intermediary datasets are created.
        self.currentDsLabel = self.label
        self.currentIsIntermediary = False

        if fromExisting:
            self.dsLen = self.cache.len(self.label)
        else:
            # remove existing
            self.logger.debug("Removing existing")

            # Total number of buffers; one extra for a partial trailing buffer.
            self.buffers = self.dsLen // self.bufferSize
            self.buffers = self.buffers + 1 if self.dsLen % self.bufferSize > 0 else self.buffers

            # BUG FIX: original iterated xrange(0, self.buffers + 1), pushing
            # one extra, empty buffer (the remainder is already accounted for
            # in self.buffers above).
            for i in xrange(0, self.buffers):
                start = self.bufferSize * i
                end = self.bufferSize * (i + 1)

                pickleTimer = Timer()
                buf = pickle_dumps(ChainableList(clist[start:end]))
                self.profiler.add("masterPickle", pickleTimer.since())

                cacheTimer = Timer()
                # BUG FIX: original pushed under the raw `label` argument,
                # which is None when the label was auto-generated above.
                self.cache.pushBuffer(self.label, i, buf)
                self.profiler.add("masterCache", cacheTimer.since())

            self.logger.debug("Initialized with %d buffers" % self.buffers)
            self.logger.debug(json.dumps(self.profiler.toDict(), indent=2))
Ejemplo n.º 5
0
class DataSet(ChainableList):
    """
    A distributed dataset backed by the cache.

    Operations (map/flatMap/filter) are queued in opsList and executed only
    when compute()/collect() is driven by the master.
    """
    def __init__(self, cache, config, clist=None, label=None, fromExisting=False, bufferSize=None):
        """
        Either wrap an existing cached dataset (fromExisting=True, label
        required) or split `clist` into fixed-size buffers and push them
        to the cache under this dataset's label.
        """
        self.cache = cache
        self.config = config
        self.meta = Meta(self.cache)
        self.label = label if label else "okds_%s" % uuid.uuid1()
        self.opsList = []

        localTimer = Timer()

        self.profiler = Profiler()

        if clist is None and not fromExisting:
            raise ValueError("Must provide either clist or fromExisting")

        if clist is not None and fromExisting:
            raise ValueError("Cannot provide both clist and fromExisting")

        if fromExisting and bufferSize is not None:
            raise ValueError("Cannot specify bufferSize for existing dataset")

        if fromExisting and label is None:
            raise ValueError("Must specify label for existing dataset")

        if clist is not None:
            self.dsLen = len(clist)

        if bufferSize is not None:
            self.bufferSize = bufferSize
        else:
            self.bufferSize = self.config["cache"]["io"]["bufferSize"]

        self.logger = Logger("dataset '" + self.label + "'")

        # Current working dataset label.  This will change as new
        # intermediary datasets are created.
        self.currentDsLabel = self.label
        self.currentIsIntermediary = False

        if fromExisting:
            self.dsLen = self.cache.len(self.label)
        else:
            # remove existing
            self.logger.debug("Removing existing")

            # Total number of buffers; one extra for a partial trailing buffer.
            self.buffers = self.dsLen // self.bufferSize
            self.buffers = self.buffers + 1 if self.dsLen % self.bufferSize > 0 else self.buffers

            # BUG FIX: original iterated xrange(0, self.buffers + 1), pushing
            # one extra, empty buffer (the remainder is already accounted for
            # in self.buffers above).
            for i in xrange(0, self.buffers):
                start = self.bufferSize * i
                end = self.bufferSize * (i + 1)

                pickleTimer = Timer()
                buf = pickle_dumps(ChainableList(clist[start:end]))
                self.profiler.add("masterPickle", pickleTimer.since())

                cacheTimer = Timer()
                # BUG FIX: original pushed under the raw `label` argument,
                # which is None when the label was auto-generated above.
                self.cache.pushBuffer(self.label, i, buf)
                self.profiler.add("masterCache", cacheTimer.since())

            self.logger.debug("Initialized with %d buffers" % self.buffers)
            self.logger.debug(json.dumps(self.profiler.toDict(), indent=2))


    def createIntermediary(self):
        """Create, record and return a fresh intermediary dataset label."""
        prefix = self.label + "_intermediary_"

        timer = Timer()
        self.currentDsLabel = prefix + str(self.cache.incr(prefix))
        self.logger.debug("Creating intermediary '%s'" % self.currentDsLabel)
        self.profiler.add("masterCache", timer.since())

        self.currentIsIntermediary = True

        return self.currentDsLabel

    def flatMap(self, fn):
        """Queue a flatMap operation; returns self for chaining."""
        self.opsList.append({ "method": "flatMap", "fn": fn })
        return self

    def map(self, fn):
        """Queue a map operation; returns self for chaining."""
        self.opsList.append({ "method": "map", "fn": fn })
        return self

    def filter(self, fn):
        """Queue a filter operation; returns self for chaining."""
        self.opsList.append({ "method": "filter", "fn": fn })
        return self

    # XXX No distributed implementation yet.
    #def reduce(self, fn):
    #    self.opsList.append({ "method": "reduce", "fn": fn })
    #    return self.collect()

    # list items must be tuples of the form (key, ChainableList(values)) - like spark's LabeledPoint
    def reduceByKey(self, fn):
        """Group local items by key (first tuple element) and reduce each group with `fn`."""
        res = ChainableList([])

        # groupby requires its input sorted by the grouping key.
        # NOTE: rewrote the Python-2-only `lambda (key, items):` tuple
        # unpacking into an equivalent portable form.
        groups = ChainableList([ (key, ChainableList(group)) for key, group in groupby(sorted(self), lambda x: x[0]) ])\
            .map(lambda kv: (kv[0], kv[1].map(lambda x: x[1])))

        for key, values in groups:
            self.logger.trace(values)
            res.append((key, reduce(fn, values)))

        return res

    def collect(self):
        """
        Pull every buffer of the current working dataset from the cache and
        return them as a single ChainableList.  Intermediary datasets are
        removed after collection.
        """
        self.profiler = Profiler()
        localTimer = Timer()

        res = ChainableList([])

        # Buffer keys are numeric offsets; sort numerically, not lexically.
        for k in sorted(self.cache.getKeys(self.currentDsLabel), key=lambda x: int(x)):
            cacheTimer = Timer()
            buf = self.cache.get(self.currentDsLabel, k)
            self.profiler.add("collectCache", cacheTimer.since())

            res.extend(pickle.loads(buf))

        self.profiler.add("collectMaster", localTimer.since())

        if self.currentIsIntermediary:
            # BUG FIX: added missing space in the log message.
            self.logger.debug("Removing intermediary " + self.currentDsLabel)
            self.meta.remove(self.currentDsLabel)

        self.currentDsLabel = self.label

        return res


    def getProfile(self, f=None):
        """Pass the profile dict to `f` if given, otherwise return it."""
        if f:
            f(self.profiler.toDict())
        else:
            return self.profiler.toDict()

    # NOTE(review): this method is shadowed by the `self.label` string
    # attribute assigned in __init__, so it is unreachable on instances.
    # Renaming it would change the public interface, so it is left as-is;
    # confirm intended behavior against callers.
    def label(self, label):
        self.meta.rename(self.currentDsLabel, label)
        self.currentDsLabel = label
        self.currentIsIntermediary = False

    def __del__(self):
        # Best-effort cleanup; may run during interpreter shutdown.
        self.meta.remove(self.currentDsLabel)
Ejemplo n.º 6
0
class Master(ChainableList):
    """
    Master is both the client-facing server process and also the grid controller.
    """
    def __init__(self, config, cache, bufferSize):
        """
        Initialize master state and bind the three zmq sockets:
        PUSH to workers (sender), PULL from workers (sink), and REP
        for client requests (server).
        """
        self.cache = cache
        self.config = config
        self.logger = Logger("master")
        self.meta = Meta(self.cache)
        self.dataSets = {}

        self.profiler = Profiler()

        # BUG FIX: removed the dead raw_input/input compatibility shim.
        # Assigning `raw_input` made it function-local, so the shim never
        # affected callers, and the name was never used here anyway.
        # Also removed the unused `localTimer`.

        # zmq init
        zmqTimer = Timer()
        context = zmq.Context()

        # Sender: fans work out to workers.
        self.sender = context.socket(zmq.PUSH)
        self.sender.bind("tcp://*:" +
                         str(self.config["cluster"]["master"]["port"]))
        self.logger.debug("Initialized sender socket")

        # Sink: collects worker result acknowledgements.
        self.sink = context.socket(zmq.PULL)
        self.sink.bind("tcp://*:" +
                       str(self.config["cluster"]["return"]["port"]))
        self.logger.debug("Initialized sink socket")

        # Server: request/reply channel for clients.
        self.server = context.socket(zmq.REP)
        self.server.bind("tcp://*:" + str(self.config["server"]["port"]))
        self.logger.debug("Initialized server socket")

        self.profiler.add("localZmq", zmqTimer.since())

    def compute(self, label, intermediaryLabel, opsList):
        """
        Distribute `opsList` over every buffer of dataset `label`, writing
        results under `intermediaryLabel`, and wait for all worker replies.
        """
        self.logger.debug("Starting compute on %s" % label)

        self.profiler = Profiler()
        localTimer = Timer()

        cacheTimer = Timer()
        keys = self.cache.getKeys(label)
        self.profiler.add("computeCache", cacheTimer.since())

        self.logger.debug("Got %d keys" % len(keys))

        source = label
        dest = intermediaryLabel

        # Workers look this metadata up to find the ops to run for `dest`.
        self.meta.register(dest, {
            "opsList": opsList,
            "buffers": len(keys),
            "isIntermediary": True
        })

        for key in keys:
            self.logger.trace("Sending key %s" % key)

            pickleTimer = Timer()
            msg = pickle_dumps({
                "offset": key,
                "sourceLabel": source,
                "destLabel": dest
            })
            self.profiler.add("computePickle", pickleTimer.since())

            zmqTimer = Timer()
            self.sender.send(msg)
            self.profiler.add("computeZmq", zmqTimer.since())

        results = 0

        # BUG FIX: originally waited for len(keys) - 1 replies, leaving one
        # result unread in the sink; that stale reply would then be counted
        # by the next compute() call.  One message is sent per key, so one
        # reply per key is expected.
        while results != len(keys):
            self.logger.trace("Received %d out of %d results" %
                              (results, len(keys)))

            zmqTimer = Timer()

            res = self.sink.recv_pyobj()

            self.profiler.add("computeZmq", zmqTimer.since())
            self.profiler.append(res["profiler"])

            results = results + 1

        self.profiler.add("computeOverall", localTimer.since())

        self.logger.info("compute complete")
        self.logger.debug(json.dumps(self.profiler.toDict(), indent=2))

        return self

    def getProfile(self, fn=None):
        """Pass the profile dict to `fn` if given, otherwise return it."""
        if fn:
            fn(self.profiler.toDict())
        else:
            return self.profiler.toDict()

    def mainLoop(self):
        """
        Serve client requests forever: create datasets, queue operations,
        and trigger distributed computes.
        """
        while True:
            self.logger.debug("Receiving")
            # NOTE(review): pickle.loads on data received over the network
            # is unsafe with untrusted clients.
            req = pickle.loads(self.server.recv())
            data = req.get("data")

            if req["method"] == "create":
                self.logger.debug("create called for dataset %s, id %s" %
                                  (req["data"]["label"], req["id"]))
                ds = DataSet(self.cache,
                             self.config,
                             data["clist"],
                             label=data["label"],
                             fromExisting=data["fromExisting"],
                             bufferSize=data["bufferSize"])
                print(ds.currentDsLabel)
                self.dataSets[req["id"]] = ds
                self.server.send(pickle_dumps({"status": "ok"}))

            elif req["method"] in ["map", "flatMap", "reduceByKey", "filter"]:
                # Queue the operation on the dataset; executed on compute.
                ds = self.dataSets[req["id"]]
                self.logger.debug(
                    "%s called for dataset %s, id %s" %
                    (req["method"], ds.currentDsLabel, req["id"]))

                getattr(ds, req["method"])(data)
                self.server.send(pickle_dumps({"status": "ok"}))

            elif req["method"] in ["collect", "reduce"]:
                ds = self.dataSets[req["id"]]
                self.logger.debug(
                    "%s called for dataset %s, id %s" %
                    (req["method"], ds.currentDsLabel, req["id"]))

                self.compute(ds.currentDsLabel, ds.createIntermediary(),
                             ds.opsList)
                res = ds.collect()

                # reduce is a collect followed by a local reduction.
                if req["method"] == "reduce":
                    res = res.reduce(data)

                self.server.send(pickle_dumps({"status": "ok", "data": res}))

            elif req["method"] == "compute":
                ds = self.dataSets[req["id"]]
                self.logger.debug("compute called for dataset %s, id %s" %
                                  (ds.currentDsLabel, req["id"]))

                self.compute(ds.currentDsLabel, ds.createIntermediary(),
                             ds.opsList)
                self.server.send(pickle_dumps({"status": "ok"}))

            else:
                self.logger.debug("unknown method %s" % req["method"])
Ejemplo n.º 7
0
#!/usr/bin/env python

from okdataset.clist import ChainableList
from okdataset.context import Context
from okdataset.logger import Logger

logger = Logger("flatmaparray example")

context = Context()
logger.info("Building big list")
l = ChainableList(range(0, 100))

logger.info("Building dataset")
ds = context.dataSet(l, label="flatMap list", bufferSize=1)

logger.info("Calling flatMap")

def fm(x):
    # Expand every element into three [element, letter] pairs.
    for letter in ("a", "b", "c"):
        yield [x, letter]

res = ds.flatMap(fm).collect()
print(res)
logger.info("All done!")
Ejemplo n.º 8
0
 def __init__(self, cache):
     # Fixed hash name under which all dataset metadata is stored.
     self.label = "okmeta"
     self.cache = cache
     self.logger = Logger(self.label)
Ejemplo n.º 9
0
#!/usr/bin/env python

from okdataset.clist import ChainableList
from okdataset.context import Context
from okdataset.dataset import DataSet
from okdataset.logger import Logger

logger = Logger("maparray example")

logger.info("Building big list")
l = ChainableList([1, 1, 1, 2, 2, 3, 6, 9, 9, 9, 12])

logger.info("Building dataset")
context = Context()
ds = context.dataSet(l, bufferSize=1)

a = 1
logger.info("Calling chain")
ds.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

print ds.collect()
Ejemplo n.º 10
0
    def __init__(self, config, cache):
        """
        Worker main loop: pull buffer jobs from the master, apply the
        registered ops list to each buffer, push the result back to the
        cache and acknowledge via the returner socket.  Never returns.
        """
        self.logger = Logger("worker")
        self.offsets = {}

        meta = Meta(cache)

        # Cached per-destination ops list; refreshed whenever destLabel
        # changes, to avoid a metadata fetch per message.
        self.currentDestLabel = ""
        self.opsList = None

        cluster = config["cluster"]

        context = zmq.Context()

        # PULL socket: receives work messages from the master's PUSH socket.
        receiver = context.socket(zmq.PULL)
        receiver.connect("tcp://" + cluster["master"]["host"] + ":" +
                         str(cluster["master"]["port"]))
        self.logger.debug("Initialized receiver socket")

        # PUSH socket: sends result acknowledgements back to the master's sink.
        returner = context.socket(zmq.PUSH)
        returner.connect("tcp://" + cluster["return"]["host"] + ":" +
                         str(cluster["return"]["port"]))
        self.logger.debug("Initialized returner socket")

        self.logger.info("Worker initialized")

        while True:
            profiler = Profiler()
            local = Timer()

            zmqTimer = Timer()
            msg = receiver.recv()
            profiler.add("workerZmq", zmqTimer.since())

            pickleTimer = Timer()
            # NOTE(review): pickle.loads on data received over the network
            # is unsafe with untrusted peers.
            msg = pickle.loads(msg)
            profiler.add("workerPickle", pickleTimer.since())
            self.logger.trace("Received message: " + str(msg))

            # Re-fetch the ops list only when the destination label changes.
            if self.currentDestLabel != msg["destLabel"]:
                self.opsList = meta.get(msg["destLabel"])["opsList"]
                self.currentDestLabel = msg["destLabel"]

            cacheTimer = Timer()
            buf = cache.get(msg["sourceLabel"], msg["offset"])
            profiler.add("workerCache", cacheTimer.since())

            pickleTimer = Timer()
            buf = pickle.loads(buf)
            profiler.add("workerPickle", pickleTimer.since())

            self.logger.trace("Received buffer")

            # Apply each queued operation (map/flatMap/filter/...) in order.
            res = buf
            for op in self.opsList:
                res = getattr(res, op["method"])(op["fn"])

            self.logger.trace("Processed buffer")

            reply = {
                "destLabel": msg["destLabel"],
                "offset": msg["offset"],
                "status": "ok",
                "profiler": profiler
            }

            # in case of flatMap
            if len(res) != len(buf):
                reply["size"] = len(res)

            pickleTimer = Timer()
            res = pickle_dumps(res)
            profiler.add("workerPickle", pickleTimer.since())

            cacheTimer = Timer()
            cache.pushBuffer(msg["destLabel"], msg["offset"], res)
            profiler.add("workerCache", cacheTimer.since())

            self.logger.trace("Processed buffer")

            profiler.add("workerOverall", local.since())

            returner.send_pyobj(reply)
            self.logger.trace("Reply sent")