Example #1
0
class Worker(Logger):
    """
    Worker process that executes map and reduce tasks requested over MPI.

    The worker talks to rank 0 of the parent communicator. Beware that the
    constructor does not return until a MSG_QUIT message is received: it
    loads the configuration, optionally starts a DFS client, instantiates the
    mapper and the reducer and then runs the message loop.
    """

    # Seconds to wait before retrying a DFS download that did not succeed
    _DOWNLOAD_RETRY_DELAY = 2

    def __init__(self, fconf, master_id, worker_id):
        """
        Configure the worker and serve requests until MSG_QUIT is received
        @param fconf path of the JSON configuration file
        @param master_id the master ID (anything accepted by int())
        @param worker_id the worker ID (anything accepted by int())
        """
        super(Worker, self).__init__("Worker")

        self.comm = MPI.COMM_WORLD.Get_parent()

        # Close the configuration file deterministically instead of leaking
        # the handle until the garbage collector gets to it
        with open(fconf) as handle:
            self.conf = conf = json.load(handle)

        self.use_dfs = use_dfs = conf['dfs-enabled']

        self.datadir = conf['datadir']
        self.input_prefix = conf['input-prefix']
        self.output_prefix = conf['output-prefix']

        if use_dfs and not DFS_AVAILABLE:
            raise Exception("You need to install fsdfs in order to use the" \
                            " distributed mode. Otherwise just toggle it "  \
                            "off from the configuration file")
        elif use_dfs:
            self._start_dfs(conf, master_id, worker_id)

        # From here on the DFS client (if any) is running: make sure it is
        # stopped even if the setup or the message loop raises
        try:
            # Mapper and reducer must see the per-worker data directory
            # whenever the DFS branch replaced the configured one
            conf['datadir'] = self.datadir

            self.master_id = int(master_id)
            self.worker_id = int(worker_id)

            self.mapper = self.extract_cls(conf['map-module'], 'Mapper')(conf)
            self.reducer = self.extract_cls(conf['reduce-module'],
                                            'Reducer')(conf)

            # We provide a VFS abstraction
            self.mapper.setup(self)
            self.reducer.setup(self)

            self._serve()
        finally:
            if self.use_dfs:
                self.info("Stopping DFS client")
                self.fs.stop()
                self.info("Stopped")

    def _start_dfs(self, conf, master_id, worker_id):
        """
        Create a clean per-worker data directory and start the DFS client
        @param conf the configuration dictionary
        @param master_id the master ID (anything accepted by int())
        @param worker_id the worker ID (anything accepted by int())
        """
        dconf = conf['dfs-conf']

        # Every worker listens on its own port, offset by its ID
        host = '%s:%d' % (conf['dfs-host'],
                          conf['dfs-startport'] + int(worker_id))

        self.datadir = os.path.join(
            self.datadir,
            'master-{:06d}'.format(int(master_id)),
            'worker-{:06d}'.format(int(worker_id))
        )
        self.info("Creating directory structure in %s" % self.datadir)

        # Always start from an empty directory tree
        if os.path.exists(self.datadir):
            shutil.rmtree(self.datadir)

        os.makedirs(self.datadir)
        os.makedirs(os.path.join(self.datadir, self.input_prefix))
        os.makedirs(os.path.join(self.datadir, self.output_prefix))

        dconf['host'] = host
        dconf['datadir'] = self.datadir

        self.info('Starting DFS client on %s' % host)

        self.fs = Filesystem(dconf)
        self.fs.start()

    def _serve(self):
        """
        Announce availability and process requests until MSG_QUIT arrives
        """
        while True:
            self.comm.send(Message(MSG_AVAILABLE, 0, None), dest=0)
            msg = self.comm.recv()

            if msg.command == MSG_COMPUTE_MAP:
                self._run_task(self.mapper, msg, MSG_FINISHED_MAP, "Map")
            elif msg.command == MSG_COMPUTE_REDUCE:
                self._run_task(self.reducer, msg, MSG_FINISHED_REDUCE,
                               "Reduce")
            elif msg.command == MSG_SLEEP:
                time.sleep(msg.result)
            elif msg.command == MSG_QUIT:
                return

    def _run_task(self, task, request, reply_command, label):
        """
        Execute a map or reduce request and send the outcome to rank 0
        @param task the object whose execute() method handles the request
        @param request the received message
        @param reply_command the command to set on the reply message
        @param label the name used in the performance log line
        """
        info, result = task.execute(request.result)

        reply = Message(reply_command, request.tag, result)
        reply.info = info

        # NOTE(review): presumably info is (bytes processed, elapsed seconds)
        # which would make this MiB/s -- confirm against the mapper/reducer.
        # A zero info[1] raises ZeroDivisionError here.
        self.info("%s performance: %.2f" % \
                  (label, info[0] / (1024 ** 2 * info[1])))

        self.comm.send(reply, dest=0)

    def _download_with_retry(self, fname):
        """
        Download a file from the DFS, retrying until the download succeeds
        @param fname the DFS name of the file to download
        """
        while True:
            try:
                if self.fs.downloadFile(fname):
                    return
            except Exception:
                # Retry on errors, but let KeyboardInterrupt and SystemExit
                # propagate so that the worker can still be interrupted
                pass

            # Back off both on errors and on unsuccessful results, otherwise
            # a falsy return value would make this loop spin without pause
            self.info("Failed to download %s. Retrying in %d sec" % \
                      (fname, self._DOWNLOAD_RETRY_DELAY))
            time.sleep(self._DOWNLOAD_RETRY_DELAY)

    def extract_cls(self, mname, fname):
        """
        Load a module and return one of its attributes
        @param mname the name of the module, as accepted by load_module
        @param fname the name of the attribute to extract
        @return the attribute named fname of the loaded module
        """
        module = load_module(mname)
        return getattr(module, fname)

    def pull_remote_files(self, reduce_idx, file_ids):
        """
        Pull a set of files from the global DFS. Files already present in the
        local data directory are skipped. Nothing is done if the DFS is
        disabled.
        @param reduce_idx the reducer ID
        @param file_ids an iterable object containing integers (they will be
                        casted to int())
        """
        if not self.use_dfs:
            return

        for fileid in file_ids:
            fname = "output-r{:06d}-p{:018d}".format(reduce_idx, int(fileid))
            fname = os.path.join(self.output_prefix, fname)

            full_path = os.path.join(self.datadir, fname)
            self.info("Checking %s" % full_path)

            if os.path.exists(full_path):
                self.info("Skipping %s. It is already present" % fname)
                continue

            self.info("Worker worker_id=%d is downloading file '%s'" % \
                      (self.worker_id, fname))

            self._download_with_retry(fname)

    def pull_remote_file(self, inp):
        """
        Pull a file from the global DFS. If the DFS is disabled only the
        local path is computed.
        @param inp a tuple in the form (file name, file id)
        @return a tuple (path of the file inside the data directory, file id)
        """
        filename, fileid = inp

        if self.use_dfs:
            self.info("Worker worker_id=%d is downloading file '%s'" % \
                      (self.worker_id, filename))
            self._download_with_retry(filename)

        return (os.path.join(self.datadir, filename), fileid)

    def push_local_file(self, fname, push=False):
        """
        Push a local file into the global DFS. Nothing is done if the DFS is
        disabled.
        @param fname the file to import, relative to the output prefix
        @param push if True the file will be pushed on the master.
        """
        if not self.use_dfs:
            return

        fname = os.path.join(self.output_prefix, fname)
        self.info("Pushing file '%s' into global DFS" % fname)
        self.fs.importFile(os.path.join(self.datadir, fname), fname)

        if push:
            ret = self.fs.pushFile(fname)
            self.info("Pushing returned %s" % str(ret))
Example #2
0
class WorkQueue(object):
    """
    The object is able to merge a generator and a queue and transparently
    expose a simple interface for retrieving objects. The generator is
    prioritized with respect to the queue, which is used as a backup in some
    sense.
    """
    def __init__(self, logger, gen, use_dfs=False, dfs_conf=None):
        """
        Initialize a WorkQueue instance
        @param logger a logger object
        @param gen a generator yielding (file name, file id) tuples
        @param use_dfs boolean indicating whether to use a DFS or not
        @param dfs_conf a dictionary containing the necessary parameters for
                        the DFS Master initialization. It is only read when
                        use_dfs is True and must contain a 'datadir' key.
        """
        self.logger = logger
        self.generator = gen
        self.dead_queue = []
        self.last_tag = 0
        self.use_dfs = use_dfs

        if use_dfs and not DFS_AVAILABLE:
            raise Exception("You need to install fsdfs in order to use the" \
                            " distributed mode. Otherwise just toggle it "  \
                            "off from the configuration file")
        elif use_dfs:
            self.datadir = dfs_conf['datadir']
            self.fs = Filesystem(dfs_conf)

    def push(self, item):
        """
        Push an item in the dead_queue. Please beware that it is not possible
        to push None objects
        @param item the item you want to push
        """
        if item is None:
            raise ValueError("Cannot push None in the queue")

        self.dead_queue.append(item)

    def next(self):
        """
        Extract the next value from the WorkQueue. The tag counter is
        incremented on every call, even when nothing can be retrieved.
        @return a WorkerStatus object or None if the retrieve is not possible
        """
        self.last_tag += 1

        # Only the generator call is guarded: a StopIteration escaping from
        # the DFS import below must not be mistaken for an exhausted
        # generator
        try:
            # Here value is a tuple (path to file, file id). The next()
            # builtin works with both Python 2 and Python 3 iterators.
            fname, fid = next(self.generator)
        except StopIteration:
            # Generator exhausted: fall back to the previously pushed items
            if self.dead_queue:
                value = self.dead_queue.pop(0)
                return WorkerStatus(TYPE_MAP, self.last_tag, value)

            return None

        if self.use_dfs:
            self.logger.info("Publishing file '%s' from '%s'" % \
                             (fname, self.datadir))

            self.fs.importFile(os.path.join(self.datadir, fname), fname)

        return WorkerStatus(TYPE_MAP, self.last_tag, (fname, fid))

    def pop(self):
        """
        An alias for the next method
        @return a WorkerStatus object or None if the retrieve is not possible
        """
        return self.next()