# Stdlib and mpi4py imports used below; project-internal names (Logger,
# Message, the MSG_* constants, Filesystem, DFS_AVAILABLE, load_module,
# WorkerStatus, TYPE_MAP) are assumed to be imported at the top of the
# module.
import os
import json
import time
import shutil

from mpi4py import MPI


class Worker(Logger):
    def __init__(self, fconf, master_id, worker_id):
        super(Worker, self).__init__("Worker")

        finished = False
        self.comm = MPI.COMM_WORLD.Get_parent()

        # Here we also need to handle the configuration file somehow
        with open(fconf) as f:
            self.conf = conf = json.load(f)

        self.use_dfs = use_dfs = conf['dfs-enabled']

        self.datadir = self.conf['datadir']
        self.input_prefix = self.conf['input-prefix']
        self.output_prefix = self.conf['output-prefix']

        if use_dfs and not DFS_AVAILABLE:
            raise Exception("You need to install fsdfs in order to use the"
                            " distributed mode. Otherwise just toggle it"
                            " off from the configuration file")
        elif use_dfs:
            dconf = conf['dfs-conf']
            host = '%s:%d' % (conf['dfs-host'],
                              conf['dfs-startport'] + int(worker_id))

            self.datadir = os.path.join(
                self.datadir,
                'master-{:06d}'.format(int(master_id)),
                'worker-{:06d}'.format(int(worker_id))
            )

            self.info("Creating directory structure in %s" % self.datadir)

            if os.path.exists(self.datadir):
                shutil.rmtree(self.datadir)

            os.makedirs(self.datadir)
            os.makedirs(os.path.join(self.datadir, self.input_prefix))
            os.makedirs(os.path.join(self.datadir, self.output_prefix))

            dconf['host'] = host
            dconf['datadir'] = self.datadir

            self.info('Starting DFS client on %s' % host)
            self.fs = Filesystem(dconf)
            self.fs.start()

            # Here we need somehow to override the default scheme in case
            # of DFS
            conf['datadir'] = self.datadir

        self.master_id = int(master_id)
        self.worker_id = int(worker_id)

        self.mapper = self.extract_cls(conf['map-module'], 'Mapper')(conf)
        self.reducer = self.extract_cls(conf['reduce-module'], 'Reducer')(conf)

        # We provide a VFS abstraction
        self.mapper.setup(self)
        self.reducer.setup(self)

        while not finished:
            self.comm.send(Message(MSG_AVAILABLE, 0, None), dest=0)
            msg = self.comm.recv()

            if msg.command == MSG_COMPUTE_MAP:
                info, result = self.mapper.execute(msg.result)
                msg = Message(MSG_FINISHED_MAP, msg.tag, result)
                msg.info = info
                # Presumably MB/s, assuming info is (bytes processed,
                # seconds elapsed)
                self.info("Map performance: %.2f" %
                          (info[0] / (1024 ** 2 * info[1])))
                self.comm.send(msg, dest=0)
            elif msg.command == MSG_COMPUTE_REDUCE:
                info, result = self.reducer.execute(msg.result)
                msg = Message(MSG_FINISHED_REDUCE, msg.tag, result)
                msg.info = info
                self.info("Reduce performance: %.2f" %
                          (info[0] / (1024 ** 2 * info[1])))
                self.comm.send(msg, dest=0)
            elif msg.command == MSG_SLEEP:
                time.sleep(msg.result)
            elif msg.command == MSG_QUIT:
                finished = True

        if self.use_dfs:
            self.info("Stopping DFS client")
            self.fs.stop()

        self.info("Stopped")

    def extract_cls(self, mname, fname):
        module = load_module(mname)
        return getattr(module, fname)

    def pull_remote_files(self, reduce_idx, file_ids):
        """
        Pull a set of files from the global DFS

        @param reduce_idx the reducer ID
        @param file_ids an iterable object containing integers (they will
                        be cast with int())
        """
        if not self.use_dfs:
            return

        for fileid in file_ids:
            fname = "output-r{:06d}-p{:018d}".format(reduce_idx, int(fileid))
            fname = os.path.join(self.output_prefix, fname)
            full_path = os.path.join(self.datadir, fname)

            self.info("Checking %s" % full_path)

            if os.path.exists(full_path):
                self.info("Skipping %s. It is already present" % fname)
                continue

            self.info("Worker worker_id=%d is downloading file '%s'" %
                      (self.worker_id, fname))
            downloaded = False

            while not downloaded:
                try:
                    downloaded = self.fs.downloadFile(fname)
                except Exception:
                    self.info("Failed to download %s. Retrying in 2 sec" %
                              fname)
                    time.sleep(2)

    def pull_remote_file(self, inp):
        """
        Pull a file from the global DFS

        @param inp a tuple in the form (file name, file id)
        """
        filename, fileid = inp

        if self.use_dfs:
            self.info("Worker worker_id=%d is downloading file '%s'" %
                      (self.worker_id, filename))
            downloaded = False

            while not downloaded:
                try:
                    downloaded = self.fs.downloadFile(filename)
                except Exception:
                    self.info("Failed to download %s. Retrying in 2 sec" %
                              filename)
                    time.sleep(2)

        return (os.path.join(self.datadir, filename), fileid)

    def push_local_file(self, fname, push=False):
        """
        Push a local file into the global DFS

        @param fname the file to import
        @param push if True the file will also be pushed to the master
        """
        if not self.use_dfs:
            return

        fname = os.path.join(self.output_prefix, fname)

        self.info("Pushing file '%s' into global DFS" % fname)
        self.fs.importFile(os.path.join(self.datadir, fname), fname)

        if push:
            ret = self.fs.pushFile(fname)
            self.info("Pushing returned %s" % str(ret))
class WorkQueue(object): """ The object is able to merge a generator and a queue and trasparently expose a simple interface for retrieving objects. The generator is prioritized with respect to the queue, which is used as a backup in some sense. """ def __init__(self, logger, gen, use_dfs=False, dfs_conf=None): """ Initialize a WorkQueue instance @param logger a logger object @param gen a generator @param use_dfs boolean indicating whether to use a DFS or not @param dfs_conf a dictionary containing the necessary parameters for the DFS Master initialization. """ self.logger = logger self.generator = gen self.dead_queue = [] self.last_tag = 0 self.use_dfs = use_dfs if use_dfs and not DFS_AVAILABLE: raise Exception("You need to install fsdfs in order to use the" \ " distributed mode. Otherwise just toggle it " \ "off from the configuration file") elif use_dfs: self.datadir = dfs_conf['datadir'] self.fs = Filesystem(dfs_conf) def push(self, item): """ Push an item in the dead_queue. Please beware that it is not possible to push None objects @param item the item you want to push """ if item is None: raise ValueError("Cannot push None in the queue") self.dead_queue.append(item) def next(self): """ Extract the next value from the WorkQueue @return an object or None if the retrieve is not possible """ self.last_tag += 1 try: # Here value is a tuple (path to file, file id) fname, fid = self.generator.next() if self.use_dfs: self.logger.info("Publishing file '%s' from '%s'" % \ (fname, self.datadir)) self.fs.importFile(os.path.join(self.datadir, fname), fname) return WorkerStatus(TYPE_MAP, self.last_tag, (fname, fid)) except StopIteration: if self.dead_queue: value = self.dead_queue.pop(0) return WorkerStatus(TYPE_MAP, self.last_tag, value) else: return None def pop(self): """ An alias for the next method @return an object """ return self.next()
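# --- Hypothetical usage sketch, not part of the original source -----------
# How a master-side loop might drain a WorkQueue. The generator is consumed
# first; items that failed on a worker can be re-queued with push() and are
# handed out again once the generator is exhausted. The sketch assumes it
# runs in this module (so WorkerStatus and TYPE_MAP are in scope), uses the
# stdlib logging module as the logger, and keeps the DFS disabled so it
# stays self-contained; the file names are made up.
import logging


def _example_inputs():
    # Yields (file name, file id) tuples, the format next() unpacks
    for fid, fname in enumerate(['part-000', 'part-001', 'part-002']):
        yield (fname, fid)


def _example_drain():
    logging.basicConfig(level=logging.INFO)
    queue = WorkQueue(logging.getLogger('master'), _example_inputs())

    # Simulate a worker failing on the first item: push the raw
    # (file name, file id) tuple back so it is retried after the
    # generator runs dry
    queue.next()
    queue.push(('part-000', 0))

    status = queue.next()
    while status is not None:
        # Hand `status` to an idle worker here
        status = queue.next()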