Beispiel #1
0
    def __init__(self, fconf, handler):
        """
        Initialize a MasterServer instance.

        The JSON configuration file is read once and drives the setup of the
        template environment, the map/reduce bookkeeping structures, the work
        queue and the (not yet started) heartbeat thread. The underlying
        Server is initialized last, once the listening address is known.

        @param fconf the path to the configuration file
        @param handler the handler object in charge of managing HTTP requests
        """
        Logger.__init__(self, "Manager")

        # Read the configuration through a context manager so the file handle
        # is closed right away instead of being left to the garbage collector.
        with open(fconf) as conf_file:
            conf = json.load(conf_file)

        # Jinja2 initialization: templates live next to this source file.
        tmpl_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'templates')
        self.env = Environment(loader=FileSystemLoader(tmpl_path))
        self.status = ApplicationStatus()

        # This is a dictionary structure in the form
        # reduce_dict["group-name"] = [
        #   [ file list by unique integers, size in byte
        #   ] => Reduce-0
        #   [
        #   ] => Reduce-1
        # ]
        self.reduce_mark = set()
        self.reduce_dict = defaultdict(list)
        self.dead_reduce_dict = defaultdict(list)

        # This is a dictionary nick => Handler instance
        self.masters = {}
        self.last_id = -1
        self.pending_works = defaultdict(list) # nick => [work, ...]

        self.ping_max = int(conf["ping-max"])
        self.ping_interval = int(conf["ping-interval"])
        self.num_reducer = int(conf["num-reducer"])

        # One slot per reducer keeping track of the name of its output file;
        # "N/A" is the placeholder until a file is recorded for that slot.
        self.reduce_files = ["N/A"] * self.num_reducer
        self.results_printed = False

        # Load the input module and assign its generator to the work queue.
        # NOTE(review): if the module does not define "Input", cls is None and
        # the call below raises TypeError.
        module = load_module(conf["input-module"])
        cls = getattr(module, "Input", None)

        # Some code for the DFS
        generator = cls(fconf).input()
        self.use_dfs = use_dfs = conf['dfs-enabled']

        if use_dfs:
            dfsconf = conf['dfs-conf']
            # Expose the DFS master address under the 'host' key as well.
            dfsconf['host'] = dfsconf['master']

            self.path = conf['output-prefix']
        else:
            dfsconf = None

            # Without the DFS the output prefix is relative to the data dir.
            self.path = os.path.join(conf['datadir'], conf['output-prefix'])

        self.work_queue = WorkQueue(self.logger, generator, use_dfs, dfsconf)

        # Lock to synchronize access to the timestamps dictionary
        self.lock = Lock()
        self.timestamps = {} # nick => (send_ts:enum, ts:float)

        # Ping thread (created here, started elsewhere)
        self.hb_thread = Thread(target=self.hearthbeat)

        # Event to mark the end of the server
        self.finished = Event()

        self.addrinfo = (conf['master-host'], conf['master-port'])
        Server.__init__(self, self.addrinfo[0], self.addrinfo[1], handler)
Beispiel #2
0
class MasterServer(Server, Logger):
    def __init__(self, fconf, handler):
        """
        Initialize a MasterServer instance.

        The JSON configuration file is read once and drives the setup of the
        template environment, the map/reduce bookkeeping structures, the work
        queue and the (not yet started) heartbeat thread. The underlying
        Server is initialized last, once the listening address is known.

        @param fconf the path to the configuration file
        @param handler the handler object in charge of managing HTTP requests
        """
        Logger.__init__(self, "Manager")

        # Read the configuration through a context manager so the file handle
        # is closed right away instead of being left to the garbage collector.
        with open(fconf) as conf_file:
            conf = json.load(conf_file)

        # Jinja2 initialization: templates live next to this source file.
        tmpl_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'templates')
        self.env = Environment(loader=FileSystemLoader(tmpl_path))
        self.status = ApplicationStatus()

        # This is a dictionary structure in the form
        # reduce_dict["group-name"] = [
        #   [ file list by unique integers, size in byte
        #   ] => Reduce-0
        #   [
        #   ] => Reduce-1
        # ]
        self.reduce_mark = set()
        self.reduce_dict = defaultdict(list)
        self.dead_reduce_dict = defaultdict(list)

        # This is a dictionary nick => Handler instance
        self.masters = {}
        self.last_id = -1
        self.pending_works = defaultdict(list) # nick => [work, ...]

        self.ping_max = int(conf["ping-max"])
        self.ping_interval = int(conf["ping-interval"])
        self.num_reducer = int(conf["num-reducer"])

        # One slot per reducer keeping track of the name of its output file;
        # "N/A" is the placeholder until a file is recorded for that slot.
        self.reduce_files = ["N/A"] * self.num_reducer
        self.results_printed = False

        # Load the input module and assign its generator to the work queue.
        # NOTE(review): if the module does not define "Input", cls is None and
        # the call below raises TypeError.
        module = load_module(conf["input-module"])
        cls = getattr(module, "Input", None)

        # Some code for the DFS
        generator = cls(fconf).input()
        self.use_dfs = use_dfs = conf['dfs-enabled']

        if use_dfs:
            dfsconf = conf['dfs-conf']
            # Expose the DFS master address under the 'host' key as well.
            dfsconf['host'] = dfsconf['master']

            self.path = conf['output-prefix']
        else:
            dfsconf = None

            # Without the DFS the output prefix is relative to the data dir.
            self.path = os.path.join(conf['datadir'], conf['output-prefix'])

        self.work_queue = WorkQueue(self.logger, generator, use_dfs, dfsconf)

        # Lock to synchronize access to the timestamps dictionary
        self.lock = Lock()
        self.timestamps = {} # nick => (send_ts:enum, ts:float)

        # Ping thread (created here, started by run())
        self.hb_thread = Thread(target=self.hearthbeat)

        # Event to mark the end of the server
        self.finished = Event()

        self.addrinfo = (conf['master-host'], conf['master-port'])
        Server.__init__(self, self.addrinfo[0], self.addrinfo[1], handler)

    def run(self):
        """
        Start the server.

        Hooks the logger to the application status object, starts the
        distributed filesystem when the work queue uses it, launches the
        heartbeat thread and finally hands control to Server.run.
        """
        # Every logged message is also pushed to the application status
        # object, which makes it available through the web interface.
        push_handler = PushHandler(self.status.push_log)
        self.logger.addHandler(push_handler)

        queue = self.work_queue
        if queue.use_dfs:
            self.info("Starting Distributed Filesystem")
            queue.fs.start()

        banner = "Server started on http://%s:%d" % self.addrinfo
        self.info(banner)

        self.hb_thread.start()
        Server.run(self)

    def stop(self):
        """
        Stop the server.

        Sets the finished event and, when the work queue uses the
        distributed filesystem, stops the filesystem as well.
        """
        self.finished.set()

        queue = self.work_queue
        if not queue.use_dfs:
            return

        queue.fs.stop()

    def retrieve_file(self, nick, reduce_idx, file):
        """
        Record the output file of a reducer.

        @param nick the nick of the group the file belongs to
        @param reduce_idx the index of the reducer slot to fill
        @param file a (file id, size) pair describing the file
        """
        file_id, size = file
        path = get_file_name(self.path, reduce_idx, file_id)
        entry = (nick, path, size)
        self.reduce_files[reduce_idx] = entry

    def print_results(self):
        """
        Log the output file recorded for each reducer, at most once.

        Nothing is logged until there are no marked reducers left, no dead
        reduce entries and the application status is in the merge phase.
        Slots of reduce_files that do not hold a (nick, fname, fsize) tuple
        are skipped.
        """
        if self.results_printed:
            return

        if not self.reduce_mark and not self.dead_reduce_dict and \
           self.status.phase == self.status.PHASE_MERGE:

            self.results_printed = True

            for entry in self.reduce_files:
                # Slots never filled by retrieve_file still hold a placeholder
                # string. Unpacking it would split it into single characters
                # and make the "%d" conversion below raise TypeError, so
                # those slots are skipped.
                if not isinstance(entry, tuple):
                    continue

                nick, fname, fsize = entry
                self.info("Group %s produced %s [%d bytes] output file" % \
                          (nick, fname, fsize))

    def on_group_died(self, nick, is_error):
        """
        Handle the disconnection of a master from the server.

        The master is flagged as dead, its pending map works are pushed back
        on the work queue and its reduce lists, if any, are moved to the
        dead reduce dictionary before the master is forgotten.

        @param nick the nick of the master that died
        @param is_error a boolean indicating whether this was an abnormal
                        error or the socket was safely shut down.
        """
        # NB: Possibly we can restart the master through a bash script or
        # provide to the final user an overridable method in order to manage
        # the situation and apply different policies.

        app_status = self.status
        app_status.update_master_status(nick, {'status': 'dead'})
        app_status.faults += 1

        # Drop the pending map activity of the master and reschedule it
        orphaned_works = self.pending_works[nick]
        del self.pending_works[nick]

        for work in orphaned_works:
            app_status.map_faulted += 1
            self.work_queue.push(work.state)

        # Set aside the pending reduce activity of the master
        reducers = self.reduce_dict[nick]

        if reducers: # This might be None
            self.dead_reduce_dict[nick] = reducers

            # Only reducers with a non-empty list count as faulted
            for reducer_files in reducers:
                if not reducer_files:
                    continue
                app_status.reduce_faulted += 1

        del self.reduce_dict[nick]
        del self.masters[nick]

    def hearthbeat(self):
        """
        This method is executed in an external thread, namely the heartbeat
        thread. The aim of the code is to periodically ping all the masters.

        Each round marks every known master with a PING_EXECUTE timestamp
        entry, waits for the ping interval and then warns about every master
        whose recorded round-trip time exceeds the configured maximum. The
        loop ends as soon as the finished event is set.
        """
        while not self.finished.is_set():
            with self.lock:
                # Iterate over a snapshot: masters can be removed from the
                # dictionary by other code while this thread is running.
                for nick in list(self.masters):
                    self.timestamps[nick] = (PING_EXECUTE, 0)

            # Wait on the event rather than sleeping so that stop() wakes the
            # thread up immediately instead of after a full ping interval.
            if self.finished.wait(self.ping_interval):
                break

            # Here we do not do anything if a given master overflows the
            # specified limit but just warn the user about the violation.

            with self.lock:
                for nick in list(self.masters):
                    status, rtt = self.timestamps.get(nick, (None, None))

                    if status is not None and rtt > self.ping_max:
                        self.warning(
                            "RTT for %s is above the limit (%.2f > %.2f)" % \
                            (nick, rtt, self.ping_max)
                        )