Exemple #1
0
    def _findRemoteProcessIDs(self, processes):
        """Look up the remote PID of every process and print a summary table.

        For each process a ``ps ... | grep`` command is run on
        ``process.server`` looking for a line whose command matches
        ``process.tf_command``; the first column of a matching line is taken
        as the remote PID. Processes that are still missing are looked up
        again, for at most ``max_num_attempts`` lookups in total.

        Every process gets a ``remote_pid`` attribute (-1 when no PID was
        found), and one table row is logged per process.

        Args:
            processes: Launched process objects. The code reads their
                ``name``, ``server``, ``tf_command``, ``tf_flags``,
                ``job_name``, ``task_id``, ``server_info.ip`` and
                ``instance.pid`` attributes.

        Returns:
            True if a remote PID was found for every process, False otherwise.
        """
        remote_process_ids = {}

        def parser(line, process):
            debug(line)
            # The first whitespace-separated column of the ps line is the PID.
            remote_process_ids[process.name] = int(line.split()[0])

        res = True
        num_attempts = 0
        max_num_attempts = 3
        while len(remote_process_ids) < len(processes):
            # Give up only when PIDs are still missing after the last allowed
            # attempt; a lookup that succeeds on the final attempt is a success.
            if num_attempts == max_num_attempts:
                error("Failed to find remote process IDs. Most likely some processes failed to run.")
                res = False
                break
            num_attempts += 1

            find_processes = []
            for process in processes:
                if process.name in remote_process_ids:
                    continue
                find_process = executeRemoteCommand([process.server], "ps --no-headers -o\"pid,args\" | grep -e '^ *[0-9]\\+ %s$'" % process.tf_command)[0]
                # Tag the lookup with the name of the process it searches for,
                # so the output parser knows which entry to fill in.
                find_process.name = process.name
                find_processes.append(find_process)

            waitForProcesses(find_processes, wait_timeout=5, on_output=parser)
            if len(remote_process_ids) < len(processes):
                # Some processes are not visible yet; pause before retrying.
                time.sleep(1)

        table = FormattedTable()
        table.border_style = UniBorder.BORDER_STYLE_SINGLE
        for column_title in ("IP", "Job", "#", "PID", "RPID", "Flags", "Command"):
            table.addColumn(FormattedTable.Column(column_title))
        for process in processes:
            # -1 marks a process whose remote PID could not be determined.
            process.remote_pid = remote_process_ids.get(process.name, -1)
            table.addRow([process.server_info.ip, process.job_name, process.task_id, process.instance.pid, process.remote_pid, process.tf_flags, process.tf_command])
        table.printFormatted(LogWriter(None, LOG_LEVEL_NOTE))
        return res
Exemple #2
0
 def _runCommand(self, cmd, servers, wait_timeout, on_output, on_process_start, on_process_done, factory):
     """Launch ``cmd`` and wait for it to finish.

     When ``servers`` is None the command is started once through
     ``executeCommand``; otherwise it is started on ``servers`` through
     ``executeRemoteCommand``. The callbacks and the timeout are handed
     unchanged to ``waitForProcesses``, whose result is returned.
     """
     if servers is None:
         launched = [executeCommand(cmd, factory=factory)]
     else:
         launched = list(executeRemoteCommand(servers, cmd, factory=factory))
     wait_options = dict(wait_timeout=wait_timeout,
                         on_output=on_output,
                         on_process_start=on_process_start,
                         on_process_done=on_process_done)
     return waitForProcesses(launched, **wait_options)
Exemple #3
0
    def _getDevices(self, ips):
        """Find the device behind each IP and store it in ``self._devices``.

        Two rounds of remote commands are run:

        1. ``ip -o a s | grep <ip> | ...`` on the server that owns each IP, to
           get the name of the network link carrying that IP.
        2. ``ibdev2netdev | grep <link>`` on the same server. Each output
           line is split on whitespace: field 0 is stored as the device name,
           field 2 as the port, and field 5 is compared with ``"(Up)"``.

        ``self._devices`` is reset and then filled with one
        ``RemoteDeviceInfo`` per ``process.server``.

        Args:
            ips: IP addresses to resolve.

        Raises:
            Exception: the exception recorded on a failed process, or a
                generic error when a command fails without one or when no
                link name was reported for an IP.
        """
        # Link name per IP. Keyed by IP rather than by arrival order, so a
        # command that prints nothing, prints several lines, or finishes out
        # of order cannot pair a link with the wrong server.
        links = {}
        self._devices = {}

        def linkNameParser(line, process):
            debug(line)
            # Keep the first line reported for each IP.
            links.setdefault(process.link_ip, line)

        def deviceNameAndPortParser(line, process):
            debug(line)
            parts = line.split()
            res = RemoteDeviceInfo()
            res.name = parts[0]
            res.port = int(parts[2])
            res.is_up = parts[5] == "(Up)"
            self._devices[process.server] = res

        def waitOrRaise(procs, parser):
            # Wait for procs; on failure re-raise the first recorded exception.
            if waitForProcesses(procs, wait_timeout=5, on_output=parser):
                return
            for proc in procs:
                if proc.exception is not None:
                    raise proc.exception
            raise Exception("Internal Error")

        procs = []
        for ip in ips:
            server = TestEnvironment.Get().getServer(ip)
            for proc in executeRemoteCommand([server], "ip -o a s | grep %s | cut -d ' ' -f 2 | cut -d'.' -f1" % ip):
                # Tag the process with the IP it resolves, the same way
                # _findRemoteProcessIDs tags its lookups with a name.
                proc.link_ip = ip
                procs.append(proc)
        waitOrRaise(procs, linkNameParser)

        procs = []
        for ip in ips:
            if ip not in links:
                raise Exception("Failed to find the network link of %s" % ip)
            server = TestEnvironment.Get().getServer(ip)
            procs.extend(executeRemoteCommand([server], "ibdev2netdev | grep %s" % links[ip]))
        waitOrRaise(procs, deviceNameAndPortParser)
Exemple #4
0
    def perform(self, index):
        """Build the TensorFlow pip package, install it on the servers, clean up.

        The step runs in three phases and stops at the first failure:

        * Build: run ``bazel build`` inside ``self.tensorflow_home`` (logged
          to ``build.log`` in the logs directory), then run
          ``build_pip_package`` to produce ``tensorflow_pkg``.
        * Install: copy ``tensorflow_pkg`` to a freshly named temporary
          directory on every install server and ``pip install`` from it,
          with one log file per server.
        * Clean: remove the temporary directory from the servers.

        Returns:
            True when every phase succeeded, False otherwise.
        """
        Step.perform(self, index)

        # ---- Build ----
        title("Building:", UniBorder.BORDER_STYLE_SINGLE)
        cuda_option = "--config=cuda" if self.config_cuda else ""
        # A single empty string means that no extra compiler flags were given.
        copt_option = "" if self.additional_flags == [""] else (
            "--copt \"%s\"" % " ".join(self.additional_flags))
        build_cmd = "cd %s; rm -rf tensorflow_pkg; bazel build -c opt %s %s //tensorflow/tools/pip_package:build_pip_package" % (
            self.tensorflow_home, cuda_option, copt_option)
        build_title = "Build %s" % self.tensorflow_home
        build_log = os.path.join(self._logs_dir, "build.log")
        if not self.runSeperate(build_cmd,
                                title=build_title,
                                log_file_path=build_log,
                                wait_timeout=3600):
            return False

        package_cmd = "cd %s; bazel-bin/tensorflow/tools/pip_package/build_pip_package tensorflow_pkg" % (
            self.tensorflow_home)
        if not self.runInline(package_cmd, wait_timeout=60):
            return False

        # ---- Install ----
        servers = TestEnvironment.Get().getServers(self.install_servers)
        title("Installing:", UniBorder.BORDER_STYLE_SINGLE)
        package_dir = os.path.join(self.tensorflow_home, "tensorflow_pkg")
        # Two candidate names are concatenated to form the directory name.
        name_head = next(tempfile._get_candidate_names())
        name_tail = next(tempfile._get_candidate_names())
        temp_dir = os.path.join(tempfile._get_default_tempdir(),
                                "tmp." + name_head + name_tail)
        if not self.runSCP(servers, [package_dir], temp_dir, wait_timeout=10):
            return False

        install_cmd = "pip install --user --upgrade %s/tensorflow-*" % temp_dir

        def process_title(process):
            return "Installing on %s..." % process.server

        def log_file_path(process):
            # Replace every non-alphanumeric character of the server name so
            # it can be used as part of a file name.
            safe_name = re.sub("[^0-9a-zA-Z]", "_", process.server)
            return os.path.join(self._logs_dir, "install_%s.log" % safe_name)

        if not self.runSeperate(install_cmd,
                                title=process_title,
                                servers=servers,
                                log_file_path=log_file_path):
            return False

        # ---- Clean ----
        title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
        removers = executeRemoteCommand(servers, "rm -rf %s" % temp_dir)
        return True if waitForProcesses(removers, wait_timeout=10) else False
Exemple #5
0
    def perform(self, index):
        """Run the benchmark script on the configured servers.

        Phases, in order (the step returns False as soon as one fails or
        ``self._stop`` is set):

        1. Log the step attributes and reset the per-run state.
        2. Kill leftover ``tf_cnn_benchmarks.py`` / ``ml_monitor`` instances
           on the servers.
        3. Assign one port per parameter server and worker, counting up from
           ``self.base_port``.
        4. Copy the script directory to a fresh work directory on every
           server.
        5. Start the jobs (parameter servers or controller, then workers)
           according to ``self.mode``, resolve their remote PIDs, start the
           server monitors and wait for the jobs to finish.
        6. Stop the server monitors, append the results to the performance
           file, fetch ``graph.txt`` / ``*.json`` into the logs directory and
           remove the work directory.

        Returns:
            True when the run completed successfully, False otherwise.
        """
        Step.perform(self, index)
        log("<img src='%s' width=600 style='border:1px solid black'/>" % pkg_resources.resource_filename("mltester", "images/tensorflow.jpg")) #https://www.skylinelabs.in/blog/images/tensorflow.jpg?width=500'/>")
        for attr in self._attributes:
            log(" + %s: %s" % (attr.desc.display_name, str(attr.val)))
        self._perf = TFPerformanceMeasurements()
        self._processes = []
        self._stopping = False
        self._servers = {}
        work_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next(tempfile._get_candidate_names())
        work_dir = os.path.join(tempfile._get_default_tempdir(), work_dir_name)
        script_dir = os.path.dirname(self.script)

        user = getuser()
        self._work_dir = work_dir
        ips = self._getIPs()
        servers = TestEnvironment.Get().getServers(ips)

        #########################
        # Kill other instances: #
        #########################
        apps_to_kill = ["tf_cnn_benchmarks.py", "ml_monitor"]
        for app in apps_to_kill:
            # Lines mentioning this run's work_dir are filtered out, and the
            # sed expression keeps only the number that follows the user name.
            kill_cmd = "ps -f | grep %s | grep -v grep | grep -v %s | sed -e 's@%s *\\([0-9]\\+\\) .*@\\1@g' | xargs kill -9" % (app, work_dir, user)
            res = self.runInline(kill_cmd, servers, wait_timeout = 5)
            if not res:
                return False
        if self._stop:
            return False

        ##################
        # Build cluster: #
        ##################
        port = self.base_port
        self._cluster_ps = []
        self._cluster_workers = []
        for ip in self.ps:
            self._cluster_ps.append("%s:%u" % (ip, port))
            port += 1
        for ip in self.workers:
            self._cluster_workers.append("%s:%u" % (ip, port))
            port += 1

        #########
        # Copy: #
        #########
        title("Copying scripts:", UniBorder.BORDER_STYLE_SINGLE)
        if not self.runSCP(servers, [script_dir], work_dir, wait_timeout = 10): # Also create it
            return False
        if self._stop:
            return False

        ########
        # Run: #
        ########
        self._openPerformanceFile()
        self._getDevices(ips)

        title("Running:", UniBorder.BORDER_STYLE_SINGLE)
        processes = []
        if self.mode == TFCnnBenchmarksStep.MODE_PARAMETER_SERVER:
            for i, ip in enumerate(self.ps):
                processes.append(self._runJob(work_dir, ip, "ps", i))
        elif self.mode == TFCnnBenchmarksStep.MODE_DISTRIBUTED_ALL_REDUCE:
            # NOTE(review): `ip` is not set in this branch; it holds whatever
            # the cluster-building loops above left behind (the last worker,
            # or the last PS when there are no workers). This looks
            # accidental -- confirm which host should run the controller.
            process = self._runJob(work_dir, ip, "controller", 0)
            processes.append(process)
        ################
        # Run workers: #
        ################
        if self.mode == TFCnnBenchmarksStep.MODE_LOCAL:
            process = self._runJob(work_dir, self.workers[0], "worker", 0)
            processes.append(process)
        else:
            for i, ip in enumerate(self.workers):
                processes.append(self._runJob(work_dir, ip, "worker", i))

        time.sleep(0.5)
        res = self._findRemoteProcessIDs(processes)
        if not res or self._stop:
            return False

        for server in self._servers.values():
            if not self._initServerMonitors(server):
                return False

        res = waitForProcesses(processes,
                               wait_timeout=600,
                               on_output=self._onOut,
                               on_process_start=self._onJobStart,
                               on_process_done=self._onJobDone)

        for server in self._servers.values():
            # Stop every monitor unconditionally. Folding the call into
            # `res and ...` would skip it once the run has failed or an
            # earlier stop has failed, leaving monitors running.
            stopped = self._stopServerMonitors(server)
            res = res and stopped

        if not res or self._stop:
            return False

        self._appendToPerformanceFile()

        ############
        # Cleanup: #
        ############
        title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
        sources = ["%s:%s %s:%s" % (server, os.path.join(self._work_dir, "graph.txt"), server, os.path.join(self._work_dir, "*.json")) for server in servers]
        dst = self._logs_dir
        cmd = "scp %s %s" % (" ".join(sources), dst)
        # Fetching the artifacts and removing the work directory are best
        # effort: their results do not affect the step's outcome.
        self.runInline(cmd)
        processes = executeRemoteCommand(servers, "rm -rf %s" % work_dir)
        waitForProcesses(processes, wait_timeout=10)
        return True
Exemple #6
0
 def runSCP(self, servers, sources, remote_dir, wait_timeout = None):
     """Run SCP (always inline) and wait for the transfers to finish.

     ``copyToRemote`` is called with ``servers``, ``sources`` and
     ``remote_dir``; the output of the resulting processes is passed to
     ``Step.logToMainProcess``. Returns the result of ``waitForProcesses``.
     """
     transfers = copyToRemote(servers, sources, remote_dir)
     wait_options = dict(wait_timeout = wait_timeout,
                         on_output = Step.logToMainProcess)
     return waitForProcesses(transfers, **wait_options)