def _findRemoteProcessIDs(self, processes):
    """Resolve the remote PID of every launched process in *processes*.

    Polls each remote server with ``ps`` (up to 3 attempts, 1s apart) and
    matches the exact command line (``process.tf_command``).  On success
    each process gets a ``remote_pid`` attribute; processes that were never
    found get ``remote_pid = -1``.  A summary table is printed either way.

    :param processes: launched process objects (each carries ``name``,
        ``server``, ``tf_command``, and display fields used by the table).
    :return: True if all remote PIDs were found, False otherwise.
    """
    remote_process_ids = {}

    def parser(line, find_process):
        # Output line of the remote `ps` is "<pid> <args...>"; keep the pid.
        debug(line)
        key = find_process.name
        remote_process_ids[key] = int(line.split()[0])

    res = True
    num_attempts = 0
    max_num_attempts = 3
    # Keep polling until every process has been located, or we give up.
    while len(remote_process_ids) < len(processes):
        find_processes = []
        for process in processes:
            if process.name in remote_process_ids:
                continue  # already resolved on a previous attempt
            # Anchored grep: match whole `ps` lines whose args equal tf_command.
            find_process = executeRemoteCommand([process.server], "ps --no-headers -o\"pid,args\" | grep -e '^ *[0-9]\+ %s$'" % process.tf_command)[0]
            find_process.name = process.name
            find_processes.append(find_process)
        waitForProcesses(find_processes, wait_timeout=5, on_output=parser)
        time.sleep(1)
        num_attempts += 1
        if num_attempts == max_num_attempts:
            error("Failed to find remote process IDs. Most likely some processes failed to run.")
            res = False
            break

    # Print a summary table of all processes (found or not).
    table = FormattedTable()
    table.border_style = UniBorder.BORDER_STYLE_SINGLE
    table.addColumn(FormattedTable.Column("IP"))
    table.addColumn(FormattedTable.Column("Job"))
    table.addColumn(FormattedTable.Column("#"))
    table.addColumn(FormattedTable.Column("PID"))
    table.addColumn(FormattedTable.Column("RPID"))
    table.addColumn(FormattedTable.Column("Flags"))
    table.addColumn(FormattedTable.Column("Command"))
    for process in processes:
        if process.name in remote_process_ids:
            process.remote_pid = remote_process_ids[process.name]
        else:
            process.remote_pid = -1  # sentinel: remote PID was never found
        # NOTE(review): rows read `process.server_info.ip` while the lookup
        # above used `process.server` — presumably both describe the same
        # host; confirm against the process factory.
        table.addRow([process.server_info.ip, process.job_name, process.task_id, process.instance.pid, process.remote_pid, process.tf_flags, process.tf_command])
    table.printFormatted(LogWriter(None, LOG_LEVEL_NOTE))
    return res
def _runCommand(self, cmd, servers, wait_timeout, on_output, on_process_start, on_process_done, factory):
    """Run *cmd* locally (when *servers* is None) or on every remote server,
    then block until all launched processes finish.

    :return: the result of waitForProcesses() over the launched processes.
    """
    if servers is None:
        procs = [executeCommand(cmd, factory=factory)]
    else:
        procs = list(executeRemoteCommand(servers, cmd, factory=factory))
    return waitForProcesses(procs,
                            wait_timeout=wait_timeout,
                            on_output=on_output,
                            on_process_start=on_process_start,
                            on_process_done=on_process_done)
def _getDevices(self, ips):
    """Discover the network link and RDMA device behind each IP in *ips*.

    Phase 1 asks each server for the interface name that carries the IP
    (``ip -o a s``); phase 2 maps that interface to an IB device/port via
    ``ibdev2netdev``.  Results are stored in ``self._devices`` keyed by
    server, as RemoteDeviceInfo (name, port, is_up).

    :param ips: iterable of IP addresses to resolve.
    :raises: the first exception captured by a failed remote process, or a
        generic Exception when the failure carries no exception object.
    """
    links = []
    self._devices = {}

    def linkNameParser(line, process):
        # One output line per server: the interface name owning the IP.
        debug(line)
        links.append(line)

    def deviceNameAndPortParser(line, process):
        # ibdev2netdev output: "<ibdev> port <N> ==> <netdev> (Up|Down)"
        debug(line)
        parts = line.split()
        res = RemoteDeviceInfo()
        res.name = parts[0]
        res.port = int(parts[2])
        res.is_up = parts[5] == "(Up)"
        self._devices[process.server] = res

    def waitOrRaise(procs, on_output):
        # Shared failure path: re-raise the first captured remote exception,
        # or fail loudly when no process recorded one.
        if not waitForProcesses(procs, wait_timeout=5, on_output=on_output):
            for proc in procs:
                if proc.exception is not None:
                    raise proc.exception
            raise Exception("Internal Error")

    # Phase 1: interface name per IP.
    procs = []
    for ip in ips:
        server = TestEnvironment.Get().getServer(ip)
        procs.extend(executeRemoteCommand([server], "ip -o a s | grep %s | cut -d ' ' -f 2 | cut -d'.' -f1" % ip))
    waitOrRaise(procs, linkNameParser)

    # Phase 2: IB device/port per interface (links[i] pairs with ips order).
    procs = []
    for ip in ips:
        server = TestEnvironment.Get().getServer(ip)
        i = len(procs)
        link = links[i]
        procs.extend(executeRemoteCommand([server], "ibdev2netdev | grep %s" % link))
    waitOrRaise(procs, deviceNameAndPortParser)
def perform(self, index): Step.perform(self, index) ########## # Build: # ########## title("Building:", UniBorder.BORDER_STYLE_SINGLE) config_cuda = "--config=cuda" if self.config_cuda else "" if self.additional_flags == [""]: additional_flags = "" else: additional_flags = "--copt \"%s\"" % " ".join( self.additional_flags) cmd = "cd %s; rm -rf tensorflow_pkg; bazel build -c opt %s %s //tensorflow/tools/pip_package:build_pip_package" % ( self.tensorflow_home, config_cuda, additional_flags) res = self.runSeperate(cmd, title="Build %s" % self.tensorflow_home, log_file_path=os.path.join( self._logs_dir, "build.log"), wait_timeout=3600) if not res: return False cmd = "cd %s; bazel-bin/tensorflow/tools/pip_package/build_pip_package tensorflow_pkg" % ( self.tensorflow_home) res = self.runInline(cmd, wait_timeout=60) if not res: return False ############ # Install: # ############ servers = TestEnvironment.Get().getServers(self.install_servers) title("Installing:", UniBorder.BORDER_STYLE_SINGLE) src_dir = os.path.join(self.tensorflow_home, "tensorflow_pkg") temp_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next( tempfile._get_candidate_names()) temp_dir = os.path.join(tempfile._get_default_tempdir(), temp_dir_name) res = self.runSCP(servers, [src_dir], temp_dir, wait_timeout=10) if not res: return False cmd = "pip install --user --upgrade %s/tensorflow-*" % temp_dir process_title = lambda process: "Installing on %s..." % process.server log_file_path = lambda process: os.path.join( self._logs_dir, "install_%s.log" % re.sub("[^0-9a-zA-Z]", "_", process.server)) res = self.runSeperate(cmd, title=process_title, servers=servers, log_file_path=log_file_path) if not res: return False ########## # Clean: # ########## title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE) processes = executeRemoteCommand(servers, "rm -rf %s" % temp_dir) res = waitForProcesses(processes, wait_timeout=10) if not res: return False return True
def perform(self, index):
    """Run the tf_cnn_benchmarks job across the configured cluster.

    Kills stray instances, builds the PS/worker cluster spec, copies the
    benchmark scripts to every server, launches the jobs for the selected
    mode, monitors them, appends performance results, and cleans up.

    :param index: step index, forwarded to Step.perform().
    :return: True on success; False on failure or user-requested stop.
    """
    Step.perform(self, index)
    log("<img src='%s' width=600 style='border:1px solid black'/>" % pkg_resources.resource_filename("mltester", "images/tensorflow.jpg"))  # https://www.skylinelabs.in/blog/images/tensorflow.jpg?width=500'/>")
    for attr in self._attributes:
        log(" + %s: %s" % (attr.desc.display_name, str(attr.val)))
    self._perf = TFPerformanceMeasurements()
    self._processes = []
    self._stopping = False
    self._servers = {}
    # NOTE(review): private tempfile APIs used to mint a unique remote dir
    # name — presumably only uniqueness matters; confirm.
    work_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next(tempfile._get_candidate_names())
    work_dir = os.path.join(tempfile._get_default_tempdir(), work_dir_name)
    script_dir = os.path.dirname(self.script)
    user = getuser()
    self._work_dir = work_dir
    ips = self._getIPs()
    servers = TestEnvironment.Get().getServers(ips)
    #########################
    # Kill other instances: #
    #########################
    apps_to_kill = ["tf_cnn_benchmarks.py", "ml_monitor"]
    for app in apps_to_kill:
        # Kill leftover runs of `app`, excluding our own work dir's run.
        # NOTE(review): the sed pattern is filled with `user` — presumably
        # extracting the PID column of `ps -f` lines owned by us; verify.
        kill_cmd = "ps -f | grep %s | grep -v grep | grep -v %s | sed -e 's@%s *\\([0-9]\\+\\) .*@\\1@g' | xargs kill -9" % (app, work_dir, user)
        res = self.runInline(kill_cmd, servers, wait_timeout = 5)
        if not res:
            return False
    if self._stop:
        return False
    ##################
    # Build cluster: #
    ##################
    # Assign consecutive ports, PS hosts first, then workers.
    port = self.base_port
    self._cluster_ps = []
    self._cluster_workers = []
    for ip in self.ps:
        self._cluster_ps.append("%s:%u" % (ip, port))
        port += 1
    for ip in self.workers:
        self._cluster_workers.append("%s:%u" % (ip, port))
        port += 1
    #########
    # Copy: #
    #########
    title("Copying scripts:", UniBorder.BORDER_STYLE_SINGLE)
    if not self.runSCP(servers, [script_dir], work_dir, wait_timeout = 10):  # Also create it
        return False
    if self._stop:
        return False
    ########
    # Run: #
    ########
    self._openPerformanceFile()
    self._getDevices(ips)
    title("Running:", UniBorder.BORDER_STYLE_SINGLE)
    processes = []
    if self.mode == TFCnnBenchmarksStep.MODE_PARAMETER_SERVER:
        # One PS job per configured parameter-server IP.
        for i in range(len(self.ps)):
            ip = self.ps[i]
            process = self._runJob(work_dir, ip, "ps", i)
            processes.append(process)
    elif self.mode == TFCnnBenchmarksStep.MODE_DISTRIBUTED_ALL_REDUCE:
        # NOTE(review): `ip` is not assigned on this branch (the PS loop
        # above did not run) — this raises NameError at runtime. Likely
        # intended to be the controller's IP (self.ps[0]?); confirm and fix.
        process = self._runJob(work_dir, ip, "controller", 0)
        processes.append(process)
    ################
    # Run workers: #
    ################
    if self.mode == TFCnnBenchmarksStep.MODE_LOCAL:
        process = self._runJob(work_dir, self.workers[0], "worker", 0)
        processes.append(process)
    else:
        for i in range(len(self.workers)):
            ip = self.workers[i]
            process = self._runJob(work_dir, ip, "worker", i)
            processes.append(process)
            # Stagger worker launches slightly.
            time.sleep(0.5)
    res = self._findRemoteProcessIDs(processes)
    if not res or self._stop:
        return False
    for server in self._servers.values():
        if not self._initServerMonitors(server):
            return False
    res = waitForProcesses(processes, wait_timeout=600, on_output=self._onOut, on_process_start=self._onJobStart, on_process_done=self._onJobDone)
    # Stop monitors even if the run failed; fold their status into res.
    for server in self._servers.values():
        res = res and self._stopServerMonitors(server)
    if not res or self._stop:
        return False
    self._appendToPerformanceFile()
    ############
    # Cleanup: #
    ############
    title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
    # Pull graph.txt and the JSON traces from every server into the logs dir.
    sources = ["%s:%s %s:%s" % (server, os.path.join(self._work_dir, "graph.txt"), server, os.path.join(self._work_dir, "*.json")) for server in servers]
    dst = self._logs_dir
    cmd = "scp %s %s" % (" ".join(sources), dst)
    self.runInline(cmd)
    processes = executeRemoteCommand(servers, "rm -rf %s" % work_dir)
    waitForProcesses(processes, wait_timeout=10)
    return True
def runSCP(self, servers, sources, remote_dir, wait_timeout = None):
    """Copy *sources* to *remote_dir* on each server via SCP. Always inline."""
    copy_procs = copyToRemote(servers, sources, remote_dir)
    return waitForProcesses(copy_procs,
                            wait_timeout = wait_timeout,
                            on_output = Step.logToMainProcess)