def _runCommand(self, cmd, servers, wait_timeout, on_output, on_process_start, on_process_done, factory):
    """Run *cmd* locally (when servers is None) or on every server in
    *servers*, then block until completion.

    Returns the result of waitForProcesses (truthy on success).
    """
    if servers is None:
        launched = [executeCommand(cmd, factory=factory)]
    else:
        launched = list(executeRemoteCommand(servers, cmd, factory=factory))
    return waitForProcesses(launched,
                            wait_timeout=wait_timeout,
                            on_output=on_output,
                            on_process_start=on_process_start,
                            on_process_done=on_process_done)
def _getDevices(self, ips):
    """Discover the RDMA device behind each IP and cache the results.

    Two remote-probe passes: (1) resolve each IP to its network link name,
    (2) resolve each link name to its IB device name/port/state via
    ibdev2netdev. Populates self._devices keyed by process server.
    Raises the first remote exception (or a generic Exception) on failure.
    """
    links = []
    self._devices = {}

    def linkNameParser(line, process):
        debug(line)
        links.append(line)

    def deviceNameAndPortParser(line, process):
        debug(line)
        # Expected "ibdev2netdev" line shape: "<dev> port <N> ==> <link> (Up)"
        parts = line.split()
        res = RemoteDeviceInfo()
        res.name = parts[0]
        res.port = int(parts[2])
        res.is_up = parts[5] == "(Up)"
        self._devices[process.server] = res

    def waitOrRaise(procs, parser):
        # Shared failure path (was duplicated): surface the first remote
        # exception if any process recorded one, otherwise fail loudly.
        if waitForProcesses(procs, wait_timeout=5, on_output=parser):
            return
        for proc in procs:
            if proc.exception is not None:
                raise proc.exception
        raise Exception("Internal Error")

    # Pass 1: IP -> link name.
    # NOTE(review): grep treats '.' in the IP as a regex wildcard, so
    # overlapping addresses could mis-match — confirm the IPs used here
    # are distinct enough for this probe.
    procs = []
    for ip in ips:
        server = TestEnvironment.Get().getServer(ip)
        procs.extend(executeRemoteCommand([server], "ip -o a s | grep %s | cut -d ' ' -f 2 | cut -d'.' -f1" % ip))
    waitOrRaise(procs, linkNameParser)

    # Pass 2: link name -> IB device name and port.
    procs = []
    for ip in ips:
        server = TestEnvironment.Get().getServer(ip)
        # links[] is indexed by position: one entry per pass-1 process.
        # NOTE(review): this assumes exactly one output line per process,
        # delivered in process-creation order — verify waitForProcesses
        # preserves that ordering.
        link = links[len(procs)]
        procs.extend(executeRemoteCommand([server], "ibdev2netdev | grep %s" % link))
    waitOrRaise(procs, deviceNameAndPortParser)
def _findRemoteProcessIDs(self, processes):
    """Resolve the remote (server-side) PID of every launched process.

    Greps each server's process list for the exact tf_command, retrying
    up to three times (1 s apart). Afterwards logs a summary table;
    processes whose PID could not be found get remote_pid = -1.

    Returns:
        True when every PID was resolved, False otherwise.
    """
    remote_process_ids = {}

    def parser(line, find_process):
        debug(line)
        # First column of the matching "ps" line is the PID.
        remote_process_ids[find_process.name] = int(line.split()[0])

    # Fix: the grep pattern contains "\+", which is an invalid escape
    # sequence in a normal string literal (SyntaxWarning on modern
    # Python) — the pattern part is now a raw string, concatenated with
    # the quoted prefix (same runtime bytes as before).
    ps_grep_cmd = 'ps --no-headers -o"pid,args" | grep -e ' r"'^ *[0-9]\+ %s$'"

    res = True
    num_attempts = 0
    max_num_attempts = 3
    while len(remote_process_ids) < len(processes):
        find_processes = []
        for process in processes:
            if process.name in remote_process_ids:
                continue  # Already resolved on a previous attempt.
            find_process = executeRemoteCommand([process.server], ps_grep_cmd % process.tf_command)[0]
            find_process.name = process.name
            find_processes.append(find_process)
        waitForProcesses(find_processes, wait_timeout=5, on_output=parser)
        time.sleep(1)
        num_attempts += 1
        if num_attempts == max_num_attempts:
            error("Failed to find remote process IDs. Most likely some processes failed to run.")
            res = False
            break

    # Summary table — printed even on failure (RPID shows -1 then).
    table = FormattedTable()
    table.border_style = UniBorder.BORDER_STYLE_SINGLE
    table.addColumn(FormattedTable.Column("IP"))
    table.addColumn(FormattedTable.Column("Job"))
    table.addColumn(FormattedTable.Column("#"))
    table.addColumn(FormattedTable.Column("PID"))
    table.addColumn(FormattedTable.Column("RPID"))
    table.addColumn(FormattedTable.Column("Flags"))
    table.addColumn(FormattedTable.Column("Command"))
    for process in processes:
        process.remote_pid = remote_process_ids.get(process.name, -1)
        table.addRow([process.server_info.ip, process.job_name, process.task_id,
                      process.instance.pid, process.remote_pid,
                      process.tf_flags, process.tf_command])
    table.printFormatted(LogWriter(None, LOG_LEVEL_NOTE))
    return res
def perform(self, index):
    """Build the TensorFlow pip package, install it on the configured
    servers, and remove the temporary copies.

    Returns False as soon as any stage fails, True when all succeed.
    """
    Step.perform(self, index)

    # ---- Build ----
    title("Building:", UniBorder.BORDER_STYLE_SINGLE)
    config_cuda = "--config=cuda" if self.config_cuda else ""
    if self.additional_flags == [""]:
        additional_flags = ""
    else:
        additional_flags = "--copt \"%s\"" % " ".join(self.additional_flags)
    build_cmd = "cd %s; rm -rf tensorflow_pkg; bazel build -c opt %s %s //tensorflow/tools/pip_package:build_pip_package" % (
        self.tensorflow_home, config_cuda, additional_flags)
    if not self.runSeperate(build_cmd,
                            title="Build %s" % self.tensorflow_home,
                            log_file_path=os.path.join(self._logs_dir, "build.log"),
                            wait_timeout=3600):
        return False
    package_cmd = "cd %s; bazel-bin/tensorflow/tools/pip_package/build_pip_package tensorflow_pkg" % (self.tensorflow_home)
    if not self.runInline(package_cmd, wait_timeout=60):
        return False

    # ---- Install ----
    servers = TestEnvironment.Get().getServers(self.install_servers)
    title("Installing:", UniBorder.BORDER_STYLE_SINGLE)
    src_dir = os.path.join(self.tensorflow_home, "tensorflow_pkg")
    # Random remote staging-directory name; tempfile internals are used
    # for name generation only — nothing is created locally.
    temp_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next(tempfile._get_candidate_names())
    temp_dir = os.path.join(tempfile._get_default_tempdir(), temp_dir_name)
    if not self.runSCP(servers, [src_dir], temp_dir, wait_timeout=10):
        return False

    install_cmd = "pip install --user --upgrade %s/tensorflow-*" % temp_dir

    def process_title(process):
        return "Installing on %s..." % process.server

    def log_file_path(process):
        return os.path.join(self._logs_dir, "install_%s.log" % re.sub("[^0-9a-zA-Z]", "_", process.server))

    if not self.runSeperate(install_cmd, title=process_title, servers=servers, log_file_path=log_file_path):
        return False

    # ---- Clean ----
    title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
    cleanup_procs = executeRemoteCommand(servers, "rm -rf %s" % temp_dir)
    if not waitForProcesses(cleanup_procs, wait_timeout=10):
        return False
    return True
def perform(self, index):
    """Run a full tf_cnn_benchmarks session: kill stale instances, build
    the cluster spec, copy the scripts, launch ps/controller/worker jobs,
    monitor them, collect performance output, and clean up.

    Returns True on success, False on any failure or external stop.
    """
    Step.perform(self, index)
    log("<img src='%s' width=600 style='border:1px solid black'/>" % pkg_resources.resource_filename("mltester", "images/tensorflow.jpg")) #https://www.skylinelabs.in/blog/images/tensorflow.jpg?width=500'/>")
    # Echo every configured attribute into the log for traceability.
    for attr in self._attributes:
        log(" + %s: %s" % (attr.desc.display_name, str(attr.val)))
    self._perf = TFPerformanceMeasurements()
    self._processes = []
    self._stopping = False
    self._servers = {}
    # Random remote work-dir name (tempfile internals used for name
    # generation only; no local directory is created here).
    work_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next(tempfile._get_candidate_names())
    work_dir = os.path.join(tempfile._get_default_tempdir(), work_dir_name)
    script_dir = os.path.dirname(self.script)
    user = getuser()
    self._work_dir = work_dir
    ips = self._getIPs()
    servers = TestEnvironment.Get().getServers(ips)
    #########################
    # Kill other instances: #
    #########################
    apps_to_kill = ["tf_cnn_benchmarks.py", "ml_monitor"]
    for app in apps_to_kill:
        # sed extracts the PID column for this user's matching processes.
        # NOTE(review): "grep -v %s" filters on work_dir, which was only
        # just generated and so can never appear in existing process
        # lines — possibly the work_dir/user arguments are swapped here;
        # confirm against the intended ps output format.
        kill_cmd = "ps -f | grep %s | grep -v grep | grep -v %s | sed -e 's@%s *\\([0-9]\\+\\) .*@\\1@g' | xargs kill -9" % (app, work_dir, user)
        res = self.runInline(kill_cmd, servers, wait_timeout = 5)
        if not res:
            return False
    if self._stop:
        return False
    ##################
    # Build cluster: #
    ##################
    # Assign one sequential port per ps task, then per worker task.
    port = self.base_port
    self._cluster_ps = []
    self._cluster_workers = []
    for ip in self.ps:
        self._cluster_ps.append("%s:%u" % (ip, port))
        port += 1
    for ip in self.workers:
        self._cluster_workers.append("%s:%u" % (ip, port))
        port += 1
    #########
    # Copy: #
    #########
    title("Copying scripts:", UniBorder.BORDER_STYLE_SINGLE)
    if not self.runSCP(servers, [script_dir], work_dir, wait_timeout = 10): # Also create it
        return False
    if self._stop:
        return False
    ########
    # Run: #
    ########
    self._openPerformanceFile()
    self._getDevices(ips)
    title("Running:", UniBorder.BORDER_STYLE_SINGLE)
    processes = []
    if self.mode == TFCnnBenchmarksStep.MODE_PARAMETER_SERVER:
        # One ps job per parameter-server IP.
        for i in range(len(self.ps)):
            ip = self.ps[i]
            process = self._runJob(work_dir, ip, "ps", i)
            processes.append(process)
    elif self.mode == TFCnnBenchmarksStep.MODE_DISTRIBUTED_ALL_REDUCE:
        # NOTE(review): "ip" here is not assigned in this branch — it is
        # the leftover loop variable from the cluster-building loops
        # above (i.e. the last worker IP). This looks like a latent bug;
        # confirm which host the controller job is meant to run on.
        process = self._runJob(work_dir, ip, "controller", 0)
        processes.append(process)
    ################
    # Run workers: #
    ################
    if self.mode == TFCnnBenchmarksStep.MODE_LOCAL:
        process = self._runJob(work_dir, self.workers[0], "worker", 0)
        processes.append(process)
    else:
        for i in range(len(self.workers)):
            ip = self.workers[i]
            process = self._runJob(work_dir, ip, "worker", i)
            processes.append(process)
            # Stagger worker launches slightly.
            time.sleep(0.5)
    res = self._findRemoteProcessIDs(processes)
    if not res or self._stop:
        return False
    # Start per-server monitors before waiting on the jobs themselves.
    for server in self._servers.values():
        if not self._initServerMonitors(server):
            return False
    res = waitForProcesses(processes, wait_timeout=600, on_output=self._onOut, on_process_start=self._onJobStart, on_process_done=self._onJobDone)
    # Stop monitors regardless of job outcome; fold their status into res.
    for server in self._servers.values():
        res = res and self._stopServerMonitors(server)
    if not res or self._stop:
        return False
    self._appendToPerformanceFile()
    ############
    # Cleanup: #
    ############
    title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
    # Pull graph/trace artifacts back from every server, then remove the
    # remote work directories (cleanup results are best-effort).
    sources = ["%s:%s %s:%s" % (server, os.path.join(self._work_dir, "graph.txt"), server, os.path.join(self._work_dir, "*.json")) for server in servers]
    dst = self._logs_dir
    cmd = "scp %s %s" % (" ".join(sources), dst)
    self.runInline(cmd)
    processes = executeRemoteCommand(servers, "rm -rf %s" % work_dir)
    waitForProcesses(processes, wait_timeout=10)
    return True
def _runJob(self, work_dir, ip, job_name, task_id):
    """Launch one tf_cnn_benchmarks job ("ps", "worker" or "controller")
    on the server behind *ip*, and return the decorated process handle.

    Builds an environment-variable prefix (tf_flags) and the benchmark
    command line (tf_command), runs them remotely, and records the
    process on both self._processes and the server's process list.
    """
    hostname = TestEnvironment.Get().getServer(ip)
    server_info = self._getOrCreateServer(hostname, ip)
    device_info = self._devices[server_info.hostname]
    #####################
    # Build TF command: #
    #####################
    tf_flags = ""
    tf_command = ""
    ##################
    # Env variables: #
    ##################
    tf_flags += "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/gdrcopy"
    tf_flags += " TF_CPP_MIN_VLOG_LEVEL=%s" % self.log_level
    # RDMA tuning for the device discovered by _getDevices().
    tf_flags += " RDMA_DEVICE=%s" % device_info.name
    tf_flags += " RDMA_DEVICE_PORT=%u" % device_info.port
    tf_flags += " RDMA_GID_INDEX=3"
    tf_flags += " RDMA_PKEY=0"
    tf_flags += " RDMA_QUEUE_DEPTH=1024"
    tf_flags += " RDMA_TIMEOUT=10"
    tf_flags += " RDMA_RETRY_CNT=10"
    tf_flags += " RDMA_SL=1"
    tf_flags += " RDMA_MTU=512"
    tf_flags += " RDMA_TRAFFIC_CLASS=8"
    tf_flags += " UCX_NET_DEVICES=%s:%u" % (device_info.name, device_info.port)
    # ps/controller jobs (and gpu-less runs) must not grab any GPU.
    if (job_name in ["ps", "controller"]) or (self.num_gpus == 0):
        tf_flags += " CUDA_VISIBLE_DEVICES="
    ##############
    # UCX stuff: #
    ##############
    # Ucx should be compiled ./contrib/configure-devel --enable-debug
    if self.server_protocol == "grpc+ucx":
        tf_flags += " UCX_LOG_LEVEL=data"
        tf_flags += " UCX_TLS=rc_x,gdr_copy,cuda_copy"
    #export UCX_IB_ETH_PAUSE_ON=y
    #export UCX_LOG_LEVEL=trace
    ###############
    # GRPC Debug: #
    ###############
    #export GRPC_VERBOSITY=DEBUG
    #export GRPC_TRACE=api,call_combiner
    #export GRPC_TRACE=queue_pluck,flowctl,http1,http2_stream_state,http,op_failure
    #export GRPC_TRACE=client_channel,call_error,channel,server_channel,channel_stack_builder,connectivity_state #all
    ##############
    # Arguments: #
    ##############
    # tf_command += " gdb --args"
    tf_command += "python -u %s/tf_cnn_benchmarks.py" % self._work_dir
    if self.mode != TFCnnBenchmarksStep.MODE_LOCAL:
        tf_command += " --job_name=%s" % job_name
        tf_command += " --task_index=%u" % task_id
        tf_command += " --worker_hosts=%s" % ",".join(self._cluster_workers)
        if self.mode == TFCnnBenchmarksStep.MODE_PARAMETER_SERVER:
            tf_command += " --ps_hosts=%s" % ",".join(self._cluster_ps)
        elif self.mode == TFCnnBenchmarksStep.MODE_DISTRIBUTED_ALL_REDUCE:
            tf_command += " --variable_update=distributed_all_reduce"
            tf_command += " --all_reduce_spec=%s" % self.all_reduce_spec
    if job_name in ["worker", "controller"]:
        tf_command += " --model=%s" % self.model
        tf_command += " --batch_size=%s" % self.batch_size
        if self.data_dir != "":
            tf_command += " --data_dir=%s" % self.data_dir
        if self.num_gpus > 0:
            tf_command += " --num_gpus=%s --local_parameter_device=gpu" % self.num_gpus
        if self.trace_file:
            # NOTE(review): likely bug — no leading space, so this glues
            # "--trace_file=..." onto the previous flag; it also
            # duplicates the worker-only " --trace_file=..." appended
            # below. Confirm which of the two appends is intended.
            tf_command += "--trace_file=trace_%s_%u.json" % (job_name, task_id)
    if self.mode != TFCnnBenchmarksStep.MODE_LOCAL:
        tf_command += " --server_protocol=%s" % self.server_protocol
    if self.forward_only:
        tf_command += " --forward_only"
    if job_name == "worker":
        # Only the first worker dumps the graph file.
        if self.model_graph_file and (task_id == 0):
            tf_command += " --graph_file=%s" % os.path.join(self._work_dir, "graph.txt")
        if self.trace_file:
            tf_command += " --trace_file=%s" % os.path.join(self._work_dir, "trace_%s_%u.json" % (job_name, task_id))
    # Env-var prefix followed by the benchmark invocation.
    command = tf_flags + " " + tf_command
    title = "[%s] %s - %u" % (ip, job_name, task_id)
    log_file_path = os.path.join(self._logs_dir, "%s_%u.log" % (job_name, task_id))
    factory = BasicProcess.getFactory(title, log_file_path)
    process = executeRemoteCommand([server_info.hostname], command, factory = factory)[0]
    # Decorate the handle with job metadata used by monitors/reporting.
    process.name = "%s_%u" % (job_name, task_id)
    process.job_name = job_name
    process.task_id = task_id
    process.is_worker = job_name == "worker"
    process.rdma_device = device_info
    process.server_info = server_info
    process.tf_flags = tf_flags
    process.tf_command = tf_command
    process.remote_pid = None  # Resolved later by _findRemoteProcessIDs().
    self._processes.append(process)
    server_info.processes.append(process)
    return process