def sofa_analyze(cfg): print_main_progress('SOFA analyzing...') filein = [] df_cpu = pd.DataFrame([], columns=cfg.columns) df_gpu = pd.DataFrame([], columns=cfg.columns) df_net = pd.DataFrame([], columns=cfg.columns) df_mpstat = pd.DataFrame([], columns=cfg.columns) df_vmstat = pd.DataFrame([], columns=cfg.columns) df_bandwidth = pd.DataFrame([], columns=cfg.columns) df_blktrace = pd.DataFrame([], columns=cfg.columns) df_diskstat = pd.DataFrame([], columns=cfg.columns) df_nvsmi = pd.DataFrame([], columns=cfg.columns) iter_summary = None logdir = cfg.logdir with open(logdir+'/misc.txt') as f: lines = f.readlines() elapsed_time = float(lines[0].split()[1]) vcores = int(lines[2].split()[1]) cfg.elapsed_time = float(lines[0].split()[1]) filein_gpu = logdir + "gputrace.csv" filein_cpu = logdir + "cputrace.csv" filein_net = logdir + "nettrace.csv" filein_vmstat = logdir + "vmstat.csv" filein_mpstat = logdir + "mpstat.csv" filein_strace = logdir + "strace.csv" filein_nvsmi = logdir + "nvsmi_trace.csv" filein_bandwidth = logdir + "netstat.csv" filein_blktrace = logdir + "blktrace.csv" filein_diskstat = logdir + "diskstat_vector.csv" if os.path.isfile('%s/nvlink_topo.txt' % logdir): with open(logdir + 'nvlink_topo.txt') as f: lines = f.readlines() if len(lines) > 0: title = lines[0] num_gpus = 1 for word in title.split(): if re.match(r'GPU', word) != None : num_gpus = num_gpus + 1 print_info(cfg,'# of GPUs: ' + str(num_gpus) ) edges = [] if len(lines) >= num_gpus+1: for i in range(num_gpus): connections = lines[1+i].split() for j in range(len(connections)): if connections[j] == 'NV1' or connections[j] == 'NV2': edges.append((i,j-1)) #print('%d connects to %d' % (i, j-1)) ring_found = False G = nx.DiGraph(edges) # Try to find ring with its length of num_gpus for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus: if cfg.verbose: print('One of the recommended ring having length of %d' % len(cycle)) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Try to find ring with its length of num_gpus/2 if not ring_found: for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus/2: print(("One of the recommended ring having length of %d" % len(cycle) )) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Construct Performance Features features = pd.DataFrame({'name':['elapsed_time'], 'value':[cfg.elapsed_time]}, columns=['name','value']) try: df_nvsmi = pd.read_csv(filein_nvsmi) if not df_nvsmi.empty and cfg.spotlight_gpu: state = 0 sm_high = 0 trigger = 10 for i in range(len(df_nvsmi)): if df_nvsmi.iloc[i].event == 0 and df_nvsmi.iloc[i].deviceId == 0 : if df_nvsmi.iloc[i].duration >= 50: sm_high = min(trigger, sm_high + 1) if df_nvsmi.iloc[i].duration < 10: sm_high = max(0, sm_high - 1) if state == 0 and sm_high == trigger: state = 1 cfg.roi_begin = df_nvsmi.iloc[i].timestamp elif state == 1 and sm_high == 0: state = 0 cfg.roi_end = df_nvsmi.iloc[i].timestamp #print('sm_high=%d state=%d' % (sm_high, state)) if cfg.roi_end - cfg.roi_begin < 0: cfg.roi_end = 0 cfg.roi_begin = 0 except IOError: print_warning(cfg, "nvsmi_trace.csv is not found") try: df_cpu = pd.read_csv(filein_cpu) if not df_cpu.empty: if cfg.verbose: cpu_profile(logdir, cfg, df_cpu) if cfg.enable_swarms and len(df_cpu) > cfg.num_swarms: df_cpu, swarms = hsg_v2(cfg, df_cpu) except IOError as e: df_cpu = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_cpu) try: df_strace = pd.read_csv(filein_strace) if not df_strace.empty: features = strace_profile(logdir, cfg, df_strace, features) except IOError as e: df_strace = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_strace) try: df_net = pd.read_csv(filein_net) if not df_net.empty: features = net_profile(logdir, cfg, df_net, features) except IOError as e: df_net = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_net) try: df_bandwidth = pd.read_csv(filein_bandwidth) if not df_bandwidth.empty: features = netbandwidth_profile(logdir, cfg, df_bandwidth, features) except IOError as e: df_bandwidth = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_bandwidth) try: df_blktrace = pd.read_csv(filein_blktrace) if not df_blktrace.empty: features = blktrace_latency_profile(logdir, cfg, df_blktrace, features) except IOError as e: df_blktrace = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_blktrace) try: df_diskstat = pd.read_csv(filein_diskstat) if not df_diskstat.empty: features = diskstat_profile(logdir, cfg, df_diskstat, features) except IOError as e: df_diskstat = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_diskstat) try: df_vmstat = pd.read_csv(filein_vmstat) if not df_vmstat.empty: features = vmstat_profile(logdir, cfg, df_vmstat, features) except IOError as e: df_vmstat = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_vmstat) try: df_mpstat = pd.read_csv(filein_mpstat) if not df_mpstat.empty: features = mpstat_profile(logdir, cfg, df_mpstat, features) except IOError as e: df_mpstat = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found" % filein_mpstat) try: df_nvsmi = pd.read_csv(filein_nvsmi) features = nvsmi_profile(logdir, cfg, df_nvsmi, features) except IOError: print_warning(cfg, "nvsmi_trace.csv is not found") try: df_gpu = pd.read_csv(filein_gpu) if not df_gpu.empty: features = gpu_profile(logdir, cfg, df_gpu, features) except IOError: df_gpu = pd.DataFrame([], columns=cfg.columns) print_warning(cfg, "%s is not found. If there is no need to profile GPU, just ignore it." % filein_gpu) try: if len(df_mpstat)>0: df_nvsmi.append(df_mpstat.iloc[0]) features = concurrency_breakdown(logdir, cfg, df_mpstat, df_cpu, df_gpu, df_nvsmi, df_bandwidth, features) except IOError as e: print_warning(cfg, "Some files are not found, which are needed for concurrency_breakdown analysis") if cfg.enable_aisi: selected_pattern, iter_summary, features = sofa_aisi(logdir, cfg, df_cpu, df_gpu, df_strace, df_mpstat, features) if 'IS_SOFA_ON_HAIHUB' not in os.environ or os.environ['IS_SOFA_ON_HAIHUB'] == 'no': print_title('Final Performance Features') print('%s%s%s%s' % ('ID'.ljust(10),'Feature'.ljust(30),'Value'.ljust(20),'Unit'.ljust(20)) ) for i in range(len(features)): name = features.iloc[i]['name'] value = features.iloc[i]['value'] print('%s%s%s' % (str(i).ljust(10), name.ljust(30), ('%.3lf'%value).ljust(20))) if cfg.spotlight_gpu: try: print('Elapsed hotspot time: %.3lf' % features[features.name=='elapsed_hotspot_time'].value) except: print_warning(cfg, 'elpased_hostspot_time is not defined.') if cfg.potato_server: if cfg.potato_server.find(':') == -1: cfg.potato_server = cfg.potato_server + ':50051' hint, docker_image = get_hint(cfg.potato_server, features) df_report = pd.read_json(hint, orient='table') file_potato_report = cfg.logdir + 'potato_report.html' # Export report to HTML file. df_report.to_html(file_potato_report ) with open(file_potato_report, 'a') as f: f.write('<head><link rel=stylesheet type="text/css" href="potato_report.css"></head>') print_title('POTATO Feedback') print('%s%s%s%s' % ('ID'.ljust(5), 'Metric'.ljust(20), 'Value'.ljust(10), 'Reference-Value'.ljust(30) ) ) for i in range(len(df_report)): metric = df_report.iloc[i]['Metric'] if metric != 'hybrid_suggestion': value = df_report.iloc[i]['Value'] ref_value = df_report.iloc[i]['ReferenceValue'] print('%s%s%s%s' % (str(i).ljust(5), metric.ljust(20), ('%.3lf'%value).ljust(20), str(ref_value).ljust(30))) print('\n') print_hint('General Suggestions:') for i in range(len(df_report)): metric = df_report.iloc[i]['Metric'] if metric != 'hybrid_suggestion': suggestion = df_report.iloc[i]['Suggestion'] print('%d. %s' % (i, suggestion)) print('\n') print_hint('Framework-specific Optimization Suggestions:') for i in range(len(df_report)): metric = df_report.iloc[i]['Metric'] if metric == 'hybrid_suggestion': suggestion = df_report.iloc[i]['Suggestion'] print('%d. %s' % (i, suggestion)) #print(df_report[['Metric', 'Value', 'Reference Value']]) #print(df_report[['Suggestion']]) #print('Tag of optimal image recommended from POTATO: ' + highlight(docker_image)) print('\n') print_hint('Please re-launch KubeFlow Jupyter-notebook to have suggested images or resources if necessary.') sofa_home = os.path.dirname(os.path.realpath(__file__)) subprocess.Popen( ['bash', '-c', 'cp %s/../sofaboard/* %s;' % (sofa_home, cfg.logdir)]) subprocess.Popen(['sleep', '2']) print('\n\n') print('Complete!!')
def sofa_analyze(cfg): filein = [] df_cpu = pd.DataFrame([], columns=cfg.columns) df_gpu = pd.DataFrame([], columns=cfg.columns) df_net = pd.DataFrame([], columns=cfg.columns) df_mpstat = pd.DataFrame([], columns=cfg.columns) df_vmstat = pd.DataFrame([], columns=cfg.columns) df_bandwidth = pd.DataFrame([], columns=cfg.columns) df_blktrace = pd.DataFrame([], columns=cfg.columns) df_diskstat = pd.DataFrame([], columns=cfg.columns) df_nvsmi = pd.DataFrame([], columns=cfg.columns) iter_summary = None logdir = cfg.logdir with open(logdir+'/misc.txt') as f: lines = f.readlines() elapsed_time = float(lines[0].split()[1]) vcores = int(lines[2].split()[1]) cfg.elapsed_time = float(lines[0].split()[1]) filein_gpu = logdir + "gputrace.csv" filein_cpu = logdir + "cputrace.csv" filein_net = logdir + "nettrace.csv" filein_vmstat = logdir + "vmstat.csv" filein_mpstat = logdir + "mpstat.csv" filein_strace = logdir + "strace.csv" filein_nvsmi = logdir + "nvsmi_trace.csv" filein_bandwidth = logdir + "netstat.csv" filein_blktrace = logdir + "blktrace.csv" filein_diskstat = logdir + "diskstat.csv" if os.path.isfile('%s/nvlink_topo.txt' % logdir): with open(logdir + 'nvlink_topo.txt') as f: lines = f.readlines() if len(lines) > 0: title = lines[0] num_gpus = 1 for word in title.split(): if re.match(r'GPU', word) != None : num_gpus = num_gpus + 1 print_info(cfg,'# of GPUs: ' + str(num_gpus) ) edges = [] if len(lines) >= num_gpus+1: for i in range(num_gpus): connections = lines[1+i].split() for j in range(len(connections)): if connections[j] == 'NV1' or connections[j] == 'NV2': edges.append((i,j-1)) #print('%d connects to %d' % (i, j-1)) ring_found = False G = nx.DiGraph(edges) # Try to find ring with its length of num_gpus for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus: print(("One of the recommended ring having length of %d" % len(cycle) )) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Try to find ring with its length of num_gpus/2 if not ring_found: for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus/2: print(("One of the recommended ring having length of %d" % len(cycle) )) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Construct Performance Features features = pd.DataFrame({'name':['elapsed_time'], 'value':[cfg.elapsed_time]}, columns=['name','value']) try: df_nvsmi = pd.read_csv(filein_nvsmi) if not df_nvsmi.empty and cfg.spotlight_gpu: state = 0 sm_high = 0 trigger = 10 for i in range(len(df_nvsmi)): if df_nvsmi.iloc[i].event == 0 and df_nvsmi.iloc[i].deviceId == 0 : if df_nvsmi.iloc[i].duration >= 50: sm_high = min(trigger, sm_high + 1) if df_nvsmi.iloc[i].duration < 10: sm_high = max(0, sm_high - 1) if state == 0 and sm_high == trigger: state = 1 cfg.roi_begin = df_nvsmi.iloc[i].timestamp elif state == 1 and sm_high == 0: state = 0 cfg.roi_end = df_nvsmi.iloc[i].timestamp #print('sm_high=%d state=%d' % (sm_high, state)) if cfg.roi_end - cfg.roi_begin < 0: cfg.roi_end = 0 cfg.roi_begin = 0 except IOError: print_warning("nvsmi_trace.csv is not found") try: df_cpu = pd.read_csv(filein_cpu) if not df_cpu.empty: cpu_profile(logdir, cfg, df_cpu) df_cpu, swarms = hsg_v2(cfg, df_cpu) except IOError as e: df_cpu = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_cpu) try: df_strace = pd.read_csv(filein_strace) if not df_strace.empty: features = strace_profile(logdir, cfg, df_strace, features) except IOError as e: df_strace = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_strace) try: df_net = pd.read_csv(filein_net) if not df_net.empty: features = net_profile(logdir, cfg, df_net, features) except IOError as e: df_net = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_net) try: df_bandwidth = pd.read_csv(filein_bandwidth) if not df_bandwidth.empty: features = netbandwidth_profile(logdir, cfg, df_bandwidth, features) except IOError as e: df_bandwidth = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_bandwidth) try: df_blktrace = pd.read_csv(filein_blktrace) if not df_blktrace.empty: print(df_blktrace) features = blktrace_latency_profile(logdir, cfg, df_blktrace, features) except IOError as e: df_blktrace = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_blktrace) try: df_diskstat = pd.read_csv(filein_diskstat) if not df_diskstat.empty: features = diskstat_profile(logdir, cfg, df_diskstat, features) except IOError as e: df_diskstat = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_diskstat) try: df_vmstat = pd.read_csv(filein_vmstat) if not df_vmstat.empty: features = vmstat_profile(logdir, cfg, df_vmstat, features) except IOError as e: df_vmstat = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_vmstat) try: df_mpstat = pd.read_csv(filein_mpstat) if not df_mpstat.empty: features = mpstat_profile(logdir, cfg, df_mpstat, features) except IOError as e: df_mpstat = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_mpstat) try: df_nvsmi = pd.read_csv(filein_nvsmi) features = nvsmi_profile(logdir, cfg, df_nvsmi, features) except IOError: print_warning("nvsmi_trace.csv is not found") try: df_gpu = pd.read_csv(filein_gpu) if not df_gpu.empty: features = gpu_profile(logdir, cfg, df_gpu, features) except IOError: df_gpu = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found. If there is no need to profile GPU, just ignore it." % filein_gpu) try: if df_nvsmi.empty: df_nvsmi.append(df_mpstat.iloc[0]) features = dynamic_top_down(logdir, cfg, df_mpstat, df_cpu, df_gpu, df_nvsmi, df_bandwidth, features) except IOError as e: print_warning("Some files are not found, which are needed for dynamic_top_down analysis") if cfg.enable_aisi: selected_pattern, iter_summary = sofa_aisi(logdir, cfg, df_cpu, df_gpu, df_strace, df_mpstat) print_title('Final Performance Features') print('%s%s%s' % ('ID'.ljust(10),'Feature'.ljust(30),'Value'.ljust(20)) ) for i in range(len(features)): name = features.iloc[i]['name'] value = features.iloc[i]['value'] print('%s%s%s' % (str(i).ljust(10), name.ljust(30), ('%.3lf'%value).ljust(20))) if cfg.potato_server: print_title('POTATO Feedback') if cfg.potato_server.find(':') == -1: cfg.potato_server = cfg.potato_server + ':50051' hint, docker_image = get_hint(cfg.potato_server, features) print('Optimization hints: \n') df_report = pd.read_json(hint, orient='table') print(df_report) file_potato_report = cfg.logdir + 'potato_report.html' df_report.to_html(file_potato_report ) with open(file_potato_report, 'a') as f: f.write('<head><link rel=stylesheet type="text/css" href="potato_report.css"></head>') print('Tag of optimal image recommended from POTATO: ' + highlight(docker_image)) print('Please re-launch KubeFlow Jupyter-notebook with the new tag.') print('\n\n')
def sofa_analyze(cfg): filein = [] df_cpu = pd.DataFrame([], columns=cfg.columns) df_gpu = pd.DataFrame([], columns=cfg.columns) df_net = pd.DataFrame([], columns=cfg.columns) df_mpstat = pd.DataFrame([], columns=cfg.columns) df_vmstat = pd.DataFrame([], columns=cfg.columns) iter_summary = None logdir = cfg.logdir with open(logdir + '/misc.txt') as f: lines = f.readlines() elapsed_time = float(lines[0].split()[1]) vcores = int(lines[2].split()[1]) cfg.elapsed_time = float(lines[0].split()[1]) filein_gpu = logdir + "gputrace.csv" filein_cpu = logdir + "cputrace.csv" filein_net = logdir + "nettrace.csv" filein_vmstat = logdir + "vmstat.csv" filein_mpstat = logdir + "mpstat.csv" filein_strace = logdir + "strace.csv" if os.path.isfile('%s/nvlink_topo.txt' % logdir): with open(logdir + 'nvlink_topo.txt') as f: lines = f.readlines() if len(lines) > 0: title = lines[0] num_gpus = 1 for word in title.split(): if re.match(r'GPU', word) != None: num_gpus = num_gpus + 1 print_info(cfg, '# of GPUs: ' + str(num_gpus)) edges = [] if len(lines) >= num_gpus + 1: for i in range(num_gpus): connections = lines[1 + i].split() for j in range(len(connections)): if connections[j] == 'NV1' or connections[ j] == 'NV2': edges.append((i, j - 1)) #print('%d connects to %d' % (i, j-1)) ring_found = False G = nx.DiGraph(edges) # Try to find ring with its length of num_gpus for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus: print(( "One of the recommended ring having length of %d" % len(cycle))) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Try to find ring with its length of num_gpus/2 if not ring_found: for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus / 2: print(( "One of the recommended ring having length of %d" % len(cycle))) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Construct Performance Features features = pd.DataFrame( { 'name': ['elapsed_time'], 'value': [cfg.elapsed_time] }, columns=['name', 'value']) try: df_cpu = pd.read_csv(filein_cpu) cpu_profile(logdir, cfg, df_cpu) except IOError as e: df_cpu = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_cpu) try: df_strace = pd.read_csv(filein_strace) except IOError as e: df_strace = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_strace) try: df_net = pd.read_csv(filein_net) features = net_profile(logdir, cfg, df_net, features) except IOError as e: df_net = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_net) try: df_vmstat = pd.read_csv(filein_vmstat) features = vmstat_profile(logdir, cfg, df_vmstat, features) except IOError as e: df_vmstat = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_vmstat) try: df_mpstat = pd.read_csv(filein_mpstat) features = mpstat_profile(logdir, cfg, df_mpstat, features) except IOError as e: df_mpstat = pd.DataFrame([], columns=cfg.columns) print_warning("%s is not found" % filein_mpstat) try: df_gpu = pd.read_csv(filein_gpu) features = gpu_profile(logdir, cfg, df_gpu, features) except IOError: df_gpu = pd.DataFrame([], columns=cfg.columns) print_warning( "%s is not found. If there is no need to profile GPU, just ignore it." % filein_gpu) try: features = dynamic_top_down(logdir, cfg, df_mpstat, df_cpu, df_gpu, features) except IOError as e: print_warning( "Some files are not found, which are needed for dynamic_top_down analysis" ) if cfg.enable_aisi: selected_pattern, iter_summary = sofa_aisi(logdir, cfg, df_cpu, df_gpu, df_strace, df_mpstat) print_title('Final Performance Features') print('%s%s%s' % ('ID'.ljust(10), 'Feature'.ljust(30), 'Value'.ljust(20))) for i in range(len(features)): name = features.iloc[i]['name'] value = features.iloc[i]['value'] print('%s%s%s' % (str(i).ljust(10), name.ljust(30), ('%.3lf' % value).ljust(20))) if cfg.potato_server: print_title('POTATO Feedback') hint, docker_image = get_hint(features) print('Optimization hints: ' + hint) print('Tag of optimal image recommended from POTATO: ' + highlight(docker_image)) print('Please re-launch KubeFlow Jupyter-notebook with the new tag.') print('\n\n')
def sofa_analyze(logdir, cfg): filein = [] df_gpu = [] df_cpu = [] df_vmstat = [] filein_gpu = logdir + "gputrace.csv" filein_cpu = logdir + "cputrace.csv" filein_vmstat = logdir + "vmstat_trace.csv" if os.path.isfile('%s/nvlink_topo.txt' % logdir): with open(logdir + 'nvlink_topo.txt') as f: lines = f.readlines() if len(lines) > 0: title = lines[0] num_gpus = 1 for word in title.split(): if re.match(r'GPU', word) != None: num_gpus = num_gpus + 1 print_info('# of GPUs: ' + str(num_gpus)) edges = [] if len(lines) >= num_gpus + 1: for i in range(num_gpus): connections = lines[1 + i].split() for j in range(len(connections)): if connections[j] == 'NV1' or connections[ j] == 'NV2': edges.append((i, j - 1)) #print('%d connects to %d' % (i, j-1)) ring_found = False G = nx.DiGraph(edges) # Try to find ring with its length of num_gpus for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus: print(( "One of the recommended ring having length of %d" % len(cycle))) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break # Try to find ring with its length of num_gpus/2 if not ring_found: for cycle in nx.simple_cycles(G): if len(cycle) == num_gpus / 2: print(( "One of the recommended ring having length of %d" % len(cycle))) ring_found = True os.system("mkdir -p sofalog/sofa_hints/") xring_order = ','.join(map(str, cycle)) with open("sofalog/sofa_hints/xring_order.txt", "w") as f: f.write('export CUDA_VISIBLE_DEVICES=' + xring_order) break try: df_cpu = pd.read_csv(filein_cpu) df_vmstat = pd.read_csv(filein_vmstat) cpu_profile(logdir, cfg, df_cpu) net_profile(logdir, cfg, df_cpu) vmstat_profile(logdir, cfg, df_vmstat) except IOError: print_warning("cputrace.csv is not found") #quit() try: df_gpu = pd.read_csv(filein_gpu) #df_gpu.loc[:, 'timestamp'] -= df_gpu.loc[0, 'timestamp'] gpu_profile(logdir, cfg, df_gpu) if cfg.enable_aisi: sofa_aisi(logdir, cfg, df_cpu, df_gpu) except IOError: print_warning( "gputrace.csv is not found. If there is no need to profile GPU, just ignore it." )