Example #1
0
def sofa_analyze(cfg):
    """Analyze the traces stored in ``cfg.logdir`` and report the results.

    Steps, in order:
      1. read the elapsed time from ``misc.txt`` into ``cfg.elapsed_time``;
      2. if ``nvlink_topo.txt`` exists, search the NVLink graph for a ring
         and write a ``CUDA_VISIBLE_DEVICES`` hint file;
      3. when ``cfg.spotlight_gpu`` is set, derive ``cfg.roi_begin`` and
         ``cfg.roi_end`` from the nvsmi trace;
      4. load each trace CSV and hand it to its profiler, whose return value
         replaces the ``features`` frame;
      5. print the feature table and, when ``cfg.potato_server`` is set,
         the POTATO report (also written to ``potato_report.html``);
      6. launch a copy of the sofaboard files into ``cfg.logdir``.

    A trace file that cannot be opened (``IOError``) only produces a
    warning; any other exception propagates to the caller.

    Args:
        cfg: configuration object. Attributes read here: ``logdir``,
            ``columns``, ``verbose``, ``spotlight_gpu``, ``roi_begin``,
            ``roi_end``, ``enable_swarms``, ``num_swarms``, ``enable_aisi``
            and ``potato_server``. Attributes written: ``elapsed_time``,
            ``roi_begin``, ``roi_end`` and ``potato_server`` (a default
            port is appended when none is given).

    Returns:
        None.
    """

    def _load_and_profile(path, profiler, features):
        """Read *path*; if it has rows, let *profiler* update *features*."""
        try:
            df = pd.read_csv(path)
            if not df.empty:
                features = profiler(logdir, cfg, df, features)
        except IOError:
            df = pd.DataFrame([], columns=cfg.columns)
            print_warning(cfg, "%s is not found" % path)
        return df, features

    print_main_progress('SOFA analyzing...')
    logdir = cfg.logdir
    # Only df_nvsmi needs a pre-seeded empty frame: its loaders keep whatever
    # is already bound when the CSV cannot be opened.
    df_nvsmi = pd.DataFrame([], columns=cfg.columns)

    with open(logdir + '/misc.txt') as f:
        # The second whitespace-separated token of the first line is the
        # elapsed time.
        cfg.elapsed_time = float(f.readline().split()[1])

    filein_gpu = logdir + "gputrace.csv"
    filein_cpu = logdir + "cputrace.csv"
    filein_net = logdir + "nettrace.csv"
    filein_vmstat = logdir + "vmstat.csv"
    filein_mpstat = logdir + "mpstat.csv"
    filein_strace = logdir + "strace.csv"
    filein_nvsmi = logdir + "nvsmi_trace.csv"
    filein_bandwidth = logdir + "netstat.csv"
    filein_blktrace = logdir + "blktrace.csv"
    filein_diskstat = logdir + "diskstat_vector.csv"

    # Use one path for both the existence check and the open so the two
    # cannot disagree when logdir lacks a trailing slash.
    topo_path = os.path.join(logdir, 'nvlink_topo.txt')
    if os.path.isfile(topo_path):
        with open(topo_path) as f:
            lines = f.readlines()
        if lines:
            # The count starts at 1, so it is one more than the number of
            # header tokens that begin with 'GPU'.
            num_gpus = 1 + sum(
                1 for word in lines[0].split()
                if re.match(r'GPU', word) is not None)
            print_info(cfg, '# of GPUs: ' + str(num_gpus))
            if len(lines) >= num_gpus + 1:
                edges = []
                for i in range(num_gpus):
                    for j, link in enumerate(lines[1 + i].split()):
                        if link in ('NV1', 'NV2'):
                            # j - 1: presumably column 0 is the row label
                            # -- TODO confirm against the topology format.
                            edges.append((i, j - 1))

                G = nx.DiGraph(edges)
                # Prefer a ring through num_gpus nodes; otherwise fall back
                # to one of length num_gpus / 2.
                for full_ring in (True, False):
                    wanted = num_gpus if full_ring else num_gpus / 2
                    ring = next(
                        (c for c in nx.simple_cycles(G) if len(c) == wanted),
                        None)
                    if ring is None:
                        continue
                    # The full-length ring is only announced in verbose mode.
                    if not full_ring or cfg.verbose:
                        print('One of the recommended ring having length of %d' % len(ring))
                    os.makedirs("sofalog/sofa_hints/", exist_ok=True)
                    with open("sofalog/sofa_hints/xring_order.txt", "w") as f:
                        f.write('export CUDA_VISIBLE_DEVICES=' + ','.join(map(str, ring)))
                    break

    # Construct Performance Features
    features = pd.DataFrame({'name': ['elapsed_time'], 'value': [cfg.elapsed_time]},
                            columns=['name', 'value'])

    try:
        df_nvsmi = pd.read_csv(filein_nvsmi)
        if not df_nvsmi.empty and cfg.spotlight_gpu:
            # Saturating counter with hysteresis over rows where event == 0
            # and deviceId == 0: it rises while duration >= 50 and falls
            # while duration < 10. The region of interest opens when the
            # counter reaches `trigger` and closes when it returns to 0.
            # NOTE(review): `duration` presumably holds an SM utilisation
            # percentage for these rows -- confirm against the nvsmi parser.
            state = 0
            sm_high = 0
            trigger = 10
            for i in range(len(df_nvsmi)):
                row = df_nvsmi.iloc[i]  # build the row Series once
                if row.event == 0 and row.deviceId == 0:
                    if row.duration >= 50:
                        sm_high = min(trigger, sm_high + 1)
                    if row.duration < 10:
                        sm_high = max(0, sm_high - 1)
                    if state == 0 and sm_high == trigger:
                        state = 1
                        cfg.roi_begin = row.timestamp
                    elif state == 1 and sm_high == 0:
                        state = 0
                        cfg.roi_end = row.timestamp
            # Discard a region whose end precedes its beginning.
            if cfg.roi_end - cfg.roi_begin < 0:
                cfg.roi_end = 0
                cfg.roi_begin = 0
    except IOError:
        print_warning(cfg, "nvsmi_trace.csv is not found")

    try:
        df_cpu = pd.read_csv(filein_cpu)
        if not df_cpu.empty:
            if cfg.verbose:
                cpu_profile(logdir, cfg, df_cpu)
            if cfg.enable_swarms and len(df_cpu) > cfg.num_swarms:
                df_cpu, swarms = hsg_v2(cfg, df_cpu)
    except IOError:
        df_cpu = pd.DataFrame([], columns=cfg.columns)
        print_warning(cfg, "%s is not found" % filein_cpu)

    # Order matters: each profiler receives the features accumulated so far.
    df_strace, features = _load_and_profile(filein_strace, strace_profile, features)
    _, features = _load_and_profile(filein_net, net_profile, features)
    df_bandwidth, features = _load_and_profile(filein_bandwidth, netbandwidth_profile, features)
    _, features = _load_and_profile(filein_blktrace, blktrace_latency_profile, features)
    _, features = _load_and_profile(filein_diskstat, diskstat_profile, features)
    _, features = _load_and_profile(filein_vmstat, vmstat_profile, features)
    df_mpstat, features = _load_and_profile(filein_mpstat, mpstat_profile, features)

    try:
        df_nvsmi = pd.read_csv(filein_nvsmi)
        features = nvsmi_profile(logdir, cfg, df_nvsmi, features)
    except IOError:
        print_warning(cfg, "nvsmi_trace.csv is not found")

    try:
        df_gpu = pd.read_csv(filein_gpu)
        if not df_gpu.empty:
            features = gpu_profile(logdir, cfg, df_gpu, features)
    except IOError:
        df_gpu = pd.DataFrame([], columns=cfg.columns)
        print_warning(cfg, "%s is not found. If there is no need to profile GPU, just ignore it." % filein_gpu)

    try:
        if len(df_mpstat) > 0:
            # df_nvsmi is passed through as loaded. DataFrame.append returns
            # a new frame (and was removed in pandas 2.0), so an append whose
            # result is discarded would have no effect on it.
            features = concurrency_breakdown(logdir, cfg, df_mpstat, df_cpu, df_gpu, df_nvsmi, df_bandwidth, features)
    except IOError:
        print_warning(cfg, "Some files are not found, which are needed for concurrency_breakdown analysis")

    if cfg.enable_aisi:
        selected_pattern, iter_summary, features = sofa_aisi(logdir, cfg, df_cpu, df_gpu, df_strace, df_mpstat, features)

    # The table is printed unless IS_SOFA_ON_HAIHUB is set to something
    # other than 'no'.
    if os.environ.get('IS_SOFA_ON_HAIHUB', 'no') == 'no':
        print_title('Final Performance Features')
        print('%s%s%s%s' % ('ID'.ljust(10), 'Feature'.ljust(30), 'Value'.ljust(20), 'Unit'.ljust(20)))
        for i, (name, value) in enumerate(zip(features['name'], features['value'])):
            print('%s%s%s' % (str(i).ljust(10), name.ljust(30), ('%.3lf' % value).ljust(20)))

    if cfg.spotlight_gpu:
        try:
            print('Elapsed hotspot time: %.3lf' % features[features.name == 'elapsed_hotspot_time'].value)
        except Exception:
            # Formatting fails when the feature row is absent; report it
            # without also swallowing KeyboardInterrupt/SystemExit.
            print_warning(cfg, 'elapsed_hotspot_time is not defined.')

    if cfg.potato_server:
        # Append the default port when the address carries none.
        if ':' not in cfg.potato_server:
            cfg.potato_server = cfg.potato_server + ':50051'
        hint, docker_image = get_hint(cfg.potato_server, features)
        df_report = pd.read_json(hint, orient='table')
        file_potato_report = cfg.logdir + 'potato_report.html'

        # Export report to HTML file, then append the stylesheet link.
        df_report.to_html(file_potato_report)
        with open(file_potato_report, 'a') as f:
            f.write('<head><link rel=stylesheet type="text/css" href="potato_report.css"></head>')

        report_rows = [df_report.iloc[i] for i in range(len(df_report))]

        print_title('POTATO Feedback')
        print('%s%s%s%s' % ('ID'.ljust(5), 'Metric'.ljust(20), 'Value'.ljust(10), 'Reference-Value'.ljust(30)))
        for i, row in enumerate(report_rows):
            if row['Metric'] != 'hybrid_suggestion':
                print('%s%s%s%s' % (str(i).ljust(5), row['Metric'].ljust(20), ('%.3lf' % row['Value']).ljust(20), str(row['ReferenceValue']).ljust(30)))

        print('\n')
        print_hint('General Suggestions:')
        for i, row in enumerate(report_rows):
            if row['Metric'] != 'hybrid_suggestion':
                print('%d. %s' % (i, row['Suggestion']))

        print('\n')
        print_hint('Framework-specific Optimization Suggestions:')
        for i, row in enumerate(report_rows):
            if row['Metric'] == 'hybrid_suggestion':
                print('%d. %s' % (i, row['Suggestion']))

        print('\n')
        print_hint('Please re-launch KubeFlow Jupyter-notebook to have suggested images or resources if necessary.')

    # NOTE(review): neither child process is waited on, so the `sleep 2`
    # does not delay this function and the copy may still be running when
    # 'Complete!!' is printed.
    sofa_home = os.path.dirname(os.path.realpath(__file__))
    subprocess.Popen(
        ['bash', '-c', 'cp %s/../sofaboard/* %s;' % (sofa_home, cfg.logdir)])
    subprocess.Popen(['sleep', '2'])
    print('\n\n')
    print('Complete!!')
Example #2
0
def sofa_analyze(cfg):
    """Analyze the traces stored in ``cfg.logdir`` and print the results.

    Steps, in order:
      1. read the elapsed time from ``misc.txt`` into ``cfg.elapsed_time``;
      2. if ``nvlink_topo.txt`` exists, search the NVLink graph for a ring
         and write a ``CUDA_VISIBLE_DEVICES`` hint file;
      3. when ``cfg.spotlight_gpu`` is set, derive ``cfg.roi_begin`` and
         ``cfg.roi_end`` from the nvsmi trace;
      4. load each trace CSV and hand it to its profiler, whose return value
         replaces the ``features`` frame;
      5. print the feature table and, when ``cfg.potato_server`` is set,
         the POTATO report (also written to ``potato_report.html``).

    A trace file that cannot be opened (``IOError``) only produces a
    warning; any other exception propagates to the caller.

    Args:
        cfg: configuration object. Attributes read here: ``logdir``,
            ``columns``, ``spotlight_gpu``, ``roi_begin``, ``roi_end``,
            ``enable_aisi`` and ``potato_server``. Attributes written:
            ``elapsed_time``, ``roi_begin``, ``roi_end`` and
            ``potato_server`` (a default port is appended when none is
            given).

    Returns:
        None.
    """

    def _load_and_profile(path, profiler, features):
        """Read *path*; if it has rows, let *profiler* update *features*."""
        try:
            df = pd.read_csv(path)
            if not df.empty:
                features = profiler(logdir, cfg, df, features)
        except IOError:
            df = pd.DataFrame([], columns=cfg.columns)
            print_warning("%s is not found" % path)
        return df, features

    logdir = cfg.logdir
    # Only df_nvsmi needs a pre-seeded empty frame: its loaders keep whatever
    # is already bound when the CSV cannot be opened.
    df_nvsmi = pd.DataFrame([], columns=cfg.columns)

    with open(logdir + '/misc.txt') as f:
        # The second whitespace-separated token of the first line is the
        # elapsed time.
        cfg.elapsed_time = float(f.readline().split()[1])

    filein_gpu = logdir + "gputrace.csv"
    filein_cpu = logdir + "cputrace.csv"
    filein_net = logdir + "nettrace.csv"
    filein_vmstat = logdir + "vmstat.csv"
    filein_mpstat = logdir + "mpstat.csv"
    filein_strace = logdir + "strace.csv"
    filein_nvsmi = logdir + "nvsmi_trace.csv"
    filein_bandwidth = logdir + "netstat.csv"
    filein_blktrace = logdir + "blktrace.csv"
    filein_diskstat = logdir + "diskstat.csv"

    # Use one path for both the existence check and the open so the two
    # cannot disagree when logdir lacks a trailing slash.
    topo_path = os.path.join(logdir, 'nvlink_topo.txt')
    if os.path.isfile(topo_path):
        with open(topo_path) as f:
            lines = f.readlines()
        if lines:
            # The count starts at 1, so it is one more than the number of
            # header tokens that begin with 'GPU'.
            num_gpus = 1 + sum(
                1 for word in lines[0].split()
                if re.match(r'GPU', word) is not None)
            print_info(cfg, '# of GPUs: ' + str(num_gpus))
            if len(lines) >= num_gpus + 1:
                edges = []
                for i in range(num_gpus):
                    for j, link in enumerate(lines[1 + i].split()):
                        if link in ('NV1', 'NV2'):
                            # j - 1: presumably column 0 is the row label
                            # -- TODO confirm against the topology format.
                            edges.append((i, j - 1))

                G = nx.DiGraph(edges)
                # Prefer a ring through num_gpus nodes; otherwise fall back
                # to one of length num_gpus / 2.
                for wanted in (num_gpus, num_gpus / 2):
                    ring = next(
                        (c for c in nx.simple_cycles(G) if len(c) == wanted),
                        None)
                    if ring is None:
                        continue
                    print(("One of the recommended ring having length of %d" % len(ring)))
                    os.makedirs("sofalog/sofa_hints/", exist_ok=True)
                    with open("sofalog/sofa_hints/xring_order.txt", "w") as f:
                        f.write('export CUDA_VISIBLE_DEVICES=' + ','.join(map(str, ring)))
                    break

    # Construct Performance Features
    features = pd.DataFrame({'name': ['elapsed_time'], 'value': [cfg.elapsed_time]},
                            columns=['name', 'value'])

    try:
        df_nvsmi = pd.read_csv(filein_nvsmi)
        if not df_nvsmi.empty and cfg.spotlight_gpu:
            # Saturating counter with hysteresis over rows where event == 0
            # and deviceId == 0: it rises while duration >= 50 and falls
            # while duration < 10. The region of interest opens when the
            # counter reaches `trigger` and closes when it returns to 0.
            # NOTE(review): `duration` presumably holds an SM utilisation
            # percentage for these rows -- confirm against the nvsmi parser.
            state = 0
            sm_high = 0
            trigger = 10
            for i in range(len(df_nvsmi)):
                row = df_nvsmi.iloc[i]  # build the row Series once
                if row.event == 0 and row.deviceId == 0:
                    if row.duration >= 50:
                        sm_high = min(trigger, sm_high + 1)
                    if row.duration < 10:
                        sm_high = max(0, sm_high - 1)
                    if state == 0 and sm_high == trigger:
                        state = 1
                        cfg.roi_begin = row.timestamp
                    elif state == 1 and sm_high == 0:
                        state = 0
                        cfg.roi_end = row.timestamp
            # Discard a region whose end precedes its beginning.
            if cfg.roi_end - cfg.roi_begin < 0:
                cfg.roi_end = 0
                cfg.roi_begin = 0
    except IOError:
        print_warning("nvsmi_trace.csv is not found")

    try:
        df_cpu = pd.read_csv(filein_cpu)
        if not df_cpu.empty:
            cpu_profile(logdir, cfg, df_cpu)
            df_cpu, swarms = hsg_v2(cfg, df_cpu)
    except IOError:
        df_cpu = pd.DataFrame([], columns=cfg.columns)
        print_warning("%s is not found" % filein_cpu)

    # Order matters: each profiler receives the features accumulated so far.
    df_strace, features = _load_and_profile(filein_strace, strace_profile, features)
    _, features = _load_and_profile(filein_net, net_profile, features)
    df_bandwidth, features = _load_and_profile(filein_bandwidth, netbandwidth_profile, features)

    # blktrace is handled inline because it also dumps the raw frame.
    try:
        df_blktrace = pd.read_csv(filein_blktrace)
        if not df_blktrace.empty:
            print(df_blktrace)
            features = blktrace_latency_profile(logdir, cfg, df_blktrace, features)
    except IOError:
        print_warning("%s is not found" % filein_blktrace)

    _, features = _load_and_profile(filein_diskstat, diskstat_profile, features)
    _, features = _load_and_profile(filein_vmstat, vmstat_profile, features)
    df_mpstat, features = _load_and_profile(filein_mpstat, mpstat_profile, features)

    try:
        df_nvsmi = pd.read_csv(filein_nvsmi)
        features = nvsmi_profile(logdir, cfg, df_nvsmi, features)
    except IOError:
        print_warning("nvsmi_trace.csv is not found")

    try:
        df_gpu = pd.read_csv(filein_gpu)
        if not df_gpu.empty:
            features = gpu_profile(logdir, cfg, df_gpu, features)
    except IOError:
        df_gpu = pd.DataFrame([], columns=cfg.columns)
        print_warning("%s is not found. If there is no need to profile GPU, just ignore it." % filein_gpu)

    try:
        # df_nvsmi is passed through as loaded, even when empty.
        # DataFrame.append returns a new frame (and was removed in pandas
        # 2.0), so appending df_mpstat.iloc[0] and discarding the result
        # would not change df_nvsmi; it would only raise IndexError when
        # df_mpstat is empty as well.
        features = dynamic_top_down(logdir, cfg, df_mpstat, df_cpu, df_gpu, df_nvsmi, df_bandwidth, features)
    except IOError:
        print_warning("Some files are not found, which are needed for dynamic_top_down analysis")

    if cfg.enable_aisi:
        selected_pattern, iter_summary = sofa_aisi(logdir, cfg, df_cpu, df_gpu, df_strace, df_mpstat)

    print_title('Final Performance Features')
    print('%s%s%s' % ('ID'.ljust(10), 'Feature'.ljust(30), 'Value'.ljust(20)))
    for i, (name, value) in enumerate(zip(features['name'], features['value'])):
        print('%s%s%s' % (str(i).ljust(10), name.ljust(30), ('%.3lf' % value).ljust(20)))

    if cfg.potato_server:
        print_title('POTATO Feedback')
        # Append the default port when the address carries none.
        if ':' not in cfg.potato_server:
            cfg.potato_server = cfg.potato_server + ':50051'
        hint, docker_image = get_hint(cfg.potato_server, features)
        print('Optimization hints: \n')
        df_report = pd.read_json(hint, orient='table')
        print(df_report)
        # Export the report to HTML, then append the stylesheet link.
        file_potato_report = cfg.logdir + 'potato_report.html'
        df_report.to_html(file_potato_report)
        with open(file_potato_report, 'a') as f:
            f.write('<head><link rel=stylesheet type="text/css" href="potato_report.css"></head>')
        print('Tag of optimal image recommended from POTATO: ' + highlight(docker_image))
        print('Please re-launch KubeFlow Jupyter-notebook with the new tag.')

    print('\n\n')
Example #3
0
def sofa_analyze(cfg):
    """Analyze the traces stored in ``cfg.logdir`` and print the results.

    Steps, in order:
      1. read the elapsed time from ``misc.txt`` into ``cfg.elapsed_time``;
      2. if ``nvlink_topo.txt`` exists, search the NVLink graph for a ring
         and write a ``CUDA_VISIBLE_DEVICES`` hint file;
      3. load each trace CSV and hand it to its profiler, whose return value
         replaces the ``features`` frame;
      4. print the feature table and, when ``cfg.potato_server`` is set,
         the POTATO feedback.

    A trace file that cannot be opened (``IOError``) only produces a
    warning; any other exception propagates to the caller.

    Args:
        cfg: configuration object. Attributes read here: ``logdir``,
            ``columns``, ``enable_aisi`` and ``potato_server``. Attribute
            written: ``elapsed_time``.

    Returns:
        None.
    """

    def _load_and_profile(path, profiler, features):
        """Read *path* and let *profiler* update *features*."""
        try:
            df = pd.read_csv(path)
            features = profiler(logdir, cfg, df, features)
        except IOError:
            df = pd.DataFrame([], columns=cfg.columns)
            print_warning("%s is not found" % path)
        return df, features

    logdir = cfg.logdir

    with open(logdir + '/misc.txt') as f:
        # The second whitespace-separated token of the first line is the
        # elapsed time.
        cfg.elapsed_time = float(f.readline().split()[1])

    filein_gpu = logdir + "gputrace.csv"
    filein_cpu = logdir + "cputrace.csv"
    filein_net = logdir + "nettrace.csv"
    filein_vmstat = logdir + "vmstat.csv"
    filein_mpstat = logdir + "mpstat.csv"
    filein_strace = logdir + "strace.csv"

    # Use one path for both the existence check and the open so the two
    # cannot disagree when logdir lacks a trailing slash.
    topo_path = os.path.join(logdir, 'nvlink_topo.txt')
    if os.path.isfile(topo_path):
        with open(topo_path) as f:
            lines = f.readlines()
        if lines:
            # The count starts at 1, so it is one more than the number of
            # header tokens that begin with 'GPU'.
            num_gpus = 1 + sum(
                1 for word in lines[0].split()
                if re.match(r'GPU', word) is not None)
            print_info(cfg, '# of GPUs: ' + str(num_gpus))
            if len(lines) >= num_gpus + 1:
                edges = []
                for i in range(num_gpus):
                    for j, link in enumerate(lines[1 + i].split()):
                        if link in ('NV1', 'NV2'):
                            # j - 1: presumably column 0 is the row label
                            # -- TODO confirm against the topology format.
                            edges.append((i, j - 1))

                G = nx.DiGraph(edges)
                # Prefer a ring through num_gpus nodes; otherwise fall back
                # to one of length num_gpus / 2.
                for wanted in (num_gpus, num_gpus / 2):
                    ring = next(
                        (c for c in nx.simple_cycles(G) if len(c) == wanted),
                        None)
                    if ring is None:
                        continue
                    print(("One of the recommended ring having length of %d"
                           % len(ring)))
                    os.makedirs("sofalog/sofa_hints/", exist_ok=True)
                    with open("sofalog/sofa_hints/xring_order.txt", "w") as f:
                        f.write('export CUDA_VISIBLE_DEVICES=' +
                                ','.join(map(str, ring)))
                    break

    # Construct Performance Features
    features = pd.DataFrame(
        {
            'name': ['elapsed_time'],
            'value': [cfg.elapsed_time]
        },
        columns=['name', 'value'])

    try:
        df_cpu = pd.read_csv(filein_cpu)
        cpu_profile(logdir, cfg, df_cpu)
    except IOError:
        df_cpu = pd.DataFrame([], columns=cfg.columns)
        print_warning("%s is not found" % filein_cpu)

    # strace is only loaded here; it is consumed later by sofa_aisi.
    try:
        df_strace = pd.read_csv(filein_strace)
    except IOError:
        df_strace = pd.DataFrame([], columns=cfg.columns)
        print_warning("%s is not found" % filein_strace)

    # Order matters: each profiler receives the features accumulated so far.
    _, features = _load_and_profile(filein_net, net_profile, features)
    _, features = _load_and_profile(filein_vmstat, vmstat_profile, features)
    df_mpstat, features = _load_and_profile(filein_mpstat, mpstat_profile,
                                            features)

    try:
        df_gpu = pd.read_csv(filein_gpu)
        features = gpu_profile(logdir, cfg, df_gpu, features)
    except IOError:
        df_gpu = pd.DataFrame([], columns=cfg.columns)
        print_warning(
            "%s is not found. If there is no need to profile GPU, just ignore it."
            % filein_gpu)

    try:
        features = dynamic_top_down(logdir, cfg, df_mpstat, df_cpu, df_gpu,
                                    features)
    except IOError:
        print_warning(
            "Some files are not found, which are needed for dynamic_top_down analysis"
        )

    if cfg.enable_aisi:
        selected_pattern, iter_summary = sofa_aisi(logdir, cfg, df_cpu, df_gpu,
                                                   df_strace, df_mpstat)

    print_title('Final Performance Features')
    print('%s%s%s' % ('ID'.ljust(10), 'Feature'.ljust(30), 'Value'.ljust(20)))
    for i, (name, value) in enumerate(
            zip(features['name'], features['value'])):
        print('%s%s%s' % (str(i).ljust(10), name.ljust(30),
                          ('%.3lf' % value).ljust(20)))

    if cfg.potato_server:
        print_title('POTATO Feedback')
        hint, docker_image = get_hint(features)
        print('Optimization hints: ' + hint)
        print('Tag of optimal image recommended from POTATO: ' +
              highlight(docker_image))
        print('Please re-launch KubeFlow Jupyter-notebook with the new tag.')

    print('\n\n')
Example #4
0
def sofa_analyze(logdir, cfg):
    """Analyze the CPU, vmstat and GPU traces found under *logdir*.

    If ``nvlink_topo.txt`` exists, the NVLink graph is searched for a ring
    and a ``CUDA_VISIBLE_DEVICES`` hint file is written. Each trace CSV is
    then loaded and passed to its profiler. A trace that cannot be opened
    (``IOError``) only produces a warning naming that file; any other
    exception propagates to the caller.

    Args:
        logdir: directory prefix of the trace files. File names are
            appended by plain concatenation, so it is expected to end with
            a path separator.
        cfg: configuration object forwarded to the profilers;
            ``cfg.enable_aisi`` decides whether ``sofa_aisi`` runs.

    Returns:
        None.
    """
    # Stays an empty list when cputrace.csv cannot be read; sofa_aisi then
    # receives that list instead of a DataFrame.
    df_cpu = []

    filein_gpu = logdir + "gputrace.csv"
    filein_cpu = logdir + "cputrace.csv"
    filein_vmstat = logdir + "vmstat_trace.csv"

    # Use one path for both the existence check and the open so the two
    # cannot disagree when logdir lacks a trailing slash.
    topo_path = os.path.join(logdir, 'nvlink_topo.txt')
    if os.path.isfile(topo_path):
        with open(topo_path) as f:
            lines = f.readlines()
        if lines:
            # The count starts at 1, so it is one more than the number of
            # header tokens that begin with 'GPU'.
            num_gpus = 1 + sum(
                1 for word in lines[0].split()
                if re.match(r'GPU', word) is not None)
            print_info('# of GPUs: ' + str(num_gpus))
            if len(lines) >= num_gpus + 1:
                edges = []
                for i in range(num_gpus):
                    for j, link in enumerate(lines[1 + i].split()):
                        if link in ('NV1', 'NV2'):
                            # j - 1: presumably column 0 is the row label
                            # -- TODO confirm against the topology format.
                            edges.append((i, j - 1))

                G = nx.DiGraph(edges)
                # Prefer a ring through num_gpus nodes; otherwise fall back
                # to one of length num_gpus / 2.
                for wanted in (num_gpus, num_gpus / 2):
                    ring = next(
                        (c for c in nx.simple_cycles(G) if len(c) == wanted),
                        None)
                    if ring is None:
                        continue
                    print(("One of the recommended ring having length of %d"
                           % len(ring)))
                    os.makedirs("sofalog/sofa_hints/", exist_ok=True)
                    with open("sofalog/sofa_hints/xring_order.txt", "w") as f:
                        f.write('export CUDA_VISIBLE_DEVICES=' +
                                ','.join(map(str, ring)))
                    break

    # The CPU and vmstat traces are loaded independently so that a missing
    # vmstat file is neither reported as a missing cputrace.csv nor allowed
    # to skip the CPU/network profilers.
    try:
        df_cpu = pd.read_csv(filein_cpu)
        cpu_profile(logdir, cfg, df_cpu)
        net_profile(logdir, cfg, df_cpu)
    except IOError:
        print_warning("cputrace.csv is not found")

    try:
        df_vmstat = pd.read_csv(filein_vmstat)
        vmstat_profile(logdir, cfg, df_vmstat)
    except IOError:
        print_warning("vmstat_trace.csv is not found")

    try:
        df_gpu = pd.read_csv(filein_gpu)
        gpu_profile(logdir, cfg, df_gpu)
        if cfg.enable_aisi:
            sofa_aisi(logdir, cfg, df_cpu, df_gpu)
    except IOError:
        print_warning(
            "gputrace.csv is not found. If there is no need to profile GPU, just ignore it."
        )