def Setup(tc):
    """Prepare per-testcase state for the RDMA driver LIF reset test.

    Records the Naples host names and the OS of the first host on ``tc``,
    creates the empty ``stg1``/``stg2`` dictionaries, and collects every
    (node, interface) pair into ``tc.intfs``.  On Linux it additionally
    caches the PCI device of every interface in ``tc.pci`` (the value is
    ``None`` when no device was found).

    Returns:
        api.types.status.IGNORED when the OS is neither BSD nor Linux,
        api.types.status.SUCCESS otherwise.
    """
    api.Logger.info("RDMA Driver LIF Reset")

    tc.nodes = api.GetNaplesHostnames()
    tc.os = api.GetNodeOs(tc.nodes[0])

    supported_os = (host.OS_TYPE_BSD, host.OS_TYPE_LINUX)
    if tc.os not in supported_os:
        api.Logger.info("Not implemented")
        return api.types.status.IGNORED

    tc.stg1 = {}
    tc.stg2 = {}
    tc.intfs = [
        (node, intf)
        for node in tc.nodes
        for intf in api.GetNaplesHostInterfaces(node)
    ]

    if tc.os != host.OS_TYPE_LINUX:
        return api.types.status.SUCCESS

    # Cache the PCI lookups once here to save time later.
    tc.pci = {}
    for node in tc.nodes:
        for intf in api.GetNaplesHostInterfaces(node):
            pci_dev = host.GetNaplesPci(node, intf)
            if pci_dev is None:
                api.Logger.warn("%s %s couldn't find PCI device" % (
                                node, intf))
            # A missing device is still recorded (as None).
            tc.pci[(node, intf)] = pci_dev

    return api.types.status.SUCCESS
def Trigger(tc):
    """Check that event-queue interrupts are consistent with eth_eq_count.

    Only the first Naples host is examined, and only when it runs Linux
    (any other OS returns SUCCESS immediately).  For each host interface
    two commands are run on the host:

    * ``awk`` on the ionic debugfs ``identity`` file to read ``eth_eq_count``
    * ``grep -c`` on ``/proc/interrupts`` to count the ``ionic-<pci>-eq``
      interrupt lines

    The check fails when exactly one of the two values is zero.

    Returns:
        api.types.status.SUCCESS when every interface is consistent,
        api.types.status.FAILURE when the PCI device cannot be found, a
        command fails, its output is not an integer, or the two values
        disagree.
    """
    names = api.GetNaplesHostnames()
    hostname = names[0]
    if api.GetNodeOs(hostname) != host.OS_TYPE_LINUX:
        return api.types.status.SUCCESS

    for intf in api.GetNaplesHostInterfaces(hostname):
        api.Logger.info("Checking event queue use on host %s interface %s"
                        % (hostname, intf))

        pci = host.GetNaplesPci(hostname, intf)
        if pci is None:
            return api.types.status.FAILURE

        # get eth_eq_count and number of eq interrupts
        req = api.Trigger_CreateExecuteCommandsRequest(serial=True)
        cmd = "awk '/eth_eq_count/ {print $2}' < /sys/kernel/debug/ionic/" + pci + "/identity"
        api.Trigger_AddHostCommand(req, hostname, cmd)
        cmd = "grep -c -e 'ionic-" + pci + "-eq' /proc/interrupts"
        api.Trigger_AddHostCommand(req, hostname, cmd)

        resp = api.Trigger(req)
        if resp is None or len(resp.commands) < 2:
            api.Logger.error("Failed to get values from host %s interface %s"
                             % (hostname, intf))
            return api.types.status.FAILURE

        # pop() removes from the end, so the results come back in reverse
        # order of submission: the grep (interrupt count) first, then the
        # awk (eth_eq_count).  This assumes resp.commands keeps the order in
        # which the commands were added to the request.
        intr_cmd = resp.commands.pop()
        if intr_cmd.exit_code > 1:
            # exit code 1 from grep is "string not found", which is a valid answer here
            api.Logger.error(
                "Failed to get interrupt count from host %s interface %s"
                % (hostname, intf))
            api.PrintCommandResults(intr_cmd)
            return api.types.status.FAILURE
        try:
            intr_count = int(intr_cmd.stdout.strip())
        except ValueError:
            api.Logger.error(
                "Failed to get interrupt count from host %s interface %s"
                % (hostname, intf))
            api.PrintCommandResults(intr_cmd)
            return api.types.status.FAILURE

        eq_cmd = resp.commands.pop()
        if eq_cmd.exit_code != 0:
            api.Logger.error(
                "Failed to get eth_eq_count from host %s interface %s"
                % (hostname, intf))
            api.PrintCommandResults(eq_cmd)
            return api.types.status.FAILURE
        try:
            # awk prints nothing (and still exits 0) when the identity file
            # has no eth_eq_count line, so the output may not be a number.
            eth_eq_count = int(eq_cmd.stdout.strip())
        except ValueError:
            api.Logger.error(
                "Failed to get eth_eq_count from host %s interface %s"
                % (hostname, intf))
            api.PrintCommandResults(eq_cmd)
            return api.types.status.FAILURE

        api.Logger.info(
            "Found eth_eq_count %d and interrupt count %d from host %s interface %s"
            % (eth_eq_count, intr_count, hostname, intf))

        if eth_eq_count == 0 and intr_count != 0:
            api.Logger.error("eq interrupts found when eth_eq_count == 0")
            return api.types.status.FAILURE
        elif eth_eq_count != 0 and intr_count == 0:
            api.Logger.error("No eq interrupts found when eth_eq_count != 0")
            return api.types.status.FAILURE

    return api.types.status.SUCCESS
def grep_qps(tc):
    """Print the qpid lines of the RDMA queue pairs on every host interface.

    For each interface of each node in ``tc.nodes`` one host command is
    queued: on Linux a ``grep`` over the ionic debugfs qp ``info`` files
    (interfaces without a PCI device are skipped), otherwise a ``sysctl``
    query piped through ``grep``.  The results of all commands are printed
    with ``api.PrintCommandResults``.

    Returns:
        None.  When the trigger yields no response an error is logged and
        nothing is printed.
    """
    req = api.Trigger_CreateExecuteCommandsRequest(serial=True)

    for n in tc.nodes:
        for intf in api.GetNaplesHostInterfaces(n):
            if tc.os == host.OS_TYPE_LINUX:
                pci = host.GetNaplesPci(n, intf)
                if pci is None:
                    continue
                cmd = ("grep qpid /sys/kernel/debug/ionic/{}/lif0/rdma/qp/*/info".
                       format(pci))
            else:
                sysctl = host.GetNaplesSysctl(intf)
                cmd = ("sysctl dev.{}.rdma.qp | grep qpid".format(sysctl))
            api.Trigger_AddHostCommand(req, n, cmd)

    resp = api.Trigger(req)
    if resp is None:
        # api.Trigger() can come back empty; do not dereference it.
        api.Logger.error("Failed to get qpid info from hosts")
        return

    for cmd in resp.commands:
        api.PrintCommandResults(cmd)
def Trigger(tc):
    """Queue the QoS traffic tests (iperf and RDMA) and trigger them.

    Reads from ``tc.args``:
        class_type     mandatory, must convert to 1 or 2
        num_iperf_cps  number of entries of ``iperf_cps`` to run (default 0)
        iperf_cps      indexable collection of iperf class parameters
        num_rdma_cps   number of entries of ``rdma_cps`` to run (default 0)
        rdma_cps       indexable collection of RDMA class parameters

    Sets ``tc.server_idx``, ``tc.client_idx``, ``tc.class_type`` and
    ``tc.resp``; appends the non-traffic commands to ``tc.cmd_cookies``.

    Returns:
        api.types.status.DISABLED when the OS is not 'freebsd' or neither of
        the first two workloads is Naples, api.types.status.FAILURE on bad
        arguments, api.types.status.SUCCESS once the request was triggered.
    """
    #==============================================================
    # build the command request
    #==============================================================
    req = api.Trigger_CreateExecuteCommandsRequest(serial = True)

    if tc.os != 'freebsd':
        api.Logger.info("Not FreeBSD - unsupported configuration")
        return api.types.status.DISABLED

    first_is_naples = tc.w[0].IsNaples()
    second_is_naples = tc.w[1].IsNaples()

    if not (first_is_naples or second_is_naples):
        api.Logger.info("No naples - unsupported configuration")
        return api.types.status.DISABLED

    # Only the "first is Naples, second is not" pairing swaps the roles.
    if first_is_naples and not second_is_naples:
        tc.server_idx, tc.client_idx = 3, 2
    else:
        tc.server_idx, tc.client_idx = 2, 3

    # w1 is always a Naples workload; w2 is the other one of the pair.
    if first_is_naples:
        w1, w2 = tc.w[0], tc.w[1]
    else:
        w1, w2 = tc.w[1], tc.w[0]

    ws = tc.w[tc.server_idx]
    wc = tc.w[tc.client_idx]

    if not hasattr(tc.args, 'class_type'):
        api.Logger.error("mandatory argument class_type not passed")
        return api.types.status.FAILURE

    tc.class_type = int(tc.args.class_type)
    if tc.class_type not in (1, 2):
        api.Logger.error("invalid class_type passed: {}".format(tc.class_type))
        return api.types.status.FAILURE

    num_rdma_cps = getattr(tc.args, 'num_rdma_cps', 0)
    rdma_cps = getattr(tc.args, 'rdma_cps', None)
    if rdma_cps is None and num_rdma_cps != 0:
        api.Logger.error("num_rdma_cps is non zero but no rdma cps passed")
        return api.types.status.FAILURE

    num_iperf_cps = getattr(tc.args, 'num_iperf_cps', 0)
    iperf_cps = getattr(tc.args, 'iperf_cps', None)
    if iperf_cps is None and num_iperf_cps != 0:
        api.Logger.error("num_iperf_cps is non zero but no iperf cps passed")
        return api.types.status.FAILURE

    # iperf tests go first, every one of them in the background.
    for idx in range(num_iperf_cps):
        qos.TriggerTrafficTest(req, tc, ws, wc, 2, iperf_cps[idx], True)

    # RDMA tests follow: all but the last in the background ...
    last_rdma = num_rdma_cps - 1
    for idx in range(last_rdma):
        qos.TriggerTrafficTest(req, tc, ws, wc, 1, rdma_cps[idx], True)

    # ... and the last one in the foreground.
    if num_rdma_cps != 0:
        qos.TriggerTrafficTest(req, tc, ws, wc, 1, rdma_cps[last_rdma], False)

    # print the next_qpid of every Naples workload
    for wl in (w1, w2):
        if not wl.IsNaples():
            continue

        if tc.os == host.OS_TYPE_BSD:
            api.Logger.info("{}".format(wl.interface))
            qpid_cmd = 'sysctl dev.' + host.GetNaplesSysctl(wl.interface) + '.rdma.info.next_qpid'
        elif tc.os == host.OS_TYPE_LINUX:
            pci = host.GetNaplesPci(wl.node_name, wl.interface)
            if pci is None:
                continue
            qpid_cmd = 'grep next_qpid /sys/kernel/debug/ionic/' + pci + '/lif0/rdma/info'
        else:
            continue

        api.Trigger_AddCommand(req, wl.node_name, wl.workload_name, qpid_cmd)
        tc.cmd_cookies.append(qpid_cmd)

    # Give the background tests a little time to finish.
    sleep_cmd = 'sleep 5'
    api.Trigger_AddCommand(req, ws.node_name, ws.workload_name, sleep_cmd)
    tc.cmd_cookies.append(sleep_cmd)

    #==============================================================
    # trigger the request
    #==============================================================
    trig_resp = api.Trigger(req)
    term_resp = api.Trigger_TerminateAllCommands(trig_resp)
    tc.resp = api.Trigger_AggregateCommandsResponse(trig_resp, term_resp)

    return api.types.status.SUCCESS
def Trigger(tc):
    """Build and run the server/client benchmark commands for one iteration.

    The command line is ``tc.iterators.command`` (presumably one of the
    perftest tools -- TODO confirm) followed by options derived from the
    attributes of ``tc.iterators``: duration, count, rdma_cm, transport,
    size, mtu, numsges, num_qp, threads, server, bidir, rxdepth, txdepth,
    cmp_swp, enable_dcqcn, sq_drain, async_event_stats, check_bw, port_flap
    and tcpdump.  One server and one client command is queued per port in
    ``[s_port, e_port)``; with ``threads`` > 1 that is one pair per thread.

    All commands are appended to a single serial request, so the order in
    which they are added below is the order in which they are issued.

    Side effects on ``tc``: sets ``client_bkg``, ``tcpdump``, ``cmd_descr``
    and ``resp``.  Reads ``tc.w``, ``tc.devices``, ``tc.gid``,
    ``tc.ib_prefix`` and ``tc.os``.

    Returns:
        api.types.status.SUCCESS once the request has been triggered.
    """

    #==============================================================
    # trigger the commands
    #==============================================================
    req = api.Trigger_CreateExecuteCommandsRequest(serial=True)

    # Populate bw lookup table - manual entry to speed up development
    # Keys are (numsges, msg_size); the value is divided by the thread count
    # and passed with the '-w' option when check_bw is requested.
    bw_dict = {}
    bw_dict[(1, 4096)] = 10
    bw_dict[(1, 8192)] = 10
    bw_dict[(1, 65536)] = 50
    bw_dict[(2, 4000)] = 10
    bw_dict[(2, 4096)] = 10
    bw_dict[(2, 8192)] = 10
    bw_dict[(2, 16384)] = 10
    bw_dict[(2, 32768)] = 30
    bw_dict[(2, 65536)] = 50
    bw_dict[(2, 8000)] = 10
    bw_dict[(2, 16000)] = 10
    bw_dict[(2, 32000)] = 30
    bw_dict[(2, 64000)] = 50
    bw_dict[(3, 4095)] = 5
    bw_dict[(3, 3072)] = 5
    bw_dict[(3, 3000)] = 5
    bw_dict[(3, 12288)] = 10
    bw_dict[(3, 24576)] = 20
    bw_dict[(3, 12000)] = 10
    bw_dict[(3, 24000)] = 20
    bw_dict[(4, 4000)] = 5
    bw_dict[(4, 4096)] = 5
    bw_dict[(4, 8192)] = 10
    bw_dict[(4, 16384)] = 10
    bw_dict[(4, 32768)] = 30
    bw_dict[(4, 65536)] = 50
    bw_dict[(4, 16000)] = 10
    bw_dict[(4, 32000)] = 30
    bw_dict[(4, 64000)] = 50
    bw_dict[(5, 20480)] = 20
    bw_dict[(5, 20000)] = 10
    bw_dict[(5, 10000)] = 5
    bw_dict[(6, 12288)] = 10
    bw_dict[(6, 24576)] = 20
    bw_dict[(6, 24000)] = 20
    bw_dict[(7, 28672)] = 20
    bw_dict[(7, 28000)] = 30
    bw_dict[(7, 7700)] = 4
    bw_dict[(8, 16384)] = 5
    bw_dict[(8, 32768)] = 10
    bw_dict[(8, 65536)] = 10
    bw_dict[(8, 32000)] = 10
    bw_dict[(8, 64000)] = 10

    #==============================================================
    # init cmd options
    #==============================================================
    iter_opt = ' -n 10 '
    misc_opt = ' -F --report_gbits '
    cm_opt = ''
    enable_dcqcn = False
    transport_opt = ''
    msg_size = 65536
    size_opt = ' -a '
    mtu_opt = ' -m 4096 '
    qp_opt = ''
    numsges_opt = ''
    bidir_opt = ''
    rxdepth_opt = ''
    txdepth_opt = ''
    atomic_opt = ''
    tc.client_bkg = False
    s_port = 12340
    e_port = s_port + 1
    server_idx = 0
    client_idx = 1
    bkg_timeout = 130
    sq_drain_opt = ''
    async_event_stats_opt = ''
    bw_opt = ''
    port_flap = False
    tc.tcpdump = False

    #==============================================================
    # update non-default cmd options
    #==============================================================
    # if use both duration '-D' and count '-n', count will take precedence
    if hasattr(tc.iterators, 'duration'):
        iter_opt = ' -D {} '.format(tc.iterators.duration)
        # For scale tests, we noticed all 8 threads not started early,
        # so need to give extra timeout
        bkg_timeout = tc.iterators.duration + 60

    if hasattr(tc.iterators, 'count'):
        iter_opt = ' -n {} '.format(tc.iterators.count)

    if getattr(tc.iterators, 'rdma_cm', None) == 'yes':
        cm_opt = ' -R '

    if getattr(tc.iterators, 'transport', None) == 'UD':
        transport_opt = ' -c UD '

    if hasattr(tc.iterators, 'size'):
        msg_size = int(tc.iterators.size)
        size_opt = ' -s {} '.format(msg_size)

    if hasattr(tc.iterators, 'mtu'):
        mtu_opt = ' -m {} '.format(tc.iterators.mtu)

    numsges = getattr(tc.iterators, 'numsges', 1)
    if numsges > 1:
        numsges_opt = ' -W {} '.format(numsges)

    num_qp = getattr(tc.iterators, 'num_qp', 1)
    if num_qp > 1:
        qp_opt = ' -q {} '.format(num_qp)

    # More than one thread: one server/client pair per port, and the clients
    # run in the background.
    num_threads = getattr(tc.iterators, 'threads', 1)
    if num_threads > 1:
        tc.client_bkg = True
        e_port = s_port + tc.iterators.threads

    # server == 'no' swaps which workload of tc.w acts as server and client.
    if getattr(tc.iterators, 'server', None) == 'no':
        server_idx = 1
        client_idx = 0

    if getattr(tc.iterators, 'bidir', None) == 'yes':
        bidir_opt = ' -b '

    if hasattr(tc.iterators, 'rxdepth'):
        rxdepth_opt = ' -r {} '.format(tc.iterators.rxdepth)

    if hasattr(tc.iterators, 'txdepth'):
        txdepth_opt = ' -t {} '.format(tc.iterators.txdepth)

    if getattr(tc.iterators, 'cmp_swp', None) == 'yes':
        atomic_opt = ' -A CMP_AND_SWAP '

    if getattr(tc.iterators, 'enable_dcqcn', None) == 'yes':
        enable_dcqcn = True

    if getattr(tc.iterators, 'sq_drain', None) == 'yes':
        sq_drain_opt = ' --sq-drain '

    if getattr(tc.iterators, 'async_event_stats', None) == 'yes':
        async_event_stats_opt = ' --report-async-ev-stats '

    # The bandwidth option is only set for single-QP runs whose
    # (numsges, msg_size) combination has an entry in the lookup table.
    if getattr(tc.iterators, 'check_bw', None) == 'yes' and \
       num_qp == 1 and \
       (numsges, msg_size) in bw_dict:
        bw_opt = ' -w {} '.format(
            math.ceil(bw_dict[(numsges, msg_size)] / num_threads))

    # Port flap is only honoured for duration based runs; note that this
    # iterator is compared against 'true', not 'yes' like the others.
    if getattr(tc.iterators, 'port_flap', None) == 'true' and \
       hasattr(tc.iterators, 'duration'):
        port_flap = True
        tc.client_bkg = True

    # Packet capture is only enabled for count based runs, and it overrides
    # the iteration option to ' -n 5 '.
    if getattr(tc.iterators, 'tcpdump', None) == 'yes' and \
       not hasattr(tc.iterators, 'duration'):
        tc.tcpdump = True
        iter_opt = ' -n 5 '

    #==============================================================
    # run the cmds
    #==============================================================
    w1 = tc.w[server_idx]
    w2 = tc.w[client_idx]

    tc.cmd_descr = "Server: %s(%s) <--> Client: %s(%s)" %\
                   (w1.workload_name, w1.ip_address, w2.workload_name, w2.ip_address)

    api.Logger.info("Starting %s test from %s" % (tc.iterators.command, tc.cmd_descr))

    # Enable rdma sniffer and start tcpdump on Naples Hosts
    if tc.tcpdump == True:
        for w in [w1, w2]:
            if not w.IsNaples():
                continue

            tcpdump_intf = w.interface.split('.')[
                0]  # Get the parent interface
            tcpdump_cmd = "sudo tcpdump -l --immediate-mode -i {} -XXX udp dst port 4791 -w rdma_capture.pcap &".format(
                tcpdump_intf)

            if tc.os == host.OS_TYPE_BSD:
                sniffer_cmd = 'sysctl dev.' + host.GetNaplesSysctl(
                    w.interface) + '.rdma_sniffer=1'
            elif tc.os == host.OS_TYPE_LINUX:
                sniffer_cmd = 'sudo ethtool --set-priv-flags ' + tcpdump_intf + ' rdma-sniffer on'
            else:
                continue

            api.Trigger_AddCommand(req, w.node_name, w.workload_name,
                                   sniffer_cmd)
            api.Trigger_AddCommand(req,
                                   w.node_name,
                                   w.workload_name,
                                   tcpdump_cmd,
                                   background=True)

    # Turn on dcqcn match_default on every Naples workload; it is turned off
    # again at the end of this function.
    if enable_dcqcn == True:
        for w in [w1, w2]:
            if not w.IsNaples():
                continue

            if tc.os == host.OS_TYPE_BSD:
                cmd = 'sysctl sys.class.infiniband.' + host.GetNaplesSysClassSysctl(
                    w.interface) + '.dcqcn.match_default="1"'
            elif tc.os == host.OS_TYPE_LINUX:
                cmd = 'echo 1 > /sys/class/infiniband/' + host.GetNaplesSysClassSysctl(
                    w.interface) + '/dcqcn/match_default'
            else:
                continue

            api.Trigger_AddCommand(req,
                                   w.node_name,
                                   w.workload_name,
                                   cmd,
                                   timeout=120)

    #==============================================================
    # cmd for server
    #==============================================================
    for p in range(s_port, e_port):
        port_opt = ' -p {} '.format(p)
        dev_opt = ' -d {} '.format(tc.devices[server_idx])
        gid_opt = ' -x {} '.format(tc.gid[server_idx])

        cmd = tc.iterators.command
        cmd += dev_opt + iter_opt + gid_opt
        cmd += size_opt + mtu_opt + qp_opt
        # NOTE(review): bw_opt is appended to the server command only; the
        # client command below does not include it -- confirm this is
        # intentional.
        cmd += cm_opt + transport_opt + misc_opt + port_opt + bidir_opt + rxdepth_opt + txdepth_opt + atomic_opt + bw_opt
        # add numsges_opt only for Naples
        if w1.IsNaples():
            cmd += numsges_opt

        api.Trigger_AddCommand(req,
                               w1.node_name,
                               w1.workload_name,
                               tc.ib_prefix[server_idx] + cmd,
                               background=True,
                               timeout=120)

    # On Naples-Mellanox setups, with Mellanox as server, it takes a few seconds before the server
    # starts listening. So sleep for a few seconds before trying to start the client
    cmd = 'sleep 2'
    api.Trigger_AddCommand(req, w2.node_name, w2.workload_name, cmd)

    #==============================================================
    # cmd for client
    #==============================================================
    for p in range(s_port, e_port):
        port_opt = ' -p {} '.format(p)
        dev_opt = ' -d {} '.format(tc.devices[client_idx])
        gid_opt = ' -x {} '.format(tc.gid[client_idx])

        cmd = tc.iterators.command
        cmd += dev_opt + iter_opt + gid_opt
        cmd += size_opt + mtu_opt + qp_opt
        cmd += cm_opt + transport_opt + misc_opt + port_opt + bidir_opt + rxdepth_opt + txdepth_opt + atomic_opt
        # add numsges_opt (plus the sq-drain and async-event-stats options)
        # only for Naples
        if w2.IsNaples():
            cmd += numsges_opt + sq_drain_opt + async_event_stats_opt
        # append server's ip_address
        cmd += w1.ip_address

        api.Trigger_AddCommand(
            req,
            w2.node_name,
            w2.workload_name,
            tc.ib_prefix[client_idx] + cmd,
            background=tc.client_bkg,
            timeout=125)  #5 secs more than def test timeout=120

    #Do the port flap only for duration tests
    if hasattr(tc.iterators, 'duration') and port_flap == True:
        # One flap cycle below takes about 20 seconds (sleep 1 + sleep 20).
        num_flaps = int(getattr(tc.iterators, 'duration')) // 20
        num_flaps = num_flaps - 2  #Reduce the number of flaps so that we don't flap during connection close

        export_path_cmd = "export PATH=$PATH:/platform/bin:/nic/bin:/platform/tools:/nic/tools"
        export_ld_path_cmd = "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/platform/lib:/nic/lib"
        port_down_cmd = "/nic/bin/halctl debug port --port 1 --admin-state down"
        port_up_cmd = "/nic/bin/halctl debug port --port 1 --admin-state up"

        #Sleep for 10 to make sure that we don't flap during connection create
        cmd = 'sleep 10'
        api.Trigger_AddCommand(req, w1.node_name, w1.workload_name, cmd, timeout=20)

        for i in range(num_flaps):
            api.Trigger_AddNaplesCommand(req, w1.node_name, export_path_cmd)
            api.Trigger_AddNaplesCommand(req, w2.node_name, export_path_cmd)
            api.Trigger_AddNaplesCommand(req, w1.node_name, export_ld_path_cmd)
            api.Trigger_AddNaplesCommand(req, w2.node_name, export_ld_path_cmd)
            api.Trigger_AddNaplesCommand(req, w1.node_name, port_down_cmd)
            api.Trigger_AddNaplesCommand(req, w2.node_name, port_down_cmd)
            api.Trigger_AddNaplesCommand(req, w2.node_name, "sleep 1")
            api.Trigger_AddNaplesCommand(req, w1.node_name, port_up_cmd)
            api.Trigger_AddNaplesCommand(req, w2.node_name, port_up_cmd)
            api.Trigger_AddNaplesCommand(req, w2.node_name, "sleep 20")

        #Sleep to let the tests complete before Terminating
        cmd = 'sleep 30'
        api.Trigger_AddCommand(req, w1.node_name, w1.workload_name, cmd, timeout=40)

    if tc.client_bkg and port_flap == False:
        # since the client is running in the background, sleep for
        # bkg_timeout secs (130 by default, duration + 60 for duration tests)
        # to allow the test to complete before verifying the result
        # override default timeout to bkg_timeout + 5, slightly above the
        # sleep duration
        cmd = 'sleep ' + str(bkg_timeout)
        api.Trigger_AddCommand(req, w1.node_name, w1.workload_name, cmd,
                               timeout=(bkg_timeout + 5))

    # try to kill lingering processes
    for w in [w1, w2]:
        if not w.IsNaples():
            continue

        cmd = 'killall ' + tc.iterators.command
        api.Trigger_AddCommand(req, w.node_name, w.workload_name, cmd,
                               timeout=(bkg_timeout + 5))

    # print the next_qpid
    for w in [w1, w2]:
        if not w.IsNaples():
            continue

        if tc.os == host.OS_TYPE_BSD:
            cmd = 'sysctl dev.' + host.GetNaplesSysctl(
                w.interface) + '.rdma.info.next_qpid'
        elif tc.os == host.OS_TYPE_LINUX:
            pci = host.GetNaplesPci(w.node_name, w.interface)
            if pci is None:
                continue
            cmd = 'grep next_qpid /sys/kernel/debug/ionic/' + pci + '/lif0/rdma/info'
        else:
            continue

        api.Trigger_AddCommand(req, w.node_name, w.workload_name, cmd,
                               timeout=(bkg_timeout + 5))

    # Stop the capture started above and dump the selected fields from it.
    if tc.tcpdump == True:
        api.Trigger_AddCommand(req, w1.node_name, w1.workload_name, "sleep 5")

        tshark_cmd = "sudo tshark -r rdma_capture.pcap -T fields -e ip.addr -e infiniband.bth.opcode -e infiniband.aeth.msn"
        for w in [w1, w2]:
            if not w.IsNaples():
                continue

            api.Trigger_AddCommand(req, w.node_name, w.workload_name,
                                   "sudo killall tcpdump")
            api.Trigger_AddCommand(req, w.node_name, w.workload_name,
                                   tshark_cmd, timeout=60)

    #if dcqcn was enabled, disable it at the end of the test
    if enable_dcqcn == True:
        for w in [w1, w2]:
            if not w.IsNaples():
                continue

            if tc.os == host.OS_TYPE_BSD:
                cmd = 'sysctl sys.class.infiniband.' + host.GetNaplesSysClassSysctl(
                    w.interface) + '.dcqcn.match_default="0"'
            elif tc.os == host.OS_TYPE_LINUX:
                cmd = 'echo 0 > /sys/class/infiniband/' + host.GetNaplesSysClassSysctl(
                    w.interface) + '/dcqcn/match_default'
            else:
                continue

            api.Trigger_AddCommand(req,
                                   w.node_name,
                                   w.workload_name,
                                   cmd,
                                   timeout=120)

    #==============================================================
    # trigger the request
    #==============================================================
    trig_resp = api.Trigger(req)
    term_resp = api.Trigger_TerminateAllCommands(trig_resp)
    tc.resp = api.Trigger_AggregateCommandsResponse(trig_resp, term_resp)

    return api.types.status.SUCCESS