def set_mux_side(tbinfo, mux_server_url, side):
    """Request a new active side from the mux server on dualtor testbeds.

    Args:
        tbinfo (dict): Testbed info; only ``tbinfo['topo']['name']`` is read.
        mux_server_url (str): URL the request is POSTed to.
        side: Value sent as ``active_side`` in the JSON request body.

    Returns:
        dict: The decoded JSON response body when the topology name contains
            ``dualtor``; an empty dict otherwise (no request is sent).
    """
    if 'dualtor' not in tbinfo['topo']['name']:
        return {}

    response = requests.post(mux_server_url, json={"active_side": side})
    pt_assert(response.status_code == 200,
              'Failed to set active side: {}'.format(response.text))
    # Response is new mux_status of all mux Y-cables.
    return response.json()
def update_mux_configs_and_config_reload(dut, state):
    """Set the state of every MUX_CABLE entry, then load it with 'config reload'.

    Please note that this is a general method, and caller must backup
    config_db.json and do a restore at the end.

    The updated MUX_CABLE table is pushed to the DUT with ``copy(content=...)``
    so no scratch file is left on the machine running the test, and the
    temporary file on the DUT is removed even when loading or reloading fails.

    Args:
        dut: The DUT we are testing against.
        state (str): One of ``auto``, ``active`` or ``standby``.
    """
    STATE_LIST = ['auto', 'active', 'standby']
    pt_assert(state in STATE_LIST, "state should be one of {}".format(STATE_LIST))

    mux_cable_config = dut.shell("sonic-cfggen -d --var-json 'MUX_CABLE'")['stdout']
    pt_assert(len(mux_cable_config.strip()) != 0,
              "No mux_cable configuration is found in config_db")

    # Update mux_cable state of every entry
    mux_cable_config_json = json.loads(mux_cable_config)
    for config in mux_cable_config_json.values():
        config['state'] = state
    mux_cable_config_json = {"MUX_CABLE": mux_cable_config_json}

    TMP_FILE = "/tmp/mux_config.json"
    dut.copy(content=json.dumps(mux_cable_config_json), dest=TMP_FILE)
    try:
        # Load updated mux_cable config with sonic-cfggen
        cmds = [
            "sonic-cfggen -j {} -w".format(TMP_FILE),
            "config save -y"
        ]
        dut.shell_cmds(cmds=cmds)
        config_reload(dut)
    finally:
        # Always clean up the temporary file on the DUT.
        dut.file(path=TMP_FILE, state='absent')
def setup_ptf_base(self, input_list=None):
    """Configure the PTF side: interface IPs, gobgp instances and neighbors.

    For every item, the gobgp config is copied, the PTF interface IP is set
    and a gobgp instance is started. Then the method waits for the gobgpd
    instances to be present, adds the DUT as neighbor on each instance and
    waits for the neighbors to be reported by ``check_gobgpd_neighbor``.

    Args:
        input_list (list, optional): Items providing ``if_index``, ``ip_ptf``,
            ``ip_dut``, ``as_number_ptf`` and ``gobgp_port``. Defaults to
            ``self.vtep_param_list``.

    Raises:
        Whatever ``pytest.fail`` raises, after ``teardown_ptf_base`` has run,
        when an ``Exception`` occurs while waiting for gobgp or adding
        neighbors.
    """
    if input_list is None:
        input_list = self.vtep_param_list

    for item in input_list:
        ifname = "eth{}".format(item.if_index)
        ip_ptf = item.ip_ptf
        as_number = item.as_number_ptf
        gobgp_port = item.gobgp_port
        self.ptf_helper.copy_gobgp_config(as_number=as_number, ip=str(ip_ptf.ip))
        self.ptf_helper.set_ip(ifname=ifname, ip_mask=str(ip_ptf))
        self.gobgp_helper.start(as_number=as_number, gobgp_port=gobgp_port)

    try:
        gobgp_port_list = [item.gobgp_port for item in input_list]
        pt_assert(
            wait_until(30, 5, self.gobgp_helper.check_gobgpd_present_status, gobgp_port_list))

        for item in input_list:
            ip_neighbor = item.ip_dut
            gobgp_port = item.gobgp_port
            self.gobgp_helper.add_neighbor(neighbor_ip=str(ip_neighbor.ip), gobgp_port=gobgp_port)

        pt_assert(
            wait_until(100, 10, self.gobgp_helper.check_gobgpd_neighbor, gobgp_port_list),
            "gobgp neighbor cannot establish")
    except Exception as e:
        # NOTE(review): if pt_assert reports failures through pytest.fail, the
        # raised outcome may not derive from Exception and would bypass this
        # cleanup -- confirm against pt_assert's implementation.
        self.teardown_ptf_base()
        # pytest.fail expects a string reason, not an exception instance.
        pytest.fail(str(e))
def _services_env_stop_check(duthost):
    """
    Checks if services that impact sai-test have been stopped.

    Every poll lists the containers from SERVICES_LIST that are still
    running and issues ``docker stop`` for each of them. The check fails if
    some container is still running when polling gives up.

    Args:
        duthost (SonicHost): The target device.
    """
    # Shared with the nested poller below. It is mutated in place (instead of
    # being rebound) so the outer scope sees the result of the last poll.
    running_services = []

    def ready_for_sai_test():
        del running_services[:]
        for service in SERVICES_LIST:
            if _is_container_running(duthost, service):
                running_services.append(service)
                logger.info(
                    "Docker {} is still running, try to stop it.".format(service))
                duthost.shell("docker stop {}".format(service))
        return not running_services

    shutdown_check = wait_until(20, 4, 0, ready_for_sai_test)
    pt_assert(
        shutdown_check,
        "Docker {} failed to shut down in 20s".format(','.join(running_services)))
def start_sai_test_conatiner_with_retry(duthost, container_name):
    """
    Attempts to start a sai test container with retry.

    Nothing is started when the RPC server already answers on the DUT.
    Otherwise the container restart is polled until it reports ready, and
    the function then sleeps for the configured warm-up time.

    Args:
        duthost (SonicHost): The target device.
        container_name: The container name for sai testing on DUT.
    """
    inventory_host = duthost.host.options['inventory_manager'].get_host(duthost.hostname)
    dut_ip = inventory_host.vars['ansible_host']

    logger.info(
        "Checking the PRC connection before starting the {}.".format(container_name))
    if wait_until(1, 1, 0, _is_rpc_server_ready, dut_ip):
        logger.info(
            "PRC connection already set up before starting the {}.".format(container_name))
        return

    logger.info("Attempting to start {}.".format(container_name))
    sai_ready = wait_until(SAI_TEST_CTNR_CHECK_TIMEOUT_IN_SEC,
                           SAI_TEST_CTNR_RESTART_INTERVAL_IN_SEC,
                           0,
                           _is_sai_test_container_restarted,
                           duthost,
                           container_name)
    failure_message = "[{}] sai test container failed to start in {}s".format(
        container_name, SAI_TEST_CTNR_CHECK_TIMEOUT_IN_SEC)
    pt_assert(sai_ready, failure_message)

    logger.info(
        "Waiting for another {} second for sai test container warm up.".format(
            SAI_TEST_CONTAINER_WARM_UP_IN_SEC))
    time.sleep(SAI_TEST_CONTAINER_WARM_UP_IN_SEC)
    logger.info("Successful in starting {} at : {}:{}".format(
        container_name, dut_ip, SAI_PRC_PORT))
def bgpmon_setup_teardown(ptfhost, duthost, localhost, setup_interfaces):
    """Set up a BGP monitor session between DUT and PTF, and remove it afterwards.

    Setup renders the bgpmon template into a config file on the DUT, loads it
    with sonic-cfggen, starts an exabgp instance on the PTF and adds the
    neighbor/route entries needed to reach the DUT loopback address. After the
    single ``yield`` everything added during setup is removed again.

    Args:
        ptfhost: PTF host object.
        duthost: DUT host object.
        localhost: Host object used to probe the PTF TCP port.
        setup_interfaces: Sequence whose first item is a dict providing
            ``loopback_ip``, ``neighbor_addr`` and ``neighbor_intf``.
    """
    connection = setup_interfaces[0]
    dut_lo_addr = connection["loopback_ip"].split("/")[0]
    peer_addr = connection['neighbor_addr'].split("/")[0]
    mg_facts = duthost.minigraph_facts(host=duthost.hostname)['ansible_facts']
    asn = mg_facts['minigraph_bgp_asn']
    # TODO: Add a common method to load BGPMON config for test_bgpmon and test_traffic_shift
    logger.info("Configuring bgp monitor session on DUT")
    bgpmon_args = {
        'db_table_name': 'BGP_MONITORS',
        'peer_addr': peer_addr,
        'asn': asn,
        'local_addr': dut_lo_addr,
        'peer_name': BGP_MONITOR_NAME
    }
    # Read the template through a context manager so the handle is closed.
    with open(BGPMON_TEMPLATE_FILE) as template_file:
        bgpmon_template = Template(template_file.read())
    duthost.copy(content=bgpmon_template.render(**bgpmon_args),
                 dest=BGPMON_CONFIG_FILE)
    # Start bgpmon on DUT
    logger.info("Starting bgpmon on DUT")
    duthost.command("sonic-cfggen -j {} -w".format(BGPMON_CONFIG_FILE))

    logger.info("Starting bgp monitor session on PTF")
    ptfhost.file(path=DUMP_FILE, state="absent")
    ptfhost.copy(src=CUSTOM_DUMP_SCRIPT, dest=CUSTOM_DUMP_SCRIPT_DEST)
    ptfhost.exabgp(name=BGP_MONITOR_NAME,
                   state="started",
                   local_ip=peer_addr,
                   router_id=peer_addr,
                   peer_ip=dut_lo_addr,
                   local_asn=asn,
                   peer_asn=asn,
                   port=BGP_MONITOR_PORT,
                   dump_script=CUSTOM_DUMP_SCRIPT_DEST)
    # Add the route to DUT loopback IP and the interface router mac
    ptfhost.shell("ip neigh add %s lladdr %s dev %s" % (dut_lo_addr, duthost.facts["router_mac"], connection["neighbor_intf"]))
    ptfhost.shell("ip route add %s dev %s" % (dut_lo_addr + "/32", connection["neighbor_intf"]))

    pt_assert(
        wait_tcp_connection(localhost, ptfhost.mgmt_ip, BGP_MONITOR_PORT),
        "Failed to start bgp monitor session on PTF")

    yield

    # Cleanup bgp monitor
    duthost.shell("redis-cli -n 4 -c DEL 'BGP_MONITORS|{}'".format(peer_addr))
    ptfhost.exabgp(name=BGP_MONITOR_NAME, state="absent")
    ptfhost.file(path=CUSTOM_DUMP_SCRIPT_DEST, state="absent")
    ptfhost.file(path=DUMP_FILE, state="absent")
    # Remove the route to DUT loopback IP and the interface router mac
    ptfhost.shell("ip route del %s dev %s" % (dut_lo_addr + "/32", connection["neighbor_intf"]))
    ptfhost.shell("ip neigh del %s lladdr %s dev %s" % (dut_lo_addr, duthost.facts["router_mac"], connection["neighbor_intf"]))
def test_standby_tor_upstream_mux_toggle(rand_selected_dut, tbinfo, ptfadapter, rand_selected_interface,
                                         require_mocked_dualtor, toggle_all_simulator_ports,
                                         set_crm_polling_interval):
    """Toggle the mux standby -> active -> standby and check upstream traffic.

    Upstream traffic is expected to be dropped while the mux is standby and
    forwarded while it is active. CRM facts captured after the first toggle
    are compared with the ones captured at the end; any mismatch fails the
    test.
    """
    itfs, ip = rand_selected_interface
    PKT_NUM = 100

    def switch_mux(target_state):
        set_mux_state(rand_selected_dut, tbinfo, target_state, [itfs], toggle_all_simulator_ports)
        # Wait sometime for mux toggle
        time.sleep(PAUSE_TIME)

    def check_upstream(expect_drop):
        verify_upstream_traffic(host=rand_selected_dut,
                                ptfadapter=ptfadapter,
                                tbinfo=tbinfo,
                                itfs=itfs,
                                server_ip=ip['server_ipv4'].split('/')[0],
                                pkt_num=PKT_NUM,
                                drop=expect_drop)

    # Step 1. Set mux state to standby and verify traffic is dropped
    switch_mux('standby')
    crm_facts0 = rand_selected_dut.get_crm_facts()
    check_upstream(True)

    time.sleep(5)

    # Step 2. Toggle mux state to active and verify traffic is forwarded
    switch_mux('active')
    check_upstream(False)

    # Step 3. Toggle mux state back to standby and verify traffic is dropped again
    switch_mux('standby')
    check_upstream(True)

    # CRM facts must match the snapshot taken in step 1
    crm_facts1 = rand_selected_dut.get_crm_facts()
    unmatched_crm_facts = compare_crm_facts(crm_facts0, crm_facts1)
    pt_assert(len(unmatched_crm_facts) == 0,
              'Unmatched CRM facts: {}'.format(json.dumps(unmatched_crm_facts, indent=4)))
def ptf_test_port_map(ptfhost, tbinfo, duthosts, mux_server_url):
    """Build the PTF port -> target DUT/MAC map and copy it to the PTF host.

    On dualtor topologies the active side of every mux port is queried from
    the mux server. PTF ports listed in ``ptf_map_disabled`` are skipped.
    A PTF port mapped to two DUTs targets the active side and the Vlan1000
    MAC from the topology properties; a port mapped to a single DUT targets
    that DUT's router MAC.

    Args:
        ptfhost: PTF host object the JSON map is copied to.
        tbinfo (dict): Testbed info; ``tbinfo['topo']`` is read.
        duthosts: Iterable of DUT host objects exposing ``facts['router_mac']``.
        mux_server_url (str): URL queried for mux status on dualtor topologies.

    Returns:
        The destination path ``PTF_TEST_PORT_MAP`` the map was written to.
    """
    active_dut_map = {}
    if 'dualtor' in tbinfo['topo']['name']:
        res = requests.get(mux_server_url)
        pt_assert(res.status_code == 200, 'Failed to get mux status: {}'.format(res.text))
        for mux_status in res.json().values():
            active_dut_index = 0 if mux_status['active_side'] == 'upper_tor' else 1
            active_dut_map[str(mux_status['port_index'])] = active_dut_index

    disabled_ptf_ports = set()
    for ptf_map in tbinfo['topo']['ptf_map_disabled'].values():
        # Loop ptf_map of each DUT. Each ptf_map maps from ptf port index to dut port index
        disabled_ptf_ports.update(ptf_map.keys())

    router_macs = [duthost.facts['router_mac'] for duthost in duthosts]

    logger.info('active_dut_map={}'.format(active_dut_map))
    logger.info('disabled_ptf_ports={}'.format(disabled_ptf_ports))
    logger.info('router_macs={}'.format(router_macs))

    ports_map = {}
    for ptf_port, dut_intf_map in tbinfo['topo']['ptf_dut_intf_map'].items():
        if str(ptf_port) in disabled_ptf_ports:
            # Skip PTF ports that are connected to disabled VLAN interfaces
            continue

        if len(dut_intf_map) == 2:
            # PTF port is mapped to two DUTs -> dualtor topology and the PTF port is a vlan port
            # Packet sent from this ptf port will only be accepted by the active side DUT
            # DualToR DUTs use same special Vlan interface MAC address
            # active_dut_map is keyed by str(port_index), so normalise the key.
            target_dut_index = int(active_dut_map[str(ptf_port)])
            ports_map[ptf_port] = {
                'target_dut': target_dut_index,
                'target_mac': tbinfo['topo']['properties']['topology']['DUT']['vlan_configs']
                                    ['one_vlan_a']['Vlan1000']['mac']
            }
        else:
            # PTF port is mapped to single DUT.
            # next(iter(...)) works on Python 2 and 3; keys()[0] is Python 2 only.
            target_dut_index = int(next(iter(dut_intf_map)))
            ports_map[ptf_port] = {
                'target_dut': target_dut_index,
                'target_mac': router_macs[target_dut_index]
            }

    logger.debug('ptf_test_port_map={}'.format(json.dumps(ports_map, indent=2)))

    ptfhost.copy(content=json.dumps(ports_map), dest=PTF_TEST_PORT_MAP)
    return PTF_TEST_PORT_MAP
def check_results(results):
    """Fail when any per-node result of a parallel run is flagged as failed.

    Args:
        results (Proxy to shared dict): Results of parallel run, indexed by
            node name. Each value is an iterable of dicts with a ``failed``
            key.
    """
    failures_by_node = {}
    for node_name, node_results in results.items():
        node_failures = [entry for entry in node_results if entry['failed']]
        if node_failures:
            failures_by_node[node_name] = node_failures

    if not failures_by_node:
        return

    logger.error('failed_results => {}'.format(json.dumps(failures_by_node, indent=2)))
    pt_assert(False, 'Some processes for updating nbr hosts configuration returned failed results')
def teardown_dut_base(self, input_list=None):
    """Remove the DUT side configuration: BGP neighbors, IPs and the VXLAN.

    Args:
        input_list (list, optional): Items providing ``ip_ptf``, ``ip_dut``,
            ``as_number_ptf`` and ``if_index``. Defaults to
            ``self.vtep_param_list``.
    """
    if input_list is None:
        input_list = self.vtep_param_list

    for item in input_list:
        self.frr_helper.unset_neighbor(neighbor_ip=str(item.ip_ptf.ip), as_number=item.as_number_ptf)
        self.dut_helper.unset_ip(iface=item.if_index, ip_mask=str(item.ip_dut))

    self.frr_helper.unset_advertise_all_vni()
    self.dut_helper.del_vxlan(vlanid="1000", vni="10000")
    # Poll check_interface_status(self.vtep_if, False) for up to 10s, every 2s.
    pt_assert(
        wait_until(10, 2, self.check_interface_status, self.vtep_if, False),
        "Interface {} status check failed after VXLAN removal".format(self.vtep_if))
def _get_tor_fanouthosts(tor_host, fanouthosts):
    """Helper function to get the fanout host objects that the current tor_host connected to.

    Args:
        tor_host (object): Host object for the ToR DUT.
        fanouthosts (dict): Key is fanout hostname, value is fanout host object.

    Returns:
        dict: Key is fanout hostname, value is fanout host object.

    Raises:
        Whatever ``pt_assert`` raises when no fanout host lists
        ``tor_host.hostname`` in its ``dut_hostnames``.
    """
    hosts = {
        fanout_hostname: fanout_host
        for fanout_hostname, fanout_host in fanouthosts.items()
        if tor_host.hostname in fanout_host.dut_hostnames
    }
    if not hosts:
        # The condition must be an explicit False: passing only the message
        # makes the (non-empty) string the condition, which never fails.
        pt_assert(False, 'Failed to get fanout for tor_host "{}"'.format(tor_host.hostname))
    return hosts
def force_terminate(workers):
    """SIGKILL every worker that is still alive and flag it as failed.

    Each still-running worker gets a failed entry in ``results`` before the
    kill is attempted. When the kill raises ``OSError`` the error is logged
    and the run is failed through ``pt_assert``.

    NOTE(review): ``results`` and ``target`` are not parameters; they are
    presumably provided by an enclosing scope -- confirm.

    Args:
        workers: Iterable of process objects exposing ``is_alive()``,
            ``pid`` and ``name``.
    """
    # Some processes cannot be terminated. Try to kill them and raise flag.
    still_alive = list(filter(lambda candidate: candidate.is_alive(), workers))
    if not still_alive:
        return

    logger.info(  # lgtm [py/clear-text-logging-sensitive-data]
        'Found processes still running: {}. Try to kill them.'.format(str(still_alive)))
    for proc in still_alive:
        results[proc.name] = [{'failed': True}]
        try:
            os.kill(proc.pid, signal.SIGKILL)
        except OSError as err:
            logger.error("Unable to kill {}:{}, error:{}".format(proc.pid, proc.name, err))
            failure_text = """Processes running target "{}" could not be terminated. Unable to kill {}:{}, error:{}""".format(
                target.__name__, proc.pid, proc.name, err)
            pt_assert(False, failure_text)
def setup_dut_base(self, input_list=None):
    """Configure the DUT side: VXLAN, interface IPs and BGP neighbors.

    Adds the VXLAN, waits for ``check_interface_status(self.vtep_if, True)``
    to succeed, then sets an IP and a BGP neighbor per item and finally
    enables advertise-all-vni.

    Args:
        input_list (list, optional): Items providing ``if_index``, ``ip_dut``,
            ``ip_ptf`` and ``as_number_ptf``. Defaults to
            ``self.vtep_param_list``.

    Raises:
        Whatever ``pytest.fail`` raises, after ``teardown_dut_base`` has run,
        when an ``Exception`` occurs during setup.
    """
    if input_list is None:
        input_list = self.vtep_param_list

    try:
        self.dut_helper.add_vxlan(vtep_ip=DUT_VTEP_IP, vlanid="1000", vni="10000")
        pt_assert(
            wait_until(10, 2, self.check_interface_status, self.vtep_if, True))

        for item in input_list:
            self.dut_helper.set_ip(iface=item.if_index, ip_mask=str(item.ip_dut))
            self.frr_helper.set_neighbor(neighbor_ip=str(item.ip_ptf.ip), as_number=item.as_number_ptf)

        self.frr_helper.set_advertise_all_vni()
    except Exception as e:
        # NOTE(review): if pt_assert reports failures through pytest.fail, the
        # raised outcome may not derive from Exception and would bypass this
        # cleanup -- confirm against pt_assert's implementation.
        self.teardown_dut_base()
        # pytest.fail expects a string reason, not an exception instance.
        pytest.fail(str(e))
def verify_decap_receive_packet(self, send_port, access_port_list, pkt_send, pkt_expected):
    """Send packets with an increasing UDP source port and count the receivers.

    For each of NUM_CONTINUOUS_PKT_COUNT rounds the UDP source port of
    ``pkt_send`` is bumped by one, the packet is sent from ``send_port`` and
    ``pkt_expected`` must be seen on one of ``access_port_list``.

    The check passes only when every access port received at least one
    packet and the total number of received packets equals the number sent.
    """
    adapter = self.outer.ptfadapter
    hits_per_port = dict.fromkeys(access_port_list, 0)

    for seq in range(NUM_CONTINUOUS_PKT_COUNT):
        udp_layer = pkt_send['UDP']
        udp_layer.sport = udp_layer.sport + 1
        adapter.dataplane.flush()
        testutils.send(adapter, send_port, pkt_send)
        logging.debug("send packet #{}".format(seq))

        matched = testutils.verify_packet_any_port(adapter, pkt_expected, access_port_list)
        rx_port = access_port_list[matched[0]]
        hits_per_port[rx_port] += 1
        logging.debug("Received in port index: {}".format(rx_port))

    # check whether each port receives at least one packet
    for hit_count in hits_per_port.values():
        pt_assert(hit_count > 0)

    # check that every sent packet was received
    pt_assert(sum(hits_per_port.values()) == NUM_CONTINUOUS_PKT_COUNT)
def check_nexthops_balance(rand_selected_dut, ptfadapter, dst_server_ipv4, tbinfo, downlink_ints, nexthops_count):
    """Send hashed packets towards a server and verify ECMP balance.

    10000 packets are sent from a randomly chosen T1-facing PTF port. Every
    downlink port is expected to receive ``10000 // nexthops_count`` packets
    within a +/-25% tolerance. When there are fewer downlink interfaces than
    nexthops, the remaining share is expected to be spread evenly (same
    tolerance) over all uplink ports.

    Args:
        rand_selected_dut: DUT host object.
        ptfadapter: PTF adapter used to send and count packets.
        dst_server_ipv4: Destination server address of the generated packets.
        tbinfo: Testbed info passed through to the port lookup helpers.
        downlink_ints: Downlink interfaces expected to receive traffic.
        nexthops_count (int): Total number of nexthops traffic is hashed over.
    """
    HASH_KEYS = ["src-port", "dst-port", "src-ip"]
    PACKET_COUNT = 10000
    TOLERANCE = 0.25

    # expect this packet to be sent to downlinks (active mux) and uplink (standby mux)
    expected_downlink_ports = [
        get_ptf_server_intf_index(rand_selected_dut, tbinfo, iface)
        for iface in downlink_ints
    ]
    expected_uplink_ports = list()
    for members in get_t1_ptf_pc_ports(rand_selected_dut, tbinfo).values():
        for member in members:
            expected_uplink_ports.append(int(member.strip("eth")))
    logging.info("Expecting packets in downlink ports {}".format(expected_downlink_ports))
    logging.info("Expecting packets in uplink ports {}".format(expected_uplink_ports))

    ptf_t1_intf = random.choice(get_t1_ptf_ports(rand_selected_dut, tbinfo))
    send_port = int(ptf_t1_intf.strip("eth"))
    # Loop-invariant: every port a packet may legitimately show up on.
    all_allowed_ports = expected_downlink_ports + expected_uplink_ports
    port_packet_count = dict()
    for _ in range(PACKET_COUNT):
        send_packet, exp_pkt, exp_tunnel_pkt = generate_hashed_packet_to_server(
            ptfadapter, rand_selected_dut, HASH_KEYS, dst_server_ipv4)
        testutils.send(ptfadapter, send_port, send_packet, count=1)
        # expect ECMP hashing to work and distribute downlink traffic evenly to every nexthop
        ptf_port_count = count_matched_packets_all_ports(
            ptfadapter,
            exp_packet=exp_pkt,
            exp_tunnel_pkt=exp_tunnel_pkt,
            ports=all_allowed_ports,
            timeout=0.1,
            count=1)
        for ptf_idx, pkt_count in ptf_port_count.items():
            port_packet_count[ptf_idx] = port_packet_count.get(ptf_idx, 0) + pkt_count

    logging.info("Received packets in ports: {}".format(str(port_packet_count)))

    # ECMP validation
    expect_packet_num = PACKET_COUNT // nexthops_count
    pkt_num_lo = expect_packet_num * (1.0 - TOLERANCE)
    pkt_num_hi = expect_packet_num * (1.0 + TOLERANCE)
    for downlink_int in expected_downlink_ports:
        count = port_packet_count.get(downlink_int, 0)
        logging.info("Packets received on downlink port {}: {}".format(downlink_int, count))
        # Assert on the range check itself; no flag that is only ever set to False.
        pt_assert(
            pkt_num_lo <= count <= pkt_num_hi,
            "Packets not evenly distributed on downlink port {}".format(downlink_int))

    if len(downlink_ints) < nexthops_count:
        # Some nexthop is now connected to standby mux, and the packets will be sent towards portchannel ints
        # Hierarchical ECMP validation (in case of standby MUXs):
        pt_assert(len(expected_uplink_ports) > 0,
                  "No uplink ports found for hierarchical ECMP validation")
        # Step 1: Calculate total uplink share.
        total_uplink_share = expect_packet_num * (nexthops_count - len(expected_downlink_ports))
        # Step 2: Divide uplink share among all uplinks
        expect_packet_num = total_uplink_share // len(expected_uplink_ports)
        pkt_num_lo = expect_packet_num * (1.0 - TOLERANCE)
        pkt_num_hi = expect_packet_num * (1.0 + TOLERANCE)
        # Step 3: Check if uplink distribution (hierarchical ECMP) is balanced
        for uplink_int in expected_uplink_ports:
            count = port_packet_count.get(uplink_int, 0)
            logging.info("Packets received on uplink port {}: {}".format(uplink_int, count))
            pt_assert(
                pkt_num_lo <= count <= pkt_num_hi,
                "Hierarchical ECMP failed: packets not evenly distributed on uplink port {}".format(uplink_int))
def test_vrf_vni_map_configuration(self, duthost, vrf_vni_map_set):
    """Check that the Vrf1 <-> VNI 10000 mapping shows up in the redis tables.

    Each command's output lines must contain both listed values.
    """
    # (command run on the DUT, values that must appear among its output lines)
    lookups = (
        ("redis-cli -n 4 -c hgetall 'VRF|Vrf1'", ('vni', '10000')),
        ("redis-cli -n 0 -c hgetall 'VRF_TABLE:Vrf1'", ('vni', '10000')),
        ("redis-cli -n 0 -c hgetall 'VXLAN_VRF_TABLE:vtep:evpn_map_10000_Vrf1'", ('10000', 'Vrf1')),
    )
    for command, wanted_values in lookups:
        output_lines = duthost.shell(command)['stdout_lines']
        for wanted in wanted_values:
            pt_assert(wanted in output_lines)
def test_vlan_vni_map_configuration(self, duthost, setup_dut):
    """Check the vtep, evpnnvo and VLAN/VNI map entries in the redis tables.

    For every command the output lines are compared position by position
    with the expected values.
    """
    def expect_fields(command, expected_by_index, log_output=False):
        # Run the command and compare selected output lines with expectations.
        lines = duthost.shell(command)['stdout_lines']
        if log_output:
            logging.info(lines)
        for position, wanted in expected_by_index:
            pt_assert(lines[position] == wanted)

    # vtep
    vtep_fields = ((0, 'src_ip'), (1, DUT_VTEP_IP))
    expect_fields("redis-cli -n 4 -c hgetall 'VXLAN_TUNNEL|vtep'", vtep_fields)
    expect_fields("redis-cli -n 0 -c hgetall 'VXLAN_TUNNEL_TABLE:vtep'", vtep_fields)

    # evpnnvo
    nvo_fields = ((0, 'source_vtep'), (1, 'vtep'))
    expect_fields("redis-cli -n 4 -c hgetall 'VXLAN_EVPN_NVO|evpnnvo1'", nvo_fields)
    expect_fields("redis-cli -n 0 -c hgetall 'VXLAN_EVPN_NVO_TABLE:evpnnvo1'", nvo_fields)

    # map
    map_fields = ((1, '10000'), (3, 'Vlan1000'))
    expect_fields("redis-cli -n 4 -c hgetall 'VXLAN_TUNNEL_MAP|vtep|map_10000_Vlan1000'", map_fields)
    expect_fields("redis-cli -n 0 -c hgetall 'VXLAN_TUNNEL_MAP_TABLE:vtep:map_10000_Vlan1000'",
                  map_fields, log_output=True)
def parallel_run(target, args, kwargs, nodes, timeout=None):
    """Run target function on nodes in parallel

    Args:
        target (function): The target function to be executed in parallel.
        args (list of tuple): List of arguments for the target function.
        kwargs (dict): Keyword arguments for the target function. It will be extended with two keys: 'node' and
            'results'. The 'node' key will hold an item of the nodes list. The 'results' key will hold an instance
            of multiprocessing.Manager().dict(). It is a proxy of the shared dict that will be used by each process
            for returning execution results.
        nodes (list of nodes): List of nodes to be used by the target function
        timeout (int or float, optional): Time allowed for the spawned processes to run. Defaults to None.
            When timeout is specified, each process is joined with this timeout and waiting stops as soon as the
            total elapsed time exceeds it. Processes still alive afterwards are terminated or even killed.

    Raises:
        Whatever ``pt_assert`` raises, when a spawned process cannot be terminated or when any process finished
        with a non-zero (or missing) exit code.

    Returns:
        dict: An instance of multiprocessing.Manager().dict(). It is a proxy to the shared dict that is used by all
            the spawned processes.
    """
    workers = []
    results = Manager().dict()
    start_time = datetime.datetime.now()
    for node in nodes:
        kwargs['node'] = node
        kwargs['results'] = results
        process_name = "{}--{}".format(target.__name__, node)
        worker = SonicProcess(name=process_name, target=target, args=args, kwargs=kwargs)
        worker.start()
        logger.debug('Started process {} running target "{}"'.format(worker.pid, process_name))
        workers.append(worker)

    # NOTE(review): every join gets the full timeout, so the total wait can
    # exceed `timeout` before the elapsed-time check below breaks the loop.
    for worker in workers:
        logger.debug('Wait for process "{}" with pid "{}" to complete, timeout={}'.format(
            worker.name, worker.pid, timeout))
        worker.join(timeout)
        logger.debug('Process "{}" with pid "{}" completed'.format(worker.name, worker.pid))

        # If execution time of processes exceeds timeout, need to force terminate them all.
        if timeout is not None:
            # total_seconds() covers the whole interval; `.seconds` is only
            # the truncated seconds component of the timedelta.
            if (datetime.datetime.now() - start_time).total_seconds() > timeout:
                logger.error('Process execution time exceeds {} seconds.'.format(str(timeout)))
                break

    # check if we have any processes that failed - have exitcode non-zero
    failed_processes = {}
    for worker in workers:
        if worker.exitcode != 0:
            failed_processes[worker.name] = {
                'exit_code': worker.exitcode,
                'exception': worker.exception
            }

    # Force terminate spawned processes
    for worker in workers:
        if worker.is_alive():
            logger.error('Process {} with pid {} is still alive, try to force terminate it.'.format(
                worker.name, worker.pid))
            worker.terminate()

    end_time = datetime.datetime.now()
    delta_time = end_time - start_time

    # Some processes cannot be terminated. Try to kill them and raise flag.
    running_processes = [worker for worker in workers if worker.is_alive()]
    if len(running_processes) > 0:
        logger.error('Found processes still running: {}. Try to kill them.'.format(str(running_processes)))
        for p in running_processes:
            try:
                os.kill(p.pid, signal.SIGKILL)
            except OSError:
                # Best effort: the process may already be gone.
                pass

        pt_assert(False,
                  'Processes running target "{}" could not be terminated. Tried killing them. But please check'.format(
                      target.__name__))

    # if we have failed processes, we should log the exception and exit code of each Process and fail
    if failed_processes:
        for process_name, process in failed_processes.items():
            # A worker may have recorded no exception (e.g. it was terminated);
            # presumably `exception` is otherwise an (exception, traceback) pair.
            exception_info = process['exception'] or (None, None)
            p_exception = exception_info[0]
            p_traceback = exception_info[1]
            p_exitcode = process['exit_code']
            logger.error('Process {} had exit code {} and exception {} and traceback {}'.format(
                process_name, p_exitcode, p_exception, p_traceback))
        pt_assert(False,
                  'Processes "{}" had failures. Please check the logs'.format(list(failed_processes.keys())))

    logger.info('Completed running processes for target "{}" in {} seconds'.format(
        target.__name__, str(delta_time)))

    return results
def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
    """Run pre-test sanity checks, yield to the test, then optional post-test checks.

    Settings come from the ``sanity_check`` marker on the test node and are
    overridden by the ``skip_sanity``, ``allow_recover`` and ``--check_items``
    command line options. When pre-test checks fail and recovery is allowed,
    ``recover`` is called for each failing DUT and the checks are run again.

    This is a generator that yields exactly once; it is presumably registered
    as a pytest fixture (the decorator is not visible here).

    Args:
        localhost: Passed through to ``recover``.
        duthosts: DUT hosts; indexed by DUT name for recovery.
        request: Pytest request object (markers and options are read from it).
        fanouthosts: Passed through to ``recover``.
        tbinfo (dict): Testbed info; ``tbinfo['topo']['type']`` is read.
    """
    logger.info("Start pre-test sanity check")

    skip_sanity = False
    allow_recover = False
    recover_method = "adaptive"
    check_items = set(copy.deepcopy(constants.DEFAULT_CHECK_ITEMS))  # Default check items
    post_check = False

    customized_sanity_check = None
    for m in request.node.iter_markers():
        logger.info("Found marker: m.name=%s, m.args=%s, m.kwargs=%s" % (m.name, m.args, m.kwargs))
        if m.name == "sanity_check":
            customized_sanity_check = m
            break

    if customized_sanity_check:
        logger.info("Process marker %s in script. m.args=%s, m.kwargs=%s" % (
            customized_sanity_check.name,
            str(customized_sanity_check.args),
            str(customized_sanity_check.kwargs)))
        skip_sanity = customized_sanity_check.kwargs.get("skip_sanity", False)
        allow_recover = customized_sanity_check.kwargs.get("allow_recover", False)
        recover_method = customized_sanity_check.kwargs.get("recover_method", "adaptive")
        if allow_recover and recover_method not in constants.RECOVER_METHODS:
            # pytest has no `warning` function; report through the logger.
            logger.warning("Unsupported recover method")
            logger.info("Fall back to use default recover method 'config_reload'")
            recover_method = "config_reload"

        check_items = _update_check_items(
            check_items,
            customized_sanity_check.kwargs.get("check_items", []),
            constants.SUPPORTED_CHECK_ITEMS)
        post_check = customized_sanity_check.kwargs.get("post_check", False)

    if request.config.option.skip_sanity:
        skip_sanity = True
    if request.config.option.allow_recover:
        allow_recover = True
    items = request.config.getoption("--check_items")
    if items:
        items_array = str(items).split(',')
        check_items = _update_check_items(check_items, items_array, constants.SUPPORTED_CHECK_ITEMS)

    # ignore BGP check for particular topology type
    if tbinfo['topo']['type'] == 'ptf' and 'bgp' in check_items:
        check_items.remove('bgp')

    logger.info("Sanity check settings: skip_sanity=%s, check_items=%s, allow_recover=%s, recover_method=%s, post_check=%s" %
                (skip_sanity, check_items, allow_recover, recover_method, post_check))

    if skip_sanity:
        logger.info("Skip sanity check according to command line argument or configuration of test script.")
        yield
        return

    if not check_items:
        logger.info("No sanity check item is specified, no pre-test sanity check")
        yield
        logger.info("No sanity check item is specified, no post-test sanity check")
        return

    print_logs(duthosts, constants.PRINT_LOGS)

    check_results = do_checks(duthosts, check_items)
    logger.info("!!!!!!!!!!!!!!!! Pre-test sanity check results: !!!!!!!!!!!!!!!!\n%s" %
                json.dumps(check_results, indent=4))

    pre_sanity_failed = False
    for a_dutname, a_dut_results in check_results.items():
        if any([result["failed"] for result in a_dut_results]):
            pre_sanity_failed = True
            if not allow_recover:
                failed_items = json.dumps(
                    [result for result in a_dut_results if result["failed"]], indent=4)
                logger.error("On {}, failed pre-sanity check items with allow_recover=False:\n{}".format(
                    a_dutname, failed_items))
            else:
                logger.info("Pre-test sanity check failed on %s, try to recover, recover_method=%s" %
                            (a_dutname, recover_method))
                recover(duthosts[a_dutname], localhost, fanouthosts, a_dut_results, recover_method)

    pt_assert(allow_recover or not pre_sanity_failed,
              "Pre-test sanity check failed on DUTs, allow_recover=False:{}".format(check_results))

    if allow_recover and pre_sanity_failed:
        logger.info("Run sanity check again after recovery")
        new_check_results = do_checks(duthosts, check_items)
        logger.info("!!!!!!!!!!!!!!!! Pre-test sanity check after recovery results: !!!!!!!!!!!!!!!!\n%s" %
                    json.dumps(new_check_results, indent=4))

        pre_sanity_failed_after_recover = False
        for a_dutname, a_dut_new_results in new_check_results.items():
            if any([result["failed"] for result in a_dut_new_results]):
                pre_sanity_failed_after_recover = True
                failed_items = json.dumps(
                    [result for result in a_dut_new_results if result["failed"]], indent=4)
                logger.error("On {}, failed check items after recover:\n{}".format(a_dutname, failed_items))

        pt_assert(not pre_sanity_failed_after_recover,
                  "Pre-test sanity check failed on DUTs after recover:\n{}".format(new_check_results))

    logger.info("Done pre-test sanity check")

    yield

    logger.info("Start post-test sanity check")

    if not post_check:
        logger.info("No post-test check is required. Done post-test sanity check")
        return

    post_check_results = do_checks(duthosts, check_items)
    logger.info("!!!!!!!!!!!!!!!! Post-test sanity check results: !!!!!!!!!!!!!!!!\n%s" %
                json.dumps(post_check_results, indent=4))

    post_sanity_failed = False
    for a_dutname, a_dut_post_results in post_check_results.items():
        if any([result["failed"] for result in a_dut_post_results]):
            post_sanity_failed = True
            # Report the post-test results of this DUT, not the results of the
            # (possibly never executed) post-recovery pre-test run.
            failed_items = json.dumps(
                [result for result in a_dut_post_results if result["failed"]], indent=4)
            logger.error("On {}, failed post-test check items:\n{}".format(a_dutname, failed_items))

    pt_assert(not post_sanity_failed,
              "Post-test sanity check failed on DUTs:\n{}".format(post_check_results))

    logger.info("Done post-test sanity check")
    return
def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
    """Run pre-test sanity checks, yield to the test, then optional post-test checks.

    Settings come from the ``sanity_check`` marker on the test node and are
    overridden by the ``skip_sanity``, ``allow_recover`` and ``--check_items``
    command line options. The fixture of every selected check item is
    appended to ``request.fixturenames``. When pre-test checks fail and
    recovery is allowed, ``recover`` is called per failing DUT and the checks
    are run again.

    This is a generator that yields exactly once; it is presumably registered
    as a pytest fixture (the decorator is not visible here).

    Args:
        localhost: Passed through to ``recover``.
        duthosts: DUT hosts; indexed by DUT name for recovery.
        request: Pytest request object (markers, options, fixture names).
        fanouthosts: Passed through to ``recover``.
        tbinfo (dict): Testbed info; ``tbinfo['topo']`` type and name are read.
    """
    logger.info("Prepare pre-test sanity check")

    skip_sanity = False
    allow_recover = False
    recover_method = "adaptive"
    check_items = set(copy.deepcopy(SUPPORTED_CHECKS))  # Default check items
    post_check = False

    customized_sanity_check = None
    for m in request.node.iter_markers():
        logger.info("Found marker: m.name=%s, m.args=%s, m.kwargs=%s" % (m.name, m.args, m.kwargs))
        if m.name == "sanity_check":
            customized_sanity_check = m
            break

    if customized_sanity_check:
        logger.info("Process marker {} in script. m.args={}, m.kwargs={}".format(
            customized_sanity_check.name, customized_sanity_check.args, customized_sanity_check.kwargs))
        skip_sanity = customized_sanity_check.kwargs.get("skip_sanity", False)
        allow_recover = customized_sanity_check.kwargs.get("allow_recover", False)
        recover_method = customized_sanity_check.kwargs.get("recover_method", "adaptive")
        if allow_recover and recover_method not in constants.RECOVER_METHODS:
            # pytest has no `warning` function; report through the logger.
            logger.warning("Unsupported recover method")
            logger.info("Fall back to use default recover method 'config_reload'")
            recover_method = "config_reload"

        check_items = _update_check_items(
            check_items,
            customized_sanity_check.kwargs.get("check_items", []),
            SUPPORTED_CHECKS)
        post_check = customized_sanity_check.kwargs.get("post_check", False)

    if request.config.option.skip_sanity:
        skip_sanity = True
    if skip_sanity:
        logger.info("Skip sanity check according to command line argument or configuration of test script.")
        yield
        return

    if request.config.option.allow_recover:
        allow_recover = True

    cli_items = request.config.getoption("--check_items")
    if cli_items:
        cli_items_list = str(cli_items).split(',')
        check_items = _update_check_items(check_items, cli_items_list, SUPPORTED_CHECKS)

    # ignore BGP check for particular topology type
    if tbinfo['topo']['type'] == 'ptf' and 'bgp' in check_items:
        check_items.remove('bgp')

    if 'dualtor' not in tbinfo['topo']['name']:
        # discard(): the item may already have been removed through the marker
        # or the command line, in which case remove() would raise KeyError.
        check_items.discard('mux_simulator')

    logger.info("Sanity check settings: skip_sanity=%s, check_items=%s, allow_recover=%s, recover_method=%s, post_check=%s" %
                (skip_sanity, check_items, allow_recover, recover_method, post_check))

    if not check_items:
        logger.info("No sanity check item is specified, no pre-test sanity check")
        yield
        logger.info("No sanity check item is specified, no post-test sanity check")
        return

    # Dynamically attach selected check fixtures to node
    for item in check_items:
        request.fixturenames.append(_item2fixture(item))

    print_logs(duthosts)

    logger.info("Start pre-test sanity checks")
    check_results = do_checks(request, check_items)
    logger.debug("Pre-test sanity check results:\n%s" % json.dumps(check_results, indent=4))

    failed_results = [result for result in check_results if result['failed']]
    if failed_results:
        if not allow_recover:
            pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
                      .format(json.dumps(failed_results, indent=4)))
        else:
            # Group failures by DUT; results without a 'host' key are not recovered.
            dut_failed_results = defaultdict(list)
            for failed_result in failed_results:
                if 'host' in failed_result:
                    dut_failed_results[failed_result['host']].append(failed_result)
            for dut_name, dut_results in dut_failed_results.items():
                recover(duthosts[dut_name], localhost, fanouthosts, dut_results, recover_method)

            logger.info("Run sanity check again after recovery")
            new_check_results = do_checks(request, check_items)
            logger.debug("Pre-test sanity check after recovery results:\n%s" %
                         json.dumps(new_check_results, indent=4))

            new_failed_results = [result for result in new_check_results if result['failed']]
            if new_failed_results:
                pt_assert(False, "!!!!!!!!!!!!!!!! Pre-test sanity check after recovery failed: !!!!!!!!!!!!!!!!\n{}"
                          .format(json.dumps(new_failed_results, indent=4)))

    logger.info("Done pre-test sanity check")

    yield

    if not post_check:
        logger.info("No post-test check is required. Done post-test sanity check")
        return

    logger.info("Start post-test sanity check")
    post_check_results = do_checks(request, check_items)
    logger.debug("Post-test sanity check results:\n%s" % json.dumps(post_check_results, indent=4))

    post_failed_results = [result for result in post_check_results if result['failed']]
    if post_failed_results:
        pt_assert(False, "!!!!!!!!!!!!!!!! Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
                  .format(json.dumps(post_failed_results, indent=4)))

    logger.info("Done post-test sanity check")
    return
def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
    """Run sanity checks before, and optionally after, the wrapped tests.

    This is a generator that yields exactly once; everything before the
    ``yield`` is the pre-test stage and everything after it is the post-test
    stage (presumably it is registered as a pytest fixture; the decorator is
    not visible from this block).

    Settings are resolved in three layers, later layers overriding earlier
    ones:

    1. Built-in defaults: all ``SUPPORTED_CHECKS`` enabled, no recovery,
       recover method ``"adaptive"``, no post-test check.
    2. kwargs of a ``sanity_check`` marker on ``request.node``:
       ``skip_sanity``, ``allow_recover``, ``recover_method``,
       ``check_items``, ``post_check`` and ``post_check_items``.
    3. Command line options: ``skip_sanity``, ``allow_recover``,
       ``recover_method``, ``post_check``, ``--check_items`` and
       ``--post_check_items``.

    Args:
        localhost: Passed through to ``recover``.
        duthosts: Indexed by DUT name to select the host handed to
            ``recover``; also passed to ``print_logs``.
        request: The pytest request object; used for markers, command line
            options and for instantiating the selected check fixtures.
        fanouthosts: Passed through to ``recover``.
        tbinfo: Testbed information, passed to ``filter_check_items``.

    Yields:
        None. Control is handed to the tests once the pre-test stage is done
        (or skipped).
    """
    logger.info("Prepare sanity check")

    # Layer 1: defaults.
    skip_sanity = False
    allow_recover = False
    recover_method = "adaptive"
    pre_check_items = set(copy.deepcopy(SUPPORTED_CHECKS))  # Default check items
    post_check = False

    # Layer 2: settings from the first "sanity_check" marker, if any.
    customized_sanity_check = None
    for m in request.node.iter_markers():
        logger.info("Found marker: m.name=%s, m.args=%s, m.kwargs=%s", m.name, m.args, m.kwargs)
        if m.name == "sanity_check":
            customized_sanity_check = m
            break

    if customized_sanity_check:
        marker_kwargs = customized_sanity_check.kwargs
        logger.info("Process marker {} in script. m.args={}, m.kwargs={}"
                    .format(customized_sanity_check.name, customized_sanity_check.args, marker_kwargs))
        skip_sanity = marker_kwargs.get("skip_sanity", False)
        allow_recover = marker_kwargs.get("allow_recover", False)
        recover_method = marker_kwargs.get("recover_method", "adaptive")
        if allow_recover and recover_method not in constants.RECOVER_METHODS:
            # pytest has no 'warning' function; report through the logger so
            # the fallback below is actually reached.
            logger.warning("Unsupported recover method")
            logger.info("Fall back to use default recover method 'config_reload'")
            recover_method = "config_reload"

        pre_check_items = _update_check_items(
            pre_check_items,
            marker_kwargs.get("check_items", []),
            SUPPORTED_CHECKS)

        post_check = marker_kwargs.get("post_check", False)

    # Layer 3: command line options.
    if request.config.option.skip_sanity:
        skip_sanity = True
    if skip_sanity:
        logger.info("Skip sanity check according to command line argument or configuration of test script.")
        yield
        return

    if request.config.option.allow_recover:
        allow_recover = True

    if request.config.option.recover_method:
        recover_method = request.config.getoption("--recover_method")

    if request.config.option.post_check:
        post_check = True

    cli_check_items = request.config.getoption("--check_items")
    cli_post_check_items = request.config.getoption("--post_check_items")

    if cli_check_items:
        logger.info('Fine tune pre-test check items based on CLI option --check_items')
        cli_items_list = str(cli_check_items).split(',')
        pre_check_items = _update_check_items(pre_check_items, cli_items_list, SUPPORTED_CHECKS)

    pre_check_items = filter_check_items(tbinfo, pre_check_items)  # Filter out un-supported checks.

    if post_check:
        # Prepare post test check items based on the collected pre test check items.
        post_check_items = copy.copy(pre_check_items)
        if customized_sanity_check:
            post_check_items = _update_check_items(
                post_check_items,
                customized_sanity_check.kwargs.get("post_check_items", []),
                SUPPORTED_CHECKS)

        if cli_post_check_items:
            logger.info('Fine tune post-test check items based on CLI option --post_check_items')
            cli_post_items_list = str(cli_post_check_items).split(',')
            post_check_items = _update_check_items(post_check_items, cli_post_items_list, SUPPORTED_CHECKS)

        post_check_items = filter_check_items(tbinfo, post_check_items)  # Filter out un-supported checks.
    else:
        post_check_items = set()

    logger.info(
        "Sanity check settings: skip_sanity=%s, pre_check_items=%s, allow_recover=%s, "
        "recover_method=%s, post_check=%s, post_check_items=%s",
        skip_sanity, pre_check_items, allow_recover, recover_method, post_check, post_check_items)

    # Dynamically attach every selected check fixture (pre and post) to the node.
    for item in pre_check_items.union(post_check_items):
        request.fixturenames.append(_item2fixture(item))

        # Workaround for pytest requirement.
        # Each possibly used check fixture must be executed in setup phase. Otherwise there could be teardown error.
        request.getfixturevalue(_item2fixture(item))

    if pre_check_items:
        logger.info("Start pre-test sanity checks")

        print_logs(duthosts)

        check_results = do_checks(request, pre_check_items, stage=STAGE_PRE_TEST)
        logger.debug("Pre-test sanity check results:\n%s", json.dumps(check_results, indent=4))

        failed_results = [result for result in check_results if result['failed']]
        if failed_results:
            if not allow_recover:
                pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
                          .format(json.dumps(failed_results, indent=4)))
            else:
                # Group failures per DUT; results without a 'host' key are
                # not handed to recover().
                dut_failed_results = defaultdict(list)
                for failed_result in failed_results:
                    if 'host' in failed_result:
                        dut_failed_results[failed_result['host']].append(failed_result)
                for dut_name, dut_results in dut_failed_results.items():
                    recover(duthosts[dut_name], localhost, fanouthosts, dut_results, recover_method)

                logger.info("Run sanity check again after recovery")
                new_check_results = do_checks(request, pre_check_items, stage=STAGE_PRE_TEST, after_recovery=True)
                logger.debug("Pre-test sanity check after recovery results:\n%s",
                             json.dumps(new_check_results, indent=4))

                new_failed_results = [result for result in new_check_results if result['failed']]
                if new_failed_results:
                    pt_assert(False,
                              "!!!!!!!!!!!!!!!! Pre-test sanity check after recovery failed: !!!!!!!!!!!!!!!!\n{}"
                              .format(json.dumps(new_failed_results, indent=4)))

        logger.info("Done pre-test sanity check")
    else:
        logger.info('No pre-test sanity check item, skip pre-test sanity check.')

    yield

    if not post_check:
        logger.info("No post-test check is required. Done post-test sanity check")
        return

    if post_check_items:
        logger.info("Start post-test sanity check")
        post_check_results = do_checks(request, post_check_items, stage=STAGE_POST_TEST)
        logger.debug("Post-test sanity check results:\n%s", json.dumps(post_check_results, indent=4))

        post_failed_results = [result for result in post_check_results if result['failed']]
        if post_failed_results:
            pt_assert(False, "!!!!!!!!!!!!!!!! Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
                      .format(json.dumps(post_failed_results, indent=4)))

        logger.info("Done post-test sanity check")
    else:
        logger.info('No post-test sanity check item, skip post-test sanity check.')
def parallel_run(target, args, kwargs, nodes_list, timeout=None, concurrent_tasks=24):
    """Run target function on nodes in parallel

    One process is spawned per node, with at most ``concurrent_tasks``
    processes running at the same time.

    Args:
        target (function): The target function to be executed in parallel.
        args (list of tuple): List of arguments for the target function.
        kwargs (dict): Keyword arguments for the target function. It will be
            extended (in place) with two keys: 'node' and 'results'.
            The 'node' key will hold an item of the nodes list.
            The 'results' key will hold an instance of
            multiprocessing.Manager().dict(). It is a proxy of the shared dict
            that will be used by each process for returning execution results.
        nodes_list (list of nodes): List of nodes to be used by the target
            function.
        timeout (int or float, optional): Time allowed for each wait on the
            running processes. Defaults to None (wait without limit). When
            specified, the overall budget is ``timeout`` multiplied by the
            number of batches (``ceil(len(nodes_list) / concurrent_tasks)``).
            When time is up, this function will try to terminate or even kill
            all the processes.
        concurrent_tasks (int, optional): Maximum number of processes running
            at the same time. Defaults to 24.

    Raises:
        Whatever ``pt_assert`` raises on a failed assertion: when a process
        still running cannot be killed, or when any process finished with a
        non-zero exit code.

    Returns:
        dict: An instance of multiprocessing.Manager().dict(). It is a proxy
            to the shared dict that is used by all the spawned processes.
    """
    # Work on a copy so that the caller's list is not consumed.
    nodes = list(nodes_list)

    # Callback API for wait_procs
    def on_terminate(worker):
        logger.info("process {} terminated with exit code {}".format(
            worker.name, worker.returncode))

    def force_terminate(workers):
        # Some processes cannot be terminated. Try to kill them and raise flag.
        running_processes = [worker for worker in workers if worker.is_alive()]
        if not running_processes:
            return
        logger.info(
            'Found processes still running: {}. Try to kill them.'.format(  # lgtm [py/clear-text-logging-sensitive-data]
                str(running_processes)))
        for p in running_processes:
            # A process that had to be killed is always reported as failed.
            results[p.name] = [{'failed': True}]
            try:
                os.kill(p.pid, signal.SIGKILL)
            except OSError as err:
                logger.error("Unable to kill {}:{}, error:{}".format(
                    p.pid, p.name, err))
                pt_assert(
                    False,
                    'Processes running target "{}" could not be terminated. '
                    'Unable to kill {}:{}, error:{}'.format(
                        target.__name__, p.pid, p.name, err))

    workers = []
    results = Manager().dict()
    start_time = datetime.datetime.now()
    tasks_done = 0
    total_tasks = len(nodes)
    tasks_running = 0
    # Overall budget: one 'timeout' per batch of at most 'concurrent_tasks'.
    total_timeout = timeout * math.ceil(
        len(nodes) / float(concurrent_tasks)) if timeout else None
    failed_processes = {}

    while tasks_done < total_tasks:
        # If execution time of processes exceeds timeout, need to force
        # terminate them all.
        if total_timeout is not None:
            # total_seconds() covers the whole elapsed time; '.seconds' would
            # silently drop the days component and the fractional part.
            elapsed = (datetime.datetime.now() - start_time).total_seconds()
            if elapsed > total_timeout:
                logger.error(
                    'Process execution time exceeds {} seconds.'.format(
                        str(total_timeout)))
                break

        # Top up the pool of running processes.
        while len(nodes) and tasks_running < concurrent_tasks:
            node = nodes.pop(0)
            kwargs['node'] = node
            kwargs['results'] = results
            process_name = "{}--{}".format(target.__name__, node)
            worker = SonicProcess(name=process_name, target=target, args=args,
                                  kwargs=kwargs)
            worker.start()
            tasks_running += 1
            logger.debug('Started process {} running target "{}"'.format(
                worker.pid, process_name))
            workers.append(worker)

        gone, alive = wait_procs(workers, timeout=timeout,
                                 callback=on_terminate)
        workers = alive

        logger.debug("task completed {}, running {}".format(
            len(gone), len(alive)))

        if len(gone) == 0:
            # Nothing finished within 'timeout': give up on this batch.
            logger.debug("all processes have timedout")
            tasks_running -= len(workers)
            tasks_done += len(workers)
            force_terminate(workers)
            del workers[:]
        else:
            tasks_running -= len(gone)
            tasks_done += len(gone)

        # check if we have any processes that failed - have exitcode non-zero
        for worker in gone:
            if worker.exitcode != 0:
                failed_processes[worker.name] = {
                    'exit_code': worker.exitcode,
                    'exception': worker.exception,
                }

    # In case of timeout force terminate spawned processes
    for worker in workers:
        if worker.is_alive():
            logger.error('Process {} is alive, force terminate it.'.format(
                worker.name))
            worker.terminate()
            results[worker.name] = [{'failed': True}]

    end_time = datetime.datetime.now()
    delta_time = end_time - start_time

    # force terminate any workers still running
    force_terminate(workers)

    # if we have failed processes, we should log the exception and exit code
    # of each Process and fail
    if failed_processes:
        for process_name, process in failed_processes.items():
            # Always report the exit code; exception details are only
            # available when the process captured one.
            p_exitcode = process['exit_code']
            p_exception = ""
            p_traceback = ""
            if process.get('exception'):
                p_exception = process['exception'][0]
                p_traceback = process['exception'][1]
            logger.error(
                "Process {} had exit code {} and exception {} "
                "and traceback {}".format(
                    process_name, p_exitcode, p_exception, p_traceback))
        pt_assert(
            False,
            'Processes "{}" had failures. Please check the logs'.format(
                list(failed_processes.keys())))

    logger.info(
        'Completed running processes for target "{}" in {} seconds'.format(
            target.__name__, str(delta_time)))

    return results