def update_path(self, prefix=None):
    """Re-resolve this interface's sysfs directory and dependent paths.

    If ``prefix`` is given it replaces the stored ``self._prefix`` first.
    ``self.iface_dir`` is then set to the first match of
    ``interface<num>@*`` below ``<comp.path>/<prefix>/interface``, and the
    LPRT and VCAT directories are refreshed from the new location.

    Raises IndexError if no matching interface directory exists.
    """
    if prefix is not None:
        self._prefix = prefix
    log.debug('iface{}: current path: {}'.format(self.num, self.iface_dir))
    iface_parent = self.comp.path / self._prefix / 'interface'
    pattern = 'interface{}@*'.format(self.num)
    matches = list(iface_parent.glob(pattern))
    self.iface_dir = matches[0]
    log.debug('iface{}: new path: {}'.format(self.num, self.iface_dir))
    # the table directories live below iface_dir, so they must follow it
    self.update_lprt_dir()
    self.update_vcat_dir()
def get_endpoints(self, consumers):
    """Look up the add/remove callback endpoints for ``consumers``.

    Each consumer is used as a key into ``self.add_callback`` and then
    ``self.remove_callback``.  A consumer missing from either mapping is
    logged and skipped from that point on (an add endpoint that was
    already found stays in the result).

    Returns a tuple ``(add_endpoints, rm_endpoints)`` of two lists.
    """
    adds, removes = [], []
    for consumer in consumers:
        try:
            add_ep = self.add_callback[consumer]
            adds.append(add_ep)
            rm_ep = self.remove_callback[consumer]
            removes.append(rm_ep)
        except KeyError:
            log.debug('consumer {} has no subscribed endpoint'.format(consumer))
    return (adds, removes)
def get_peer_cclass(self, iface):
    """Return the peer's base component class, or None if it is not valid.

    The value is taken from the already-read ``iface`` structure; nothing
    is re-read from the component here.
    """
    genz = zephyr_conf.genz
    # Revisit: should this re-read value?
    # Unlike cstate & gcid, unless there's re-cabling, this can't change
    peer_state = genz.PeerState(iface.PeerState, iface)
    msg = '{}: get_peer_cclass[{}]: PeerBaseCClassValid={}, PeerCClass={}'
    log.debug(msg.format(self.comp.gcid, self.num,
                         peer_state.field.PeerBaseCClassValid,
                         iface.PeerBaseCClass))
    if peer_state.field.PeerBaseCClassValid == 1:
        return iface.PeerBaseCClass
    return None
def iface_unusable(self, iface):
    """Re-route every route that depends on the now-unusable ``iface``.

    Each impacted route is torn down and routing between its endpoints is
    set up again.  If that raises NetworkXNoPath, the route's source
    component is told that the destination is unreachable.
    """
    # lookup impacted routes
    for route in self.routes.impacted(iface):
        log.debug('route {} impacted by unusable {}'.format(route, iface))
        try:
            # route around failed link (if possible)
            self.teardown_routing(route.fr, route.to, route)
            self.setup_routing(route.fr, route.to)
        except nx.exception.NetworkXNoPath:
            # no valid route anymore, remove unreachable comp
            route.fr.unreachable_comp(route.to, iface, route)
def get_peer_cstate(self, iface):
    """Re-read PeerState from the component and return the peer's CState."""
    genz = zephyr_conf.genz
    # Re-read PeerState
    self.comp.control_read(iface, genz.InterfaceStructure.PeerState,
                           sz=4, off=4)
    state_reg = genz.PeerState(iface.PeerState, iface)
    cstate = CState(state_reg.field.PeerCState)
    msg = '{}: get_peer_c_state[{}]: PeerCState={!s}'
    log.debug(msg.format(self.comp.gcid, self.num, cstate))
    return cstate
def iface_read(self, prefix='control'):
    """Read this interface's 'interface' file and return the parsed struct.

    Also records the structure's HVS value in ``self.hvs``.
    """
    self.setup_paths(prefix)
    path = self.iface_dir / 'interface'
    with path.open(mode='rb+') as iface_fd:
        raw = bytearray(iface_fd.read())
        # the open file's descriptor is handed to the struct along with the
        # bytes, so the struct is built while the file is still open
        iface = self.comp.map.fileToStruct(
            'interface', raw, fd=iface_fd.fileno(),
            verbosity=self.comp.verbosity)
        log.debug('{}: interface{}={}'.format(self.comp.gcid, self.num, iface))
        self.hvs = iface.HVS  # for num_vcs()
    # end with
    return iface
def send_peer_attr1(self, iface, timeout=10000):
    """Request a Peer-Attr1 Link CTL and wait for its completion status.

    Sets PeerAttr1Req in IControl, writes IControl to the component and
    then polls via ``wait_link_ctl`` for up to ``timeout``.  Returns
    whatever ``wait_link_ctl`` returns.
    """
    genz = zephyr_conf.genz
    ctl = genz.IControl(iface.IControl, iface)
    ctl.field.PeerAttr1Req = 1
    iface.IControl = ctl.val
    log.debug('{}: sending Peer-Attr1'.format(self))
    self.comp.control_write(iface, genz.InterfaceStructure.IControl,
                            sz=4, off=4)
    result = self.wait_link_ctl(iface, timeout)
    # clear the request bit in the local copy only (no control write here)
    ctl.field.PeerAttr1Req = 0
    iface.IControl = ctl.val
    return result
def get_peer_gcid(self, iface):
    """Re-read the peer CID/SID fields and return the peer's GCID.

    When PeerSIDValid is clear, this component's own SID is used.  When
    PeerCIDValid is clear, the CID is None; if building the GCID then
    raises TypeError, None is returned instead.
    """
    genz = zephyr_conf.genz
    # Re-read PeerCID/PeerSID/PeerState
    self.comp.control_read(iface, genz.InterfaceStructure.PeerCID, sz=8)
    peer_state = genz.PeerState(iface.PeerState, iface)
    if peer_state.field.PeerCIDValid:
        cid = iface.PeerCID
    else:
        cid = None
    if peer_state.field.PeerSIDValid:
        sid = iface.PeerSID
    else:
        sid = self.comp.gcid.sid
    try:
        gcid = GCID(sid=sid, cid=cid)
    except TypeError:
        gcid = None
    log.debug('{}: get_peer_gcid[{}]: PeerGCID={}'.format(
        self.comp.gcid, self.num, gcid))
    return gcid
def wait_link_ctl(self, iface, timeout):
    """Poll IStatus until a Link CTL completes or the timeout expires.

    IStatus is re-read from the component on every iteration (at least
    once, even for a zero timeout).

    Args:
        iface: interface structure whose IStatus is re-read.
        timeout: maximum time to poll, in nanoseconds.

    Returns:
        The LinkCTLComplStatus field from the last IStatus read.  This is
        returned whether or not LinkCTLCompleted was observed.
    """
    genz = zephyr_conf.genz
    istatus = genz.IStatus(iface.IStatus, iface)
    # Use the monotonic clock for the elapsed-time check: the wall clock
    # (time.time_ns) can be stepped by NTP or an admin, which would end the
    # poll early or stretch it well past the requested timeout.
    start = time.monotonic_ns()
    done = False
    while not done:
        self.comp.control_read(iface, genz.InterfaceStructure.IStatus, sz=4)
        istatus.val = iface.IStatus
        log.debug('{}: wait_link_ctl[{}]: completed={}, status={}'.format(
            self.comp.gcid, self.num, istatus.field.LinkCTLCompleted,
            istatus.field.LinkCTLComplStatus))
        elapsed = time.monotonic_ns() - start
        done = ((elapsed > timeout) or
                (istatus.field.LinkCTLCompleted == 1))
    return istatus.field.LinkCTLComplStatus
def check_i_state(self, iface, timeout=500000000, do_read=True):
    """Poll the interface state until it is I-Up/I-LP or the timeout expires.

    Args:
        iface: interface structure holding IStatus.
        timeout: maximum time to poll, in nanoseconds.
        do_read: when true, IStatus is re-read from the component on each
            iteration; otherwise only the value already in ``iface`` is
            examined.

    Returns:
        The last observed IState, which is also stored in ``self.istate``.
    """
    genz = zephyr_conf.genz
    istatus = genz.IStatus(iface.IStatus, iface)
    # Use the monotonic clock for the elapsed-time check: the wall clock
    # (time.time_ns) can be stepped by NTP or an admin, which would end the
    # poll early or stretch it well past the requested timeout.
    start = time.monotonic_ns()
    done = False
    # NOTE(review): with do_read=False nothing in this loop refreshes
    # iface.IStatus, so a non-up state appears to spin until the timeout -
    # confirm whether callers rely on that.
    while not done:
        if do_read:
            self.comp.control_read(iface, genz.InterfaceStructure.IStatus,
                                   sz=4)
        istatus.val = iface.IStatus
        istate = IState(istatus.field.IState)
        log.debug('{}: check_i_state[{}]: state={}'.format(
            self.comp.gcid, self.num, istate))
        elapsed = time.monotonic_ns() - start
        # Revisit: allow caller to pass in list of expected states
        done = ((elapsed > timeout) or
                (istate in [IState.IUp, IState.ILP]))
    self.istate = istate
    return istate
def ierror_init(self, iface, icap1):
    """Configure interface error signalling and detection.

    Does nothing when ``icap1`` reports that interface error fields are
    not supported.  Otherwise every handled error is routed to one signal
    target (TgtIntr1 on a local bridge, TgtUEP elsewhere), IErrorSigTgt is
    written, and finally detection of the same errors is enabled.
    """
    if icap1.field.IfaceErrFieldsSup == 0:
        return
    genz = zephyr_conf.genz
    # The same field names are set in both IErrorSigTgt and IErrorDetect.
    # Revisit: other interface errors
    err_names = (
        'ExcessivePHYRetraining',
        'NonTransientLinkErr',
        'IfaceContainment',
        'IfaceFCFwdProgressViolation',
        'UnexpectedPHYFailure',
        'IfaceAE',
        'SwitchPktRelayFailure',
    )
    # Set IErrorSigTgt
    ierr_tgt = genz.IErrorSigTgt(iface.IErrorSigTgt, iface)
    if self.comp.local_br:
        sig_tgt = genz.SigTgt.TgtIntr1
    else:
        sig_tgt = genz.SigTgt.TgtUEP
    for name in err_names:
        setattr(ierr_tgt.field, name, sig_tgt)
    # ierr_tgt.val is indexed in three parts; recombine them into one value
    upper = ierr_tgt.val[2] << 32
    middle = ierr_tgt.val[1] << 16
    lower = ierr_tgt.val[0]
    iface.IErrorSigTgt = (upper | middle | lower)
    log.debug('{}: writing IErrorSigTgt'.format(self))
    # Revisit: at least on orthus, sz=6 turns into an 8-byte ControlWrite
    self.comp.control_write(iface, genz.InterfaceStructure.IErrorSigTgt,
                            sz=6)
    # Set IErrorDetect - last, after other IError fields setup
    ierr_det = genz.IErrorDetect(iface.IErrorDetect, iface)
    for name in err_names:
        setattr(ierr_det.field, name, 1)
    iface.IErrorDetect = ierr_det.val
    log.debug('{}: writing IErrorDetect'.format(self))
    # Revisit: switch doesn't like sz=2, off=2, because at least on orthus
    # that turns into a 4-byte ControlWrite to a 2-byte-aligned addr, so an
    # 8-byte write starting at IErrorStatus is issued instead.
    # Revisit: major side-effect - IErrorStatus is cleared (bits are RW1CS)
    self.comp.control_write(iface, genz.InterfaceStructure.IErrorStatus,
                            sz=8)
def setup_routing(self, fr: Component, to: Component,
                  write_ssdt=True, routes=None) -> List[Route]:
    """Install any routes from ``fr`` to ``to`` that are not yet present.

    Candidate routes come from ``find_routes`` (limited by the configured
    ``max_routes``).  Candidates already recorded for this pair are
    skipped; each new one is written and recorded.

    Returns the list of routes that were newly added.
    """
    try:
        existing = self.routes.get_routes(fr, to)
    except KeyError:
        # nothing recorded for this pair yet
        existing = []
    candidates = self.find_routes(fr, to, routes=routes,
                                  max_routes=zephyr_conf.args.max_routes)
    added = []
    for route in candidates:
        if route in existing:
            log.debug('skipping existing route {}'.format(route))
            continue
        log.info('adding route from {} to {} via {}'.format(fr, to, route))
        self.write_route(route, write_ssdt)
        self.routes.add(fr, to, route)
        added.append(route)
    return added
def phy_init(self):
    """Check the status of this interface's PHY.

    This does not actually init anything - it only checks PHY status.
    Returns the result of ``phy_status_ok`` for PHY 0, or True (with
    PHY-Up assumed) when an IndexError shows the PHY structure is absent.
    """
    # Revisit: handle multiple PHYs
    # Revisit: interface phy struct is mandatory, but does not exist
    # for links between hemispheres in current dual-hemisphere switch
    # NOTE(review): the except below also covers the parsing and status
    # code, so an IndexError raised there is treated as "missing" too.
    try:
        phy_parent = self.iface_dir / 'interface_phy'
        candidates = list(phy_parent.glob('interface_phy0@*'))
        phy_path = candidates[0] / 'interface_phy'
        with phy_path.open(mode='rb+') as phy_fd:
            raw = bytearray(phy_fd.read())
            phy = self.comp.map.fileToStruct(
                'interface_phy', raw, fd=phy_fd.fileno(),
                verbosity=self.comp.verbosity)
            log.debug('{}: phy{}={}'.format(self.comp.gcid, self.num, phy))
            return self.phy_status_ok(phy)
    except IndexError:
        log.debug('{}: phy{} missing - assume PHY-Up'.format(
            self.comp.gcid, self.num))
        self.phy_status = PHYOpStatus.PHYUp
        self.phy_tx_lwr = 0
        self.phy_rx_lwr = 0
        return True  # Revisit
def main():
    """Command-line entry point for the fabric manager.

    Parses arguments, loads the Gen-Z structure module for the selected
    spec version, reads (or creates) the fabric config file, starts the
    FMServer thread and the netlink_reader process, then initializes a
    Fabric for every ``/sys/devices/genz*`` path before waiting on the
    server thread.
    """
    global args, cols, genz
    cli = argparse.ArgumentParser()
    cli.add_argument('-k', '--keyboard', action='count', default=0,
                     help='break to interactive keyboard at certain points')
    cli.add_argument('-v', '--verbosity', action='count', default=0,
                     help='increase output verbosity')
    cli.add_argument('-A', '--accept-cids', action='store_true',
                     help='accept pre-existing HW CIDs for all components')
    cli.add_argument('-r', '--reclaim', action='store_true',
                     help='reclaim C-Up components via reset')
    cli.add_argument('-M', '--max-routes', action='store', default=None,
                     type=int,
                     help='limit number of routes between components')
    cli.add_argument('-R', '--random-cids', action='store_true',
                     help='generate random CIDs for all components')
    cli.add_argument('-S', '--sleep', type=float, default=0.0,
                     help='sleep time inserted at certain points')
    cli.add_argument('-G', '--genz-version', choices=['1.1'], default='1.1',
                     help='Gen-Z spec version of Control Space structures')
    cli.add_argument('-P', '--post_mortem', action='store_true',
                     help='enter debugger on uncaught exception')
    args = cli.parse_args()
    log.debug('Gen-Z version = {}'.format(args.genz_version))
    # module name uses underscores, e.g. version '1.1' -> genz.genz_1_1
    version_suffix = args.genz_version.replace('.', '_')
    genz = import_module('genz.genz_{}'.format(version_suffix))
    zephyr_conf.init(args, genz)
    args_vars = vars(args)
    log.debug('args={}'.format(args_vars))
    netlink = NetlinkManager(config='./zephyr-fm/alpaka.conf')
    struct_map = genz.ControlStructureMap()
    mgr_uuid = None  # by default, generate new mgr_uuid every run
    conf = Conf('zephyr-fm/zephyr-fabric.conf')
    try:
        data = conf.read_conf_file()
        fab_uuid = UUID(data['fabric_uuid'])
        if args.reclaim:
            mgr_uuid = UUID(data['mgr_uuid'])
    except FileNotFoundError:
        # create new skeleton file
        fab_uuid = uuid4()
        data = {
            'fabric_uuid': str(fab_uuid),
            'add_resources': [],
            'boundary_interfaces': [],
        }
        conf.write_conf_file(data)
    log.debug('conf={}'.format(conf))
    fabrics = {}
    if args.keyboard > 3:
        set_trace()
    mainapp = FMServer(conf, 'zephyr', **args_vars)
    server_thread = Thread(target=mainapp.run)
    server_thread.start()
    if args.keyboard > 3:
        set_trace()
    mp.set_start_method('forkserver')
    uep_args = dict(genz_version=args.genz_version,
                    verbosity=args.verbosity,
                    url='http://localhost:2021/fabric/uep')
    uep_proc = mp.Process(target=netlink_reader, kwargs=uep_args)
    uep_proc.start()
    fab_paths = Path('/sys/devices').glob('genz*')
    for fab_path in sorted(fab_paths):
        fab = Fabric(netlink, struct_map, fab_path,
                     random_cids=args.random_cids,
                     accept_cids=args.accept_cids,
                     fab_uuid=fab_uuid, conf=conf, mgr_uuid=mgr_uuid,
                     verbosity=args.verbosity)
        fabrics[fab_path] = fab
        conf.set_fab(fab)
        if args.keyboard > 1:
            set_trace()
        fab.fab_init()
        log.info('finished exploring fabric {}'.format(fab.fabnum))

    if not fabrics:
        log.info('no local Gen-Z bridges found')
        return

    if args.keyboard > 3:
        set_trace()
    conf.add_resources()  # Revisit: multiple fabrics
    if args.keyboard > 3:
        set_trace()
    server_thread.join()
def update_lprt_dir(self):
    """Refresh ``self.lprt_dir`` from the current ``self.iface_dir``.

    Does nothing when no LPRT directory was recorded (``lprt_dir`` is
    None).  Raises IndexError if no ``lprt@*`` entry is found.
    """
    if self.lprt_dir is not None:
        matches = list(self.iface_dir.glob('lprt@*'))
        self.lprt_dir = matches[0]
        log.debug('new lprt_dir = {}'.format(self.lprt_dir))
def iface_init(self, prefix='control'):
    """Read, configure and enable this interface, then verify it is I-Up.

    The steps are performed in a fixed order while the interface file is
    open: PHY status check, interface error setup, ICAP1Control,
    LinkCTLControl, Peer-Attr1 Link CTL, IControl (enable), IStatus
    clearing, I-state check, LinkRFCDisable, and finally saving the peer
    CState and GCID.  On a switch the VCAT is written afterwards.

    Returns False right after the PHY check when the PHY is not up (no
    further configuration is done); otherwise returns ``self.usable``,
    which is True only if ``check_i_state`` reported IState.IUp.
    """
    genz = zephyr_conf.genz
    self.setup_paths(prefix)
    iface_file = self.iface_dir / 'interface'
    is_switch = self.comp.has_switch
    with iface_file.open(mode='rb+') as f:
        data = bytearray(f.read())
        iface = self.comp.map.fileToStruct('interface', data, fd=f.fileno(),
                                           verbosity=self.comp.verbosity)
        log.debug('{}: iface_init interface{}={}'.format(
            self.comp.gcid, self.num, iface))
        self.hvs = iface.HVS
        if not self.phy_init():
            log.info('{}: interface{} is not PHY-Up'.format(
                self.comp.gcid, self.num))
            self.usable = False
            # Revisit: should config iface even if not PHY-Up
            return False
        icap1 = genz.ICAP1(iface.ICAP1, iface)
        self.ierror_init(iface, icap1)
        # Revisit: select compatible LLR/P2PNextHdr/P2PEncrypt settings
        # Revisit: set CtlOpClassPktFiltEnb, if Switch
        # (for now) enable Explicit OpCodes, and LPRT (if Switch)
        icap1ctl = genz.ICAP1Control(iface.ICAP1Control, iface)
        icap1ctl.field.OpClassSelect = 0x1
        icap1ctl.field.LPRTEnb = is_switch
        iface.ICAP1Control = icap1ctl.val
        log.debug('{}: writing ICAP1Control'.format(self))
        self.comp.control_write(iface,
                                genz.InterfaceStructure.ICAP1Control,
                                sz=4, off=4)
        # set LinkCTLControl (depending on local_br, is_switch)
        lctl = genz.LinkCTLControl(iface.LinkCTLControl, iface)
        # xmit bits set on local_br and all switch ports
        xmit = 1 if (self.comp.local_br or is_switch) else 0
        lctl.field.XmitPeerCUpEnb = xmit
        lctl.field.XmitPeerCResetEnb = xmit
        lctl.field.XmitPeerEnterLinkUpLPEnb = xmit
        lctl.field.XmitPeerEnterLinkLPEnb = xmit
        lctl.field.XmitLinkResetEnb = xmit
        # Revisit: recv bits should be set everywhere except on local_br
        # or "hostile" sw ports
        recv = 0 if (self.comp.local_br) else 1
        lctl.field.RecvPeerCUpEnb = recv
        lctl.field.RecvPeerCResetEnb = recv
        lctl.field.RecvPeerEnterLinkUpLPEnb = recv
        lctl.field.RecvPeerEnterLinkLPEnb = recv
        lctl.field.RecvLinkResetEnb = recv
        iface.LinkCTLControl = lctl.val
        log.debug('{}: writing LinkCTLControl'.format(self))
        self.comp.control_write(iface,
                                genz.InterfaceStructure.LinkCTLControl,
                                sz=4, off=4)
        # send Peer-Attribute 1 Link CTL - HW did this at link-up time,
        # but we don't know when that was, and things may have changed
        status = self.send_peer_attr1(iface, timeout=100000)
        # a zero status is reported as a timeout, but init continues
        if status == 0:
            log.warning(
                '{}: send_peer_attr1 timeout on interface{}'.format(
                    self.comp.gcid, self.num))
        # Revisit: path time does not currently work in HW
        # send Path Time Link CTL
        #status = self.send_path_time(iface, timeout=100000)
        #if status == 0:
        #    log.warning('{}: send_path_time timeout on interface{}'.format(
        #        self.comp.gcid, self.num))
        # save PeerInterfaceID
        self.peer_iface_num = self.get_peer_iface_num(iface)
        ictl = genz.IControl(iface.IControl, iface)
        # set IfaceAKeyValidationEnb (if supported)
        ictl.field.IfaceAKeyValidationEnb = (
            1 if icap1.field.IfaceAKeyValidationSup else 0)
        # Revisit: set Ingress/Egress AKeyMask
        # Revisit: set IngressDREnb only when needed
        ictl.field.IngressDREnb = 1
        # enable interface
        ictl.field.IfaceEnb = 1
        iface.IControl = ictl.val
        log.debug('{}: writing IControl IfaceEnb'.format(self))
        self.comp.control_write(iface,
                                genz.InterfaceStructure.IControl,
                                sz=4, off=4)
        # clear IStatus RW1C bits that we might care about later
        istatus = genz.IStatus(0, iface)  # all 0 IStatus
        istatus.field.FullIfaceReset = 1
        istatus.field.WarmIfaceReset = 1
        istatus.field.LinkRFCStatus = 1
        istatus.field.PeerLinkRFCReady = 1
        istatus.field.ExceededTransientErrThresh = 1
        istatus.field.LUpToLLPTransitionFailed = 1
        istatus.field.IfaceContainment = 1
        iface.IStatus = istatus.val
        log.debug('{}: writing IStatus, val={:#x}'.format(
            self, istatus.val))
        self.comp.control_write(iface,
                                genz.InterfaceStructure.IStatus,
                                sz=4, off=0)
        # verify I-Up
        state = self.check_i_state(iface)
        self.usable = (state is IState.IUp)
        # Revisit: orthus goes I-Down if we do this earlier
        # set LinkRFCDisable (depending on local_br)
        ictl.field.LinkRFCDisable = 1 if self.comp.local_br else 0
        iface.IControl = ictl.val
        log.debug('{}: writing IControl LinkRFCDisable'.format(self))
        self.comp.control_write(iface,
                                genz.InterfaceStructure.IControl,
                                sz=4, off=4)
        # save PeerCState & PeerGCID
        self.peer_cstate = self.get_peer_cstate(iface)
        self.peer_gcid = self.get_peer_gcid(iface)
    # end with
    if is_switch:
        # initialize VCAT: one entry per VC 0..hvs, each with only its own
        # bit (1 << vc) set
        # Revisit: multiple Action columns
        log.debug('{}: writing VCAT'.format(self))
        for vc in range(0, self.hvs + 1):
            # Revisit: vc policies
            self.vcat_write(vc, (1 << vc))
    log.debug('{}: iface_init done'.format(self))
    return self.usable
def update_vcat_dir(self):
    """Refresh ``self.vcat_dir`` from the current ``self.iface_dir``.

    Does nothing when no VCAT directory was recorded (``vcat_dir`` is
    None).  Raises IndexError if no ``vcat@*`` entry is found.
    """
    if self.vcat_dir is not None:
        matches = list(self.iface_dir.glob('vcat@*'))
        self.vcat_dir = matches[0]
        log.debug('new vcat_dir = {}'.format(self.vcat_dir))