def test_acm_by_lid_gid(acms, sample_lids, sample_gids, data):
    """Run the ACM LID and GID query tests from every ACM node.

    For each non-empty entry of ``acms`` the node's source LID is read from
    ``data[node][LID]`` and ``test_acm_by_lid_query`` is called once per entry
    of ``sample_lids``.  The node's source GID is then read with ``ibaddr``
    and ``test_acm_by_gid_query`` is called once per entry of ``sample_gids``.
    The ``ib_acme`` counters are printed around each phase.

    The run stops at the first query that reports a non-zero status.

    Args:
        acms: iterable of ACM node names; empty strings are skipped.
        sample_lids: destination LIDs to query.
        sample_gids: destination GIDs to query.
        data: mapping indexed as ``data[node][LID]`` to get a node's LID.

    Returns:
        0 when every query succeeded, otherwise the status of the first
        failing query.
    """
    rule = '==================================================================='
    status = 0

    print(rule)
    print('=================== TEST ACM BY LID AND GID =======================')
    print(rule)

    for node in acms:
        if node == '':
            continue

        slid = data[node][LID]

        print('Testing %s with %d LIDs' % (node, len(sample_lids)))
        (_, counters) = ib_acme_get_counters(node)
        print('Before LID test %s' % (counters,))
        for lid in sample_lids:
            status = test_acm_by_lid_query(node, slid, lid)
            if status != 0:
                break
        (_, counters) = ib_acme_get_counters(node)
        print('After LID test\n%s' % (counters,))
        if status != 0:
            # Stop here: the GID loop below reassigns ``status`` and would
            # otherwise hide a LID failure behind a successful GID query.
            break

        (_, sgid) = ssa_tools_utils.execute_on_remote(
            "/usr/sbin/ibaddr |awk '{print $2}'", node)

        print('Testing %s with %d GIDs' % (node, len(sample_gids)))
        (_, counters) = ib_acme_get_counters(node)
        print('Before GID test\n%s' % (counters,))
        for gid in sample_gids:
            status = test_acm_by_gid_query(node, sgid, gid)
            if status != 0:
                break
        if status != 0:
            break
        (_, counters) = ib_acme_get_counters(node)
        print('After GID test\n%s' % (counters,))

    print('Run on %d nodes, each to %d lids and %d gids' % (
        len(acms), len(sample_lids), len(sample_gids)))
    print(rule)
    print('========= TEST ACM BY LID AND GID COMPLETE (status: %d) ===========' % (
        status))
    print(rule)

    return status
def change_and_load_ip(cores, old_ip, new_ip):
    """Rewrite an IP in the core preload file on each core and HUP opensm.

    On every node of ``cores`` a ``sed -i`` replaces ``old_ip`` at the start
    of a line with ``new_ip`` in ``CORE_PRELOAD_FILE_PATH``; then SIGHUP is
    sent to the ``opensm`` process.  Command results are not inspected.

    Returns:
        Always 0.
    """
    rewrite_cmd = "sed -i 's/^%s/%s/g' %s" % (old_ip, new_ip,
                                              CORE_PRELOAD_FILE_PATH)
    reload_cmd = "kill -s HUP `pidof opensm`"

    for core in cores:
        for remote_cmd in (rewrite_cmd, reload_cmd):
            (_, _) = ssa_tools_utils.execute_on_remote(remote_cmd, core)

    return 0
def get_node_remote(node):
    """Return ``(remote_lid, remote_port)`` of the peer attached to ``node``.

    Both values are taken from ``smpquery`` output on ``node``: the text after
    the last '.' of the output, cut at the first newline.
    """
    #
    # HACK: it is assumed that node machine is connected to a remote node with port 1
    #
    def _trailing_field(query):
        # Run the query remotely and keep what follows the last dot.
        (_, text) = ssa_tools_utils.execute_on_remote(query, node)
        return text.rpartition('.')[2].partition('\n')[0]

    remote_lid = _trailing_field('smpquery PI -D 0,1 | grep ^Lid')
    remote_port = _trailing_field('smpquery NI -D 0,1 | grep LocalPort')

    return (remote_lid, remote_port)
def get_node_ip(node, node_active_interface):
    """Return the IPv4 address of ``node_active_interface`` on ``node``.

    The address is extracted remotely from ``ip address show``: second column
    of the ``inet `` line, prefix length and newlines removed.  The command
    output is returned unchanged.
    """
    pipeline = ("ip address show dev %s | grep 'inet ' "
                " | awk '{print $2}' | cut -f1 -d'/' "
                " | tr -d '\n'")
    (_, ip) = ssa_tools_utils.execute_on_remote(
        pipeline % (node_active_interface), node)
    return ip
def get_node_ip_mask(node, node_active_interface):
    """Return the netmask field ``ifconfig`` reports for the interface.

    Runs ``ifconfig <interface>`` on ``node``, keeps the fourth column of the
    line containing ``Mask`` and strips newlines.  The command output is
    returned unchanged.
    """
    pipeline = ("ifconfig %s | grep Mask "
                " | awk '{print $4}' | tr -d '\n'")
    (_, mask) = ssa_tools_utils.execute_on_remote(
        pipeline % (node_active_interface), node)
    return mask
def get_active_ib_interface(node):
    """Return 'ib0' when port 1 of ``node`` reports LinkUp, else 'ib1'."""
    # assumes only 2 ports, called ib0 and ib1
    (_, link_up) = ssa_tools_utils.execute_on_remote(
        'ibportstate -D 0 1 | grep LinkUp', node)
    return 'ib0' if len(link_up) > 0 else 'ib1'
def change_node_ip(node, new_ip, new_netmask):
    """Assign ``new_ip``/``new_netmask`` to the active IB interface of ``node``.

    The interface is chosen by ``get_active_ib_interface`` and configured with
    ``ifconfig``.  The command result is not checked.

    Returns:
        Always 0.
    """
    iface = get_active_ib_interface(node)
    ifconfig_cmd = 'ifconfig %s %s netmask %s' % (iface, new_ip, new_netmask)
    (_, _) = ssa_tools_utils.execute_on_remote(ifconfig_cmd, node)

    # assumes reconfiguring doesn't fail FIXME
    return 0
def sanity_test_0(cores, als, acms, lids, gids, ipv4s, ipv6s, data):
    """Sanity test 0: LID/GID and IP resolution from the ACM nodes.

    Reads the local hostname, local LID, SM LID and SM GID with ``hostname``,
    ``ibstat`` and ``saquery``; exits the process with status 1 when any of
    the four is missing.  Then issues one initial GID and one initial LID
    ``ib_acme`` query towards the SM from every ACM node, waits 10 seconds,
    draws up to ``sample_size`` random entries from each address list and
    runs ``test_acm_by_lid_gid`` followed by ``test_ip``.

    ``cores`` and ``als`` are accepted but not used here.

    Returns:
        0 on success, otherwise the non-zero status of the failing sub-test.
    """
    rule = '==================================================================='
    base_lid_cmd = "/usr/sbin/ibstat |grep -a5 Act|grep Base|awk '{print $NF}'"
    sm_lid_cmd = "/usr/sbin/ibstat |grep -a5 Act|grep SM|awk '{print $NF}'"
    sm_gid_fmt = "/usr/sbin/saquery --src-to-dst %s:%s|grep dgid"

    hostname = commands.getoutput('hostname')
    slid = commands.getoutput(base_lid_cmd).rstrip('\n')
    osmlid = commands.getoutput(sm_lid_cmd).rstrip('\n')
    osmgid = commands.getoutput(sm_gid_fmt % (slid, osmlid)).split('.')[-1]

    # Each of the four values must be exactly one whitespace-free token.
    tokens = osmlid.split() + slid.split() + osmgid.split() + hostname.split()
    if len(tokens) != 4:
        print('Failed to get basic info')
        print("%s\n%s" % (sm_lid_cmd, osmlid))
        print("%s\n%s" % (base_lid_cmd, slid))
        print("%s\n%s" % (sm_gid_fmt % (slid, osmlid), osmgid))
        print("hostname\n%s" % hostname)
        sys.exit(1)

    print(rule)
    print('========================= SANITY TEST 0 ===========================')
    print(rule)

    # Initial ib_acme query in order to make sure there was PRDB update
    for node in acms:
        if node == '':
            continue

        (_, node_gid) = ssa_tools_utils.execute_on_remote(
            "/usr/sbin/ibaddr |awk '{print $2}'", node)
        node_lid = data[node][LID]

        (_, _) = ib_acme_query('g', osmgid, node_gid, '-c -v', node)
        (_, _) = ib_acme_query('l', osmlid, node_lid, '-c -v', node)

    time.sleep(10)

    # Sampling order is kept (gids, lids, ipv4s, ipv6s) so the random stream
    # is consumed the same way.
    sample_gids = random.sample(gids, min(len(gids), sample_size))
    sample_lids = random.sample(lids, min(len(lids), sample_size))
    sample_ipv4s = random.sample(ipv4s, min(len(ipv4s), sample_size))
    sample_ipv6s = random.sample(ipv6s, min(len(ipv6s), sample_size))

    status = test_acm_by_lid_gid(acms, sample_lids, sample_gids, data)
    if status != 0:
        return status

    status = test_ip(acms, sample_ipv4s, sample_ipv6s)
    if status != 0:
        return status

    print(rule)
    print('==================== SANITY TEST 0 COMPLETE =======================')
    print(rule)

    return status
def get_logs_from_node(node, dest_dir, node_type):
    """Collect logs, configuration and system information of ``node``.

    Everything is copied by commands executed on ``node`` into
    ``<dest_dir>/<node>``; that directory is also inspected locally, so it is
    presumably on storage shared with the node -- TODO confirm.

    Steps:
      * write ``<dest_dir>/<node>.info`` through ``get_system_info``;
      * copy every path listed in ``file_list[node_type]``;
      * for each copied file whose name ends in ``cfg``, copy the directories
        named by its non-comment lines containing ``_dir``;
      * copy /etc/rdma, the dmesg output, ``file_list['other_files']`` and the
        shared libraries used by ibssa/ibacm/opensm.

    Args:
        node: node name handed to ``ssa_tools_utils.execute_on_remote``.
        dest_dir: directory that receives the per-node sub-directory.
        node_type: key into ``file_list``.

    Returns:
        Always 0; results of the copy commands are not checked.
    """
    node_dest = '%s/%s' % (dest_dir, node)
    if not os.path.exists(node_dest):
        os.makedirs(node_dest)
    os.system('chmod 777 -R %s' % node_dest)

    get_system_info(node, dest_dir)

    for path in file_list[node_type]:
        (_, _) = ssa_tools_utils.execute_on_remote(
            'cp -r %s %s > /dev/null' % (path, node_dest), node)

        local_copy = '%s/%s' % (node_dest, os.path.basename(path))
        if not (os.path.exists(local_copy) and local_copy.endswith('cfg')):
            continue

        # Follow the "<name>_dir <path>" settings of the copied cfg file.
        with open(local_copy, 'r') as cfg:
            for line in cfg:
                if line.startswith('#') or line.find('_dir') < 0:
                    continue
                fields = line.split()
                if len(fields) < 2:
                    # setting without a value: nothing to copy
                    continue
                local_dirs = '%s*' % fields[1]
                (_, _) = ssa_tools_utils.execute_on_remote(
                    'cp -r %s %s > /dev/null' % (local_dirs, node_dest), node)

    # Ordered list so the commands always run in the same sequence.
    cmds = [
        ('rdma', 'cp -r /etc/rdma %s > /dev/null' % node_dest),
        ('dmesg', '/bin/dmesg -T > %s/%s_dmesg.log' % (node_dest, node)),
        ('misc_files', 'cp -r %s %s > /dev/null' %
         (' '.join(file_list['other_files']), node_dest)),
        ('libs', "ldd `which ibssa ibacm opensm` | awk '{print $3}' | grep '/' | sort | uniq"),
    ]

    for (cmd_type, cmd) in cmds:
        (_, out) = ssa_tools_utils.execute_on_remote(cmd, node)
        if cmd_type == 'libs':
            libs = ' '.join(out.split())
            if libs:
                # -L: copy the library files themselves, not the symlinks
                (_, _) = ssa_tools_utils.execute_on_remote(
                    'cp -rL %s %s > /dev/null' % (libs, node_dest), node)

    print('%s finished log collection %s' % (node, dest_dir))
    return 0
def test_acm_by_lid_query(node, slid, dlid, initial_query=0, print_err=1):
    """Query a path record by LID from ``node`` and check it came from cache.

    Runs ``ib_acme -f l -d dlid -s slid -c -v`` on ``node`` between two
    counter snapshots and compares the snapshots with ``compare_outs`` at
    ``route_cache_count_index``.

    Args:
        node: node on which ``ib_acme`` runs.
        slid: source LID.
        dlid: destination LID.
        initial_query: when 1, issue one extra query first and wait 10 s.
        print_err: when 1, print diagnostics for the "failed" and
            "not taken from cache" cases.

    Returns:
        0 on success; 1 when the query output contains 'failed' and no
        'success'; 2 when ``compare_outs`` returns 0; 3 when it returns
        "-1".  A later check overrides an earlier one.
    """
    def _show_devinfo():
        # Print the HCA state of the node to help diagnose the failure.
        (_, devinfo) = ssa_tools_utils.execute_on_remote(
            '/usr/local/bin/ibv_devinfo', node)
        print(devinfo)

    verbose = (print_err == 1)
    status = 0

    if initial_query == 1:
        print('Executing initial ib_acme query on %s (lid %s) node' % (node, slid))
        (_, _) = ib_acme_query('l', dlid, slid, '-c', node)
        time.sleep(10)

    print('%s -f l -d %s -s %s -c -v %s' % (ib_acme, dlid, slid, node))

    (_, counters_before) = ib_acme_get_counters(node)
    (_, reply) = ib_acme_query('l', dlid, slid, '-c -v', node)

    if 'failed' in reply and 'success' not in reply:
        if verbose:
            print('ERROR. ACM on %s failed (lid test)' % node)
            _show_devinfo()
        status = 1

    (_, counters_after) = ib_acme_get_counters(node)
    ret = compare_outs(counters_before, counters_after, route_cache_count_index)

    if ret == 0:
        if verbose:
            print('ERROR. %s PR was not taken from cache (lid test)' % node)
            _show_devinfo()
        status = 2
    elif ret == "-1":
        # reported regardless of print_err
        print('ERROR. %s failed' % node)
        _show_devinfo()
        status = 3

    return status
def get_IPv6_addr(node, node_active_interface):
    """Return the single IPv6 address of ``node_active_interface`` on ``node``.

    The ``inet6`` lines of ``ip address show`` are counted first; exactly one
    is required.  The address is then extracted by a second remote command
    (second column, prefix length and newlines removed).

    Returns:
        The address text on success; ``-1`` (int) when more than one inet6
        line is present; ``"-1"`` (str) when none is present.
    """
    (_, inet6_lines) = ssa_tools_utils.execute_on_remote(
        "ip address show dev %s | grep 'inet6'" % (node_active_interface), node)
    address_count = inet6_lines.count('\n')

    # FIXME: necessary?
    if address_count > 1:
        print('ERROR: node %s interface %s has more than 1 IPv6 address' % (
            node, node_active_interface))
        # NOTE(review): integer sentinel here, string sentinel below --
        # callers must handle both; confirm which one is intended.
        return -1

    # FIXME: same
    if address_count < 1:
        print('ERROR: node %s interface %s has no IPv6 address' % (
            node, node_active_interface))
        return "-1"

    extract_cmd = ("ip address show dev %s | grep 'inet6' "
                   " | awk '{print $2}' | cut -f1 -d'/' "
                   " | tr -d '\n'") % (node_active_interface)
    (_, IPv6) = ssa_tools_utils.execute_on_remote(extract_cmd, node)

    return IPv6
def run_cmd(global_dict, cmd):
    """Execute ``cmd`` on every distrib, access, acm and core node.

    Node lists are read from ``global_dict['<type>_nodes']`` and walked in
    chunks of ``ssa_tools_utils.MAX_THREAD``.

    Returns:
        1 when at least one node returned a non-zero exit code, else 0.
    """
    any_failed = False

    for role in ('distrib', 'access', 'acm', 'core'):
        chunks = ssa_tools_utils.devide_list_chunks(
            global_dict[role + '_nodes'], ssa_tools_utils.MAX_THREAD)
        for chunk in chunks:
            for node in chunk:
                (rc, _) = ssa_tools_utils.execute_on_remote(cmd, node)
                if int(rc):
                    any_failed = True

    return 1 if any_failed else 0
def check_error(global_dict):
    """Scan the SSA log files of all nodes for error lines and report them.

    The opensm log of every core node is grepped first (its output is not
    evaluated here).  Then, for each distrib/access/acm node, the node's log
    is searched for ERROR / ERR: / BACKTRACE lines, ignoring "ERROR 111";
    matches are printed and remembered.  Each node is also checked for an
    active port with ``ibstat``; a failure there is printed but does not
    change the returned status.

    Args:
        global_dict: mapping with ``distrib_nodes``, ``access_nodes`` and
            ``acm_nodes`` lists and optionally ``core_nodes``.

    Returns:
        0 when no error lines were found, 1 otherwise.
    """
    status = 0

    # The fallback must be a list: iterating the bare string 'osm' would
    # run the command on hosts 'o', 's' and 'm'.
    for osm_node in global_dict.get('core_nodes', ['osm']):
        ssa_tools_utils.execute_on_remote(
            'cat %s | egrep "ERROR|ERR:"' %
            (ssa_tools_utils.CFG_FILES['osm_logfile']), osm_node)

    errors = {}
    for typ in ['distrib', 'access', 'acm']:
        for nodes in ssa_tools_utils.devide_list_chunks(
                global_dict['%s_nodes' % typ], ssa_tools_utils.MAX_THREAD):
            for node in nodes:
                if node == '':
                    continue

                (rc, out) = ssa_tools_utils.execute_on_remote(
                    'cat %s | egrep "ERROR|ERR:|BACKTRACE" | grep -v "ERROR 111"' %
                    (ssa_tools_utils.CFG_FILES['%s_logfile' % typ]), node)
                if out != "":
                    print('%s %s \n%s' % (typ, node, out))
                    errors[node] = out
                    status = 1

                (rc, out) = ssa_tools_utils.execute_on_remote(
                    '/usr/sbin/ibstat | grep -i active', node)
                out = out.rstrip('\n')
                if int(rc) != 0 or out == '':
                    print('ERROR. Check ib modules on %s:%s' % (node, out))
                    #status = 2
                    #errors[node] = out

    if status == 0:
        print('Report: No errors found in logs')
    else:
        # Wording kept as is: the reconnect test searches ssa_errors.log
        # for the text 'Found errors found on'.
        print('Report: Found errors found on')
        for (host, found) in errors.items():
            if len(found) == 0:
                continue
            print('*************** %s ***********************' % host)
            print(found)

    return status
def kcache_ip_lookup(node, active_interface, addr_to_search, entry_type):
    """Check the neighbour-cache entry of ``addr_to_search`` on ``node``.

    Runs ``ip neigh show dev <interface> <address>`` on ``node`` and compares
    the last word of the output with ``entry_type``.

    Returns:
        0 when the entry exists and has the expected type, 2 otherwise.
    """
    (_, neigh) = ssa_tools_utils.execute_on_remote(
        'ip neigh show dev %s %s' % (active_interface, addr_to_search), node)

    if len(neigh) == 0:
        print('ERROR: ip %s not found in node %s cache' % (addr_to_search, node))
        return 2

    if neigh.split()[-1] != entry_type:
        print('ERROR: ip %s node %s cache is not %s' % (addr_to_search, node,
                                                          entry_type))
        return 2

    return 0
def node_port_state_change(node, state):
    """Apply ``state`` to the fabric port that ``node`` is attached to.

    The last word of the node's ``ibstat`` "Node GUID" line is looked up in
    the table returned by ``read_ibnetdiscover``; elements 0 and 1 of that
    entry are passed to ``ibportstate -G`` together with ``state``.

    Args:
        node: node whose peer port is changed.
        state: ``ibportstate`` operation, passed through unchanged.

    Returns:
        "<entry[0]> <entry[1]>" of the port that was changed, or None when
        the port could not be determined or the command could not be run.
    """
    net = read_ibnetdiscover()
    pprint(net)
    (_, out) = ssa_tools_utils.execute_on_remote(
        '/usr/sbin/ibstat |grep Node |grep GUID', node)

    try:
        peer = net[out.split()[-1]]
        SW_GID = peer[0]
        SW_PORT = peer[1]
        o = commands.getoutput('ibportstate -G %s %s %s' %
                               (SW_GID, SW_PORT, state))
        print('[%s] ibportstate -G %s %s %s\n%s' % (
            time.strftime("%b %d %H:%M:%S"), SW_GID, SW_PORT, state, o))
        return "%s %s" % (SW_GID, SW_PORT)
    except Exception as err:
        # Still best-effort, but no longer swallows KeyboardInterrupt or
        # SystemExit and no longer hides the reason for the failure.
        print('ERROR: unable to change port state of %s to %s: %s' % (
            node, state, err))
        return None
def get_system_info(node, dest_dir):
    """Write basic system information of ``node`` to ``<dest_dir>/<node>.info``.

    Each command of a fixed list is executed on ``node``; the file receives
    one ``#<command>`` header line followed by the command output.

    Args:
        node: node on which the commands are executed.
        dest_dir: existing local directory for the ``.info`` file.

    Returns:
        Always 0.
    """
    # Not called ``commands``: that name is a module used elsewhere in this file.
    info_cmds = [
        'date',
        'df -lh',
        'ibssa -v',
        'ibacm -v',
        'uname -a',
        'cat /etc/issue',
        '/usr/sbin/ibstat',
        'opensm --version | grep OpenSM',
        'cat /etc/sysctl.conf | grep core_pattern',
    ]

    node_info_file = '%s/%s.info' % (dest_dir, node)
    # ``with`` closes the file even when a remote command raises.
    with open(node_info_file, 'w') as f:
        for cmd in info_cmds:
            (_, out) = ssa_tools_utils.execute_on_remote(cmd, node)
            f.write("#%s\n%s\n\n" % (cmd, out))

    return 0
def test_0_0_0(ibmsnet):
    """Reconnect test: repeatedly disable and re-enable the port of an ACM node.

    Until ``rch_global_dict['timeout']`` seconds have passed, a random ACM
    node is picked, its fabric port is looked up in the ibnetdiscover table
    and then disabled and re-enabled with ``ibportstate``, with random pauses
    before and in between.  Afterwards the status, error and log-collection
    scripts are run; the test fails when ssa_errors.log contains
    'Found errors found on' or ssa_status.log contains 'STOP'.

    ``ibmsnet`` is not used.

    Returns:
        0 on success, 1 when errors were detected.
    """
    test_header = inspect.getframeinfo(inspect.currentframe()).function
    phase = 'Reconnect Test'
    rch_global_dict['test_description'][test_header] = phase
    status = 0

    net = ssa_utils.read_ibnetdiscover()
    pprint(net)

    deadline = time.time() + rch_global_dict['timeout']
    while time.time() < deadline:
        acm_to_disconnect = random.choice(rch_global_dict['acm_nodes'])
        delay = random.randint(0, rch_global_dict['max_interval'])

        (_, guid_line) = ssa_tools_utils.execute_on_remote(
            '/usr/sbin/ibstat|grep Node |grep GUID', acm_to_disconnect)
        print(guid_line)
        print(acm_to_disconnect)
        print(net)
        print(guid_line.split())

        node_guid = guid_line.split()[-1]
        print('Disconnecting %s %s ' % (acm_to_disconnect, net[node_guid]))
        SW_GID = net[node_guid][0]
        SW_PORT = net[node_guid][1]
        port_cmd = '/usr/sbin/ibportstate -G %s %s' % (SW_GID, SW_PORT)

        time.sleep(delay)
        print('Report: Reconnect node %s after %d ' % (acm_to_disconnect, delay))

        reply = commands.getoutput('%s disable' % port_cmd)
        print('[%s] %s disable\n%s' % (time.strftime("%b %d %H:%M:%S"),
                                        port_cmd, reply))

        delay = random.randint(rch_global_dict['min_interval'],
                               rch_global_dict['max_interval'])
        time.sleep(delay)

        reply = commands.getoutput('%s enable' % port_cmd)
        print('[%s] %s enable\n%s' % (time.strftime("%b %d %H:%M:%S"),
                                       port_cmd, reply))

    home = ssa_tools_utils.SSA_HOME
    topology = rch_global_dict['topology']
    log_dir = rch_global_dict['log_dir']
    report_cmds = [
        '%s/maintain.py -t %s --setup status > %s/ssa_status.log' % (home, topology, log_dir),
        '%s/maintain.py -t %s -e > %s/ssa_errors.log' % (home, topology, log_dir),
        '%s/logcollector.py -t %s -o %s' % (home, topology, log_dir),
    ]
    for cmd in report_cmds:
        print(cmd)
        commands.getoutput(cmd)

    error_log = commands.getoutput("cat %s/ssa_errors.log" % rch_global_dict['log_dir'])
    status_log = commands.getoutput("cat %s/ssa_status.log" % rch_global_dict['log_dir'])
    if 'Found errors found on' in error_log or 'STOP' in status_log:
        print('There are errors in %s/ssa_errors.log' % rch_global_dict['log_dir'])
        status = 1

    print('See logs in %s' % rch_global_dict['log_dir'])
    test_report(test_header, phase, status)
    return status
def get_distribution_tree(master_sm_node):
    """Read the SSA distribution tree from the plugin log of the master SM.

    The plugin log on ``master_sm_node`` is read (lines containing the
    current abbreviated month name are filtered out) and the text after the
    last 'General SSA distribution tree info' marker is split into lines.
    The read is repeated for up to 300 seconds until more than one line is
    available.

    Returns:
        The list of lines from the last read.
    """
    timeout = 300
    start_time = time.time()

    while True:
        if time.time() - start_time > timeout:
            break

        month = time.strftime("%b")
        read_cmd = "cat %s |grep -v %s" % (
            ssa_tools_utils.CFG_FILES['plugin_logfile'], month)
        print("%s>#%s" % (master_sm_node, read_cmd))
        (_, log_text) = ssa_tools_utils.execute_on_remote(read_cmd, master_sm_node)

        after_marker = log_text.encode('ascii', 'ignore').split(
            'General SSA distribution tree info')[-1]
        tree = after_marker.split('\n')
        print('[%s] Unfiltered Distribution tree looks like\n%s' % (
            master_sm_node, tree))

        if len(tree) > 1:
            return tree

    return tree
def node_port_bounce(node, delay=0):
    """Disable, wait ``delay`` seconds, then re-enable the port ``node`` uses.

    The port is found by looking up the last word of the node's ``ibstat``
    "Node GUID" line in the table returned by ``read_ibnetdiscover``.
    Bouncing the local host is refused.

    Returns:
        1 when ``node`` is the local host, 0 otherwise.
    """
    if node == commands.getoutput('hostname'):
        print('Unable to execute port bounce on local node %s' % node)
        return 1

    net = read_ibnetdiscover()
    (_, guid_line) = ssa_tools_utils.execute_on_remote(
        '/usr/sbin/ibstat |grep Node |grep GUID', node)
    SW_GID = net[guid_line.split()[-1]][0]
    SW_PORT = net[guid_line.split()[-1]][1]

    def _set_port(action):
        # Run ibportstate locally and log the command with a timestamp.
        port_cmd = 'ibportstate -G %s %s %s' % (SW_GID, SW_PORT, action)
        reply = commands.getoutput(port_cmd)
        print('[%s] %s\n%s' % (time.strftime("%b %d %H:%M:%S"), port_cmd, reply))

    _set_port('disable')
    time.sleep(delay)
    _set_port('enable')

    return 0
def ib_acme_query(addr_format, dest, src, additional_flags, node):
    """Run an ``ib_acme`` query on ``node`` and return the remote result.

    The command line is ``<ib_acme> -f <addr_format> -d <dest> -s <src>
    <additional_flags>``; ``dest`` and ``src`` are converted with ``str``.

    Returns:
        Whatever ``ssa_tools_utils.execute_on_remote`` returns.
    """
    argv = [ib_acme, '-f', addr_format, '-d', str(dest), '-s', str(src),
            additional_flags]
    return ssa_tools_utils.execute_on_remote(' '.join(argv), node)
def test_0_0_0(ibmsnet):
    """Reboot test: repeatedly reboot a random node and restart SSA on it.

    Until ``rch_global_dict['timeout']`` seconds have passed, one node is
    picked at random, rebooted, polled with ``uptime`` until it answers
    (at most ``boot_retries`` polls), then SSA and the ib_acme stress script
    are started on it again.  After the loop ib_acme is stopped and the
    status, error and log-collection scripts are run.

    ``ibmsnet`` is not used.

    Returns:
        0 on success, 1 when a node did not come back or ib_acme could not
        be started.
    """
    # The test is registered under this function's own name.
    test_header = inspect.getframeinfo(inspect.currentframe()).function
    phase = 'Reboot Test'
    rch_global_dict['test_description'][test_header] = phase
    status = 0
    nodes = []
    # Candidates are stored as one-entry dicts {node_name: node_type};
    # only ACM nodes are rebooted at the moment.
    for typ in ['acm', ]:
        #for typ in [ 'core', 'distrib', 'access', 'acm' ]:
        for node in rch_global_dict['%s_nodes' % typ]:
            nodes.append({node: typ})

    random.shuffle(nodes)
    stop_time = time.time() + rch_global_dict['timeout']
    while (stop_time - time.time() > 0):
        n = random.choice(nodes)
        node = n.keys()[0]
        # NOTE(review): this delay is only printed; nothing sleeps on it
        # before the reboot is issued.
        delay = random.randint(rch_global_dict['min_interval'],
                               rch_global_dict['max_interval'])
        print 'Restarting %s after %d s' % (str(n), delay)
        (s, o) = ssa_tools_utils.execute_on_remote('reboot', node)
        time.sleep(120)
        #Wait for boot
        r = rch_global_dict['boot_retries']
        print 'Waiting for %s to boot\n%s\n%s' % (node, s, o)
        (s, _) = ssa_tools_utils.execute_on_remote('uptime', node)
        # Poll every 10 s; give up (and end the whole test) once the retry
        # budget is used up.
        while (s > 0):
            print 'Waiting for %s to boot\n%s\n%s' % (node, s, o)
            (s, _) = ssa_tools_utils.execute_on_remote('uptime', node)
            time.sleep(10)
            r = r - 1
            if (r <= 0):
                print 'ERROR %s still down' % node
                status = 1
                test_report(test_header, phase, status)
                return status
        #Start SSA
        s = ssa_tools_utils._ssa_action(node, 'start', n[node])
        #Start ib_acme
        # A non-zero pgrep status is taken to mean ib_stress.sh is not running.
        (s, o) = ssa_tools_utils.execute_on_remote('pgrep ib_stress.sh', node)
        if s > 0:
            if ssa_tools_utils.start_ib_acme(
                    node, rch_global_dict['ib_acme_delay'],
                    rch_global_dict['ib_acme_num']) > 0:
                print 'ERROR to execute ib_acme on %s' % node
                status = 1
                break
        else:
            # NOTE(review): the loop ends here but status stays 0 -- confirm
            # that this is not meant to fail the test.
            print 'ERROR ib_acme already running on %s' % node
            break
        delay = random.randint(rch_global_dict['min_interval'],
                               rch_global_dict['max_interval'])
        time.sleep(delay)

    # NOTE(review): node.keys() is a list holding the node name, while the
    # other ssa_tools_utils calls above receive the bare name -- confirm
    # that stop_ib_acme accepts a list.
    for node in nodes:
        ssa_tools_utils.stop_ib_acme(node.keys())

    # Refresh the status and error reports and collect the logs.
    for cmd in [
            '%s/maintain.py -t %s --setup status > %s/ssa_status.log' %
            (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'],
             rch_global_dict['log_dir']),
            '%s/maintain.py -t %s -e > %s/ssa_errors.log' %
            (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'],
             rch_global_dict['log_dir']),
            '%s/logcollector.py -t %s -o %s' %
            (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'],
             rch_global_dict['log_dir'])
    ]:
        print cmd
        o = commands.getoutput(cmd)

    # The generated logs are currently not evaluated (check disabled below).
    # o = commands.getoutput("cat %s/ssa_errors.log" % rch_global_dict['log_dir'])
    # o_status = commands.getoutput("cat %s/ssa_status.log" % rch_global_dict['log_dir'])
    # if o.find('Found errors found on') >= 0 or o_status.find('STOP') >= 0:
    # print 'There are errors in %s/ssa_errors.log' % rch_global_dict['log_dir']
    # status = 1
    print 'See logs in %s' % rch_global_dict['log_dir']
    test_report(test_header, phase, status)
    return status
# Hosts that are queried below.
nodes = ['ko0003','ko0006','ko0011','ko0013','ko0018','ko0026','ko0027','ko0028','ko0031','ko0033','ko0034','ko0036','ko0040','ko0043','ko0048','ko0050','ko0051','ko0053','ko0055','ko0057','ko0059','ko0060','ko0061','ko0063','ko0067','ko0069','ko0070','ko0074','ko0076','ko0079','ko0080','ko0082','ko0085','ko0087','ko0088','ko0090','ko0096','ko0098','ko0099','ko0101','ko0103','ko0107','ko0111','ko0114','ko0116','ko0125','ko0128','ko0129','ko0134','ko0141','ko0144','ko0145','ko0148','ko0149','ko0150','ko0152','ko0154','ko0156','ko0157','ko0158','ko0162','ko0164','ko0166','ko0168','ko0170','ko0174','ko0178','ko0181','ko0185','ko0190','ko0192','ko0195','ko0197','ko0200','ko0203','ko0205','ko0207','ko0209','ko0210','ko0211','ko0213','ko0214','ko0217','ko0218','ko0223','ko0228','ko0229','ko0231','ko0235','ko0237','ko0239','ko0242','ko0249','ko0250','ko0252','ko0253','ko0255','ko0258','ko0261','ko0265','ko0268','ko0272','ko0274','ko0275','ko0277','ko0278','ko0281','ko0282','ko0283','ko0285','ko0286','ko0288','ko0289','ko0291','ko0294','ko0295','ko0297','ko0298','ko0300','ko0302','ko0304','ko0305','ko0306','ko0307','ko0309','ko0315','ko0319','ko0320','ko0322','ko0323','ko0324','ko0327','ko0328','ko0331','ko0332','ko0333','ko0335','ko0337','ko0339','ko0347','ko0351','ko0355','ko0357','ko0358','ko0359','ko0362','ko0364','ko0365','ko0366','ko0367','ko0368','ko0369','ko0370','ko0371','ko0373','ko0379','ko0380','ko0382','ko0383','ko0387','ko0392','ko0395','ko0397','ko0399','ko0402','ko0406','ko0411','ko0413','ko0419','ko0422','ko0425','ko0430','ko0436','ko0440','ko0442','ko0443','ko0444','ko0445','ko0449','ko0451','ko0454','ko0458','ko0462','ko0470','ko0471','ko0474','ko0475','ko0478','ko0479','ko0482','ko0483','ko0485','ko0487','ko0489','ko0490','ko0492']
# Hosts that returned nothing or raised while being queried.
errors=[]
# Per-host record; the 'ko000000' entry holds the column names.
# NOTE(review): this name shadows the builtin sum() for the rest of the module.
sum={'ko000000' : ['virtual_name', 'node_guid','sys_image_guid' ,'port_lid'] }
# Seed the table from the node_list script on the ops host: first word of a
# line is the key, second word the first column value.
o = commands.getoutput('ssh lennyb@ko-ops "/proj/SSA/Mellanox/scripts/node_list -P -e ssa,ssauniversal"')
for n in o.split('\n'):
    l = n.split()
    if len(l) == 0:
        continue
    sum[l[0]] = []
    # NOTE(review): raises IndexError for a line holding a single word.
    sum[l[0]].append(l[1])

# Append the ibv_devinfo values of every host as one list of output lines.
for node in nodes:
    try:
        (_, o) = ssa_tools_utils.execute_on_remote("ibv_devinfo | egrep \"node_guid|sys_image_guid|port_lid\"|awk '{print $2}'", node)
        o = o.encode('ascii','ignore').rstrip('\n')
        if len(o) == 0:
            errors.append(node)
        else:
            # A host missing from the node_list output raises KeyError here
            # and ends up in ``errors`` through the handler below.
            sum[node].append(o.split('\n'))
    except:
        print 'Failed on %s' % node
        errors.append(node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'ulimit -c unlimited\'>> /root/.bashrc"', node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_cm\'>> /etc/profile"'',' node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_addr\'>> /etc/profile"'',' node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_ucm\'>> /etc/profile"'',' node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node)
# (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo `hostname` mthca0 > /sys/class/infiniband/mthca0/node_desc"'','node)
def check_setup(global_dict):
    """Report service state, LID, GID and version of every SSA node.

    For each core/acm/distrib/access node the matching ``ssa_tools_utils``
    class is instantiated and asked for its status; LID, GID and the
    ibacm/ibssa version are read with remote commands.  The report is
    printed, summarised per node type and saved as JSON under /tmp.

    Args:
        global_dict: mapping with the ``<type>_nodes`` lists and ``topology``.

    Returns:
        0 when every node reported status 0 (RUNNING), 1 otherwise.
    """
    report = {}
    status = 0

    for typ in ['core', 'acm', 'distrib', 'access']:
        for nodes in ssa_tools_utils.devide_list_chunks(
                global_dict['%s_nodes' % typ], ssa_tools_utils.MAX_THREAD):
            for node in nodes:
                if node == '':
                    continue

                # Plain attribute lookup instead of eval() on a built string.
                c = getattr(ssa_tools_utils, typ)(node)
                try:
                    rc = c.get_status()
                except Exception:
                    rc = 1

                report[node] = [typ]
                if int(rc) == 0:
                    report[node].append('RUNNING')
                else:
                    report[node].append('STOPPED')
                    status = 1

                try:
                    if typ == 'acm':
                        (rc, version) = ssa_tools_utils.execute_on_remote(
                            'ibacm -v', node)
                        if rc != 0:
                            version = 'unknown'
                    else:
                        (_, version) = ssa_tools_utils.execute_on_remote(
                            'ibssa -v', node)
                    version = version.split()[-1]
                except Exception:
                    version = 'unknown'

                try:
                    (rc, lid) = ssa_tools_utils.execute_on_remote(
                        "/usr/sbin/ibstat|egrep -a5 \"Act|Initializing\"|grep Base| awk '{print $NF}'",
                        node)
                except Exception:
                    lid = 'None'
                report[node].append(lid.rstrip('\n').encode('ascii', 'ignore'))

                (rc, gid) = ssa_tools_utils.execute_on_remote(
                    "/usr/sbin/ibaddr|awk '{print $2}'", node)
                report[node].append(gid.rstrip('\n').encode('ascii', 'ignore'))
                report[node].append(version)

                # Combined query; its result is discarded (call kept as in
                # the original).
                (_, _) = ssa_tools_utils.execute_on_remote(
                    "/usr/sbin/ibstat|egrep -a5 \"Act|Initializing\"|grep Base| awk '{print $NF}';/usr/sbin/ibaddr|awk '{print $2}';ibssa -v;ibacm -v",
                    node)

    print("************* check_setup ********************")
    print("node: [ssa_type, status, lid, gid, version]")

    # Number of reported nodes per SSA type.
    type_counts = {}
    for n in sorted(report.keys()):
        print('%s: %s' % (n, str(report[n])))
        type_counts[report[n][0]] = type_counts.get(report[n][0], 0) + 1

    print('Running %s summary' % global_dict['topology'])
    pprint(type_counts)

    # %m is the month; the previous %M put the minutes into the date part.
    status_file = '/tmp/%s_%s_status.json' % (time.strftime("%Y%m%d_%H%M%S"),
                                              global_dict['topology'])
    with open(status_file, 'w') as f:
        json.dump(report, f)

    print("Saved under %s \n***********************************************" % status_file)
    return status
def ib_acme_get_counters(node):
    """Run ``<ib_acme> -P`` on ``node`` and return the remote result.

    Returns:
        Whatever ``ssa_tools_utils.execute_on_remote`` returns.
    """
    counters_cmd = '%s -P ' % ib_acme
    return ssa_tools_utils.execute_on_remote(counters_cmd, node)
ssa_tools_utils.rm_exec('echo "%s" | sudo crontab -' % line, nodes) #ssa_tools_utils.rm_exec('echo "" | sudo -u lennyb crontab -' , nodes) #ssa_tools_utils.rm_exec("sed -i 's/tcsh/bash/g' /etc/passwd", nodes) #ssa_tools_utils.rm_exec('-- sh -c "echo kernel.core_pattern=/tmp/core.%e.%p.%h.%t >> /etc/sysctl.conf"' , nodes) #ssa_tools_utils.rm_exec('-- sh -c "echo \'StrictHostKeyChecking no\'>> /etc/ssh/ssh_config"', nodes) #ssa_tools_utils.rm_exec('sysctl -p /etc/sysctl.conf', nodes) sys.exit(1) if len(sys.argv) == 2: print 'Execute on parallel' ssa_tools_utils.rm_exec(sys.argv[1:], nodes) else: for node in nodes: try: (rc, o) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'StrictHostKeyChecking no\'>> /etc/ssh/ssh_config"', node) #(rc, o) = ssa_tools_utils.execute_on_remote('date', node) #(rc, o) = ssa_tools_utils.execute_on_remote(sys.argv[1:], node) print '[%s] %s' % (node, o) except: print 'Failed on %s' % node errors.append(node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'ulimit -c unlimited\'>> /root/.bashrc"', node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_cm\'>> /etc/profile"'',' node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_addr\'>> /etc/profile"'',' node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_ucm\'>> /etc/profile"'',' node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo `hostname` mthca0 > /sys/class/infiniband/mthca0/node_desc"'','node) # (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'echo `hostname` mthca0 > /sys/class/infiniband/mthca0/node_desc\'>>/etc/profile "'','node)
acms = [ hostname, ] sample_gids = gids sample_lids = lids else: sample_gids = random.sample(gids, min(len(gids), sample_size)) sample_lids = random.sample(lids, min(len(lids), sample_size)) for node in acms: slid = data[node][LID] print '%s %s' % (node, slid) for node in acms: if node == '': continue (_, sgid) = ssa_tools_utils.execute_on_remote( "/usr/sbin/ibaddr |awk '{print $2}'", node) slid = data[node][LID] (_, _) = ssa_tools_utils.execute_on_remote( '%s -f g -d %s -s %s -c -v' % (ib_acme, osmgid, sgid), node) (_, _) = ssa_tools_utils.execute_on_remote( '%s -f l -d %s -s %s -c -v' % (ib_acme, osmlid, slid), node) time.sleep(10) print 'Testing %s with %d GIDs' % (node, len(sample_gids)) (rc, out0) = ssa_tools_utils.execute_on_remote('%s -P ' % ib_acme, node) print 'Before GID test\n', out0 for gid in sample_gids: print '%s# %s -f g -d %s -s %s -c -v' % (node, ib_acme, gid, sgid) (rc, out0) = ssa_tools_utils.execute_on_remote('%s -P ' % ib_acme, node)