Exemple #1
0
def test_acm_by_lid_gid(acms, sample_lids, sample_gids, data):

    status = 0

    print '==================================================================='
    print '=================== TEST ACM BY LID AND GID ======================='
    print '==================================================================='

    for node in acms:

        if node == '':
            continue

        slid = data[node][LID]

        print 'Testing %s with %d LIDs' % (node, len(sample_lids))
        (rc, out0) = ib_acme_get_counters(node)
        print 'Before LID test', out0

        for lid in sample_lids:
            status = test_acm_by_lid_query(node, slid, lid)
            if status != 0:
                break

        (rc, out0) = ib_acme_get_counters(node)
        print 'After LID test\n', out0

        (_, sgid) = ssa_tools_utils.execute_on_remote(
            "/usr/sbin/ibaddr |awk '{print $2}'", node)

        print 'Testing %s with %d GIDs' % (node, len(sample_gids))
        (rc, out0) = ssa_tools_utils.execute_on_remote('%s -P ' % ib_acme,
                                                       node)
        print 'Before GID test\n', out0

        for gid in sample_gids:

            status = test_acm_by_gid_query(node, sgid, gid)
            if status != 0:
                break

        if status != 0:
            break

        (rc, out0) = ib_acme_get_counters(node)
        print 'After GID test\n', out0

    print 'Run on %d nodes, each to %d lids and %d gids' % (
        len(acms), len(sample_lids), len(sample_gids))

    print '==================================================================='
    print '========= TEST ACM BY LID AND GID COMPLETE (status: %d) ===========' % (
        status)
    print '==================================================================='

    return status
Exemple #2
0
def change_and_load_ip(cores, old_ip, new_ip):
    """Rewrite an address in the preload file on each core and HUP opensm.

    On every node in ``cores`` a ``sed -i`` substitutes ``old_ip`` at the
    start of a line with ``new_ip`` inside ``CORE_PRELOAD_FILE_PATH``, then
    ``kill -s HUP`` is sent to the opensm process.  The results of the remote
    commands are not inspected.

    Returns:
        Always 0.
    """
    for core_node in cores:
        sed_cmd = "sed -i 's/^%s/%s/g' %s" % (old_ip, new_ip,
                                              CORE_PRELOAD_FILE_PATH)
        (_, _) = ssa_tools_utils.execute_on_remote(sed_cmd, core_node)

        hup_cmd = "kill -s HUP `pidof opensm`"
        (_, _) = ssa_tools_utils.execute_on_remote(hup_cmd, core_node)

    return 0
Exemple #3
0
def get_node_remote(node):
    """Return ``(remote_lid, remote_port)`` parsed from smpquery on ``node``.

    Both values are strings: the text following the last '.' of the matching
    smpquery output, cut at the first newline.
    """
    #
    # HACK: it is assumed that node machine is connected to a remote node with port 1
    #
    def _value_after_dots(text):
        # smpquery pads "Name....value"; keep what follows the last dot.
        return text.split('.')[-1].rsplit('\n')[0]

    (_, port_info) = ssa_tools_utils.execute_on_remote(
        'smpquery PI -D 0,1 | grep ^Lid', node)
    remote_lid = _value_after_dots(port_info)

    (_, node_info) = ssa_tools_utils.execute_on_remote(
        'smpquery NI -D 0,1 | grep LocalPort', node)
    remote_port = _value_after_dots(node_info)

    return (remote_lid, remote_port)
Exemple #4
0
def get_node_ip(node, node_active_interface):
    """Return the IPv4 address text reported for an interface on ``node``.

    Runs ``ip address show`` for ``node_active_interface`` remotely, keeps the
    'inet ' lines, strips the prefix length and removes newlines.
    """
    # The literal is kept exactly as written: the backslash continuations
    # embed the alignment whitespace into the command that is sent.
    ip_cmd = "ip address show dev %s | grep 'inet ' \
                                        | awk '{print $2}'  | cut -f1 -d'/' \
                                        | tr -d '\n'" % (node_active_interface)
    (_, address) = ssa_tools_utils.execute_on_remote(ip_cmd, node)
    return address
Exemple #5
0
def get_node_ip_mask(node, node_active_interface):
    """Return the netmask text reported by ifconfig for an interface.

    Runs ``ifconfig`` for ``node_active_interface`` on ``node``, keeps the
    line containing 'Mask', prints its fourth field and removes newlines.
    """
    # The literal is kept exactly as written: the backslash continuation
    # embeds the alignment whitespace into the command that is sent.
    mask_cmd = "ifconfig %s | grep Mask \
                                        | awk '{print $4}' | tr -d '\n'" % (node_active_interface)
    (_, netmask) = ssa_tools_utils.execute_on_remote(mask_cmd, node)

    return netmask
Exemple #6
0
def get_active_ib_interface(node):
    """Return 'ib0' when port 1 of ``node`` reports LinkUp, else 'ib1'."""

    # assumes only 2 ports, called ib0 and ib1
    (_, link_up) = ssa_tools_utils.execute_on_remote(
        'ibportstate -D 0 1 | grep LinkUp', node)
    # Any grep output at all means the first port is up.
    return 'ib0' if len(link_up) > 0 else 'ib1'
Exemple #7
0
def change_node_ip(node, new_ip, new_netmask):
    """Assign ``new_ip``/``new_netmask`` to the active IB interface of ``node``.

    The interface is chosen with ``get_active_ib_interface`` and configured
    remotely through ``ifconfig``.  The command result is ignored.

    Returns:
        Always 0.
    """
    iface = get_active_ib_interface(node)
    ifconfig_cmd = 'ifconfig %s %s netmask %s' % (iface, new_ip, new_netmask)
    (_, _) = ssa_tools_utils.execute_on_remote(ifconfig_cmd, node)
    # assumes reconfiguring doesn't fail  FIXME
    return 0
Exemple #8
0
def sanity_test_0(cores, als, acms, lids, gids, ipv4s, ipv6s, data):

    hostname = commands.getoutput('hostname')
    slid = commands.getoutput(
        "/usr/sbin/ibstat |grep -a5 Act|grep Base|awk '{print $NF}'").rstrip(
            '\n')
    osmlid = commands.getoutput(
        "/usr/sbin/ibstat |grep -a5 Act|grep SM|awk '{print $NF}'").rstrip(
            '\n')
    osmgid = commands.getoutput(
        "/usr/sbin/saquery --src-to-dst %s:%s|grep dgid" %
        (slid, osmlid)).split('.')[-1]

    if len(osmlid.split() + slid.split() + osmgid.split() +
           hostname.split()) != 4:
        print 'Failed to get basic info'
        print "/usr/sbin/ibstat |grep -a5 Act|grep SM|awk '{print $NF}'\n%s" % osmlid
        print "/usr/sbin/ibstat |grep -a5 Act|grep Base|awk '{print $NF}'\n%s" % slid
        print "/usr/sbin/saquery --src-to-dst %s:%s|grep dgid\n%s" % (
            slid, osmlid, osmgid)
        print "hostname\n%s" % hostname
        sys.exit(1)

    print '==================================================================='
    print '========================= SANITY TEST 0 ==========================='
    print '==================================================================='

    # Initial ib_acme query in order to make sure there was PRDB update
    for node in acms:
        if node == '':
            continue

        (_, sgid) = ssa_tools_utils.execute_on_remote(
            "/usr/sbin/ibaddr |awk '{print $2}'", node)
        slid = data[node][LID]

        (_, _) = ib_acme_query('g', osmgid, sgid, '-c -v', node)
        (_, _) = ib_acme_query('l', osmlid, slid, '-c -v', node)
        time.sleep(10)

    sample_gids = random.sample(gids, min(len(gids), sample_size))
    sample_lids = random.sample(lids, min(len(lids), sample_size))
    sample_ipv4s = random.sample(ipv4s, min(len(ipv4s), sample_size))
    sample_ipv6s = random.sample(ipv6s, min(len(ipv6s), sample_size))

    status = test_acm_by_lid_gid(acms, sample_lids, sample_gids, data)
    if status != 0:
        return status

    status = test_ip(acms, sample_ipv4s, sample_ipv6s)
    if status != 0:
        return status

    print '==================================================================='
    print '==================== SANITY TEST 0 COMPLETE ======================='
    print '==================================================================='

    return status
Exemple #9
0
def get_logs_from_node(node, dest_dir, node_type):
    node_dest = '%s/%s' % (dest_dir, node)

    if not os.path.exists(node_dest):
        os.mkdir(node_dest)

    os.system('chmod 777 -R %s' % node_dest)

    get_system_info(node, dest_dir)

    for file in file_list[node_type]:
        (_, _) = ssa_tools_utils.execute_on_remote(
            'cp -r %s %s > /dev/null' % (file, node_dest), node)

        f = '%s/%s' % (node_dest, os.path.basename(file))
        if os.path.exists(f) and f.endswith('cfg'):
            for line in open(f, 'r'):
                if not line.startswith('#') and line.find('_dir') >= 0:
                    local_dirs = '%s*' % line.split()[1]
                    (_, _) = ssa_tools_utils.execute_on_remote(
                        'cp -r %s %s > /dev/null' % (local_dirs, node_dest),
                        node)

    cmds = {
        'rdma':
        'cp -r /etc/rdma %s > /dev/null' % node_dest,
        'dmesg':
        '/bin/dmesg -T > %s/%s_dmesg.log' % (node_dest, node),
        'misc_files':
        'cp -r %s %s > /dev/null' %
        (' '.join(file_list['other_files']), node_dest),
        'libs':
        "ldd `which ibssa ibacm opensm` | awk '{print $3}' | grep '/' | sort | uniq"
    }
    for cmd_type in cmds.keys():
        (_, out) = ssa_tools_utils.execute_on_remote(cmds[cmd_type], node)

        if cmd_type == 'libs':
            libs = ' '.join(out.split())
            (_, _) = ssa_tools_utils.execute_on_remote(
                'cp -rL %s %s > /dev/null' % (libs, node_dest), node)

    print '%s finished log collection %s' % (node, dest_dir)

    return 0
Exemple #10
0
def test_acm_by_lid_query(node, slid, dlid, initial_query=0, print_err=1):

    status = 0

    if initial_query == 1:
        print 'Executing initial ib_acme query on %s (lid %s) node' % (node,
                                                                       slid)
        (rc, out) = ib_acme_query('l', dlid, slid, '-c', node)
        time.sleep(10)

    print '%s -f l -d %s -s %s -c -v' % (ib_acme, dlid, slid), node
    (rc, out0) = ib_acme_get_counters(node)
    (rc, out) = ib_acme_query('l', dlid, slid, '-c -v', node)
    # print out

    if out.find('failed') >= 0 and out.find('success') < 0:
        if print_err == 1:
            print 'ERROR. ACM on %s failed (lid test)' % node
            (_, o) = ssa_tools_utils.execute_on_remote(
                '/usr/local/bin/ibv_devinfo', node)
            print o
        status = 1

    (rc, out1) = ib_acme_get_counters(node)

    ret = compare_outs(out0, out1, route_cache_count_index)
    if ret == 0:
        if print_err == 1:
            print 'ERROR. %s PR was not taken from cache (lid test)' % node
            (_, o) = ssa_tools_utils.execute_on_remote(
                '/usr/local/bin/ibv_devinfo', node)
            print o
        status = 2
    elif ret == "-1":
        print 'ERROR. %s failed' % node
        (_,
         o) = ssa_tools_utils.execute_on_remote('/usr/local/bin/ibv_devinfo',
                                                node)
        print o
        status = 3

    return status
Exemple #11
0
def get_IPv6_addr(node, node_active_interface):
    """Return the single IPv6 address text of an interface on ``node``.

    The 'inet6' lines of ``ip address show`` are counted first.

    Returns:
        The address without prefix length when exactly one line is found.
        The integer -1 when more than one line is found and the string "-1"
        when none is found; the two error values differ in type.
    """

    (_, inet6_lines) = ssa_tools_utils.execute_on_remote(
        "ip address show dev %s | grep 'inet6'" % (node_active_interface),
        node)
    line_count = inet6_lines.count('\n')
    if line_count > 1:  # FIXME: necessary?
        print 'ERROR: node %s interface %s has more than 1 IPv6 address' % (
            node, node_active_interface)
        return -1
    if line_count < 1:  #FIXME: same
        print 'ERROR: node %s interface %s has no IPv6 address' % (
            node, node_active_interface)
        return "-1"

    # The literal is kept exactly as written: the backslash continuations
    # embed the alignment whitespace into the command that is sent.
    addr_cmd = "ip address show dev %s | grep 'inet6' \
                                        | awk '{print $2}'  | cut -f1 -d'/' \
                                        | tr -d '\n'" % (node_active_interface)
    (_, address) = ssa_tools_utils.execute_on_remote(addr_cmd, node)
    return address
Exemple #12
0
def run_cmd(global_dict, cmd):
    """Execute ``cmd`` on every distrib, access, acm and core node.

    Node lists are read from ``global_dict['<type>_nodes']`` and walked in
    chunks of ``ssa_tools_utils.MAX_THREAD``.

    Returns:
        1 when any node returned a non-zero code, otherwise 0.
    """
    status = 0
    for node_type in ('distrib', 'access', 'acm', 'core'):
        chunks = ssa_tools_utils.devide_list_chunks(
            global_dict['%s_nodes' % node_type], ssa_tools_utils.MAX_THREAD)
        for chunk in chunks:
            for node in chunk:
                (rc, _) = ssa_tools_utils.execute_on_remote(cmd, node)
                if int(rc) != 0:
                    status = 1

    return status
Exemple #13
0
def check_error(global_dict):
    status = 0
    for osm_node in global_dict.get('core_nodes', 'osm'):
        ssa_tools_utils.execute_on_remote(
            'cat %s | egrep "ERROR|ERR:"' %
            (ssa_tools_utils.CFG_FILES['osm_logfile']), osm_node)

    errors = {}
    for typ in ['distrib', 'access', 'acm']:
        for nodes in ssa_tools_utils.devide_list_chunks(
                global_dict['%s_nodes' % typ], ssa_tools_utils.MAX_THREAD):
            for node in nodes:
                if node == '':
                    continue

                (rc, out) = ssa_tools_utils.execute_on_remote(
                    'cat %s | egrep "ERROR|ERR:|BACKTRACE" | grep -v "ERROR 111"'
                    % (ssa_tools_utils.CFG_FILES['%s_logfile' % typ]), node)
                if out != "":
                    print '%s %s \n%s' % (typ, node, out)
                    errors[node] = out
                    status = 1
                (rc, out) = ssa_tools_utils.execute_on_remote(
                    '/usr/sbin/ibstat | grep -i active', node)
                out = out.rstrip('\n')
                if int(rc) != 0 or out == '':
                    print 'ERROR. Check ib modules on %s:%s' % (node, out)
                    #status = 2
                    #errors[node] = out
    if status == 0:
        print 'Report: No errors found in logs'
    else:
        print 'Report: Found errors found on'
        for h, e in errors.iteritems():
            if len(e) == 0:
                continue
            print '*************** %s ***********************' % h
            print e
    return status
Exemple #14
0
def kcache_ip_lookup(node, active_interface, addr_to_search, entry_type):
    """Check the neighbour-cache entry of ``addr_to_search`` on ``node``.

    Runs ``ip neigh show`` for the address on ``active_interface`` and
    compares the last word of the output with ``entry_type``.

    Returns:
        0 when the entry exists and ends with ``entry_type``; 2 when the
        entry is missing or ends with something else.
    """

    (_, neigh_out) = ssa_tools_utils.execute_on_remote(
        'ip neigh show dev %s %s' % (active_interface, addr_to_search), node)

    if len(neigh_out) == 0:
        print 'ERROR: ip %s not found in node %s cache' % (addr_to_search,
                                                           node)
        return 2

    if neigh_out.split()[-1] != entry_type:
        print 'ERROR: ip %s node %s cache is not %s' % (addr_to_search, node,
                                                        entry_type)
        return 2

    return 0
Exemple #15
0
def node_port_state_change(node, state):
    net = read_ibnetdiscover()
    pprint(net)
    (rc, out) = ssa_tools_utils.execute_on_remote(
        '/usr/sbin/ibstat |grep Node |grep GUID', node)
    try:
        SW_GID = net[out.split()[-1]][0]
        SW_PORT = net[out.split()[-1]][1]
        o = commands.getoutput('ibportstate -G %s %s %s' %
                               (SW_GID, SW_PORT, state))
        print '[%s] ibportstate -G %s %s %s\n%s' % (
            time.strftime("%b %d %H:%M:%S"), SW_GID, SW_PORT, state, o)
        return "%s %s" % (SW_GID, SW_PORT)
    except:
        return None
Exemple #16
0
def get_system_info(node, dest_dir):
    """Write basic system information of ``node`` to ``<dest_dir>/<node>.info``.

    Each command is executed on the node and stored as a "#<command>" header
    followed by its output and a blank line.

    Args:
        node: node to query.
        dest_dir: local directory receiving the ``.info`` file.

    Returns:
        Always 0.
    """

    # Not called ``commands``: that name would hide the ``commands`` module
    # inside this function.
    info_cmds = [
        'date', 'df -lh', 'ibssa -v', 'ibacm -v', 'uname -a', 'cat /etc/issue',
        '/usr/sbin/ibstat', 'opensm --version | grep OpenSM',
        'cat /etc/sysctl.conf | grep core_pattern'
    ]

    node_info_file = '%s/%s.info' % (dest_dir, node)

    # The context manager closes the file even when a remote call raises.
    with open(node_info_file, 'w') as info_file:
        for cmd in info_cmds:
            (_, out) = ssa_tools_utils.execute_on_remote(cmd, node)
            info_file.write("#%s\n%s\n\n" % (cmd, out))

    return 0
Exemple #17
0
def test_0_0_0(ibmsnet):
    test_header = inspect.getframeinfo(inspect.currentframe()).function    
    phase = 'Reconnect Test'
    rch_global_dict['test_description'][test_header] = phase
    status = 0
    net = ssa_utils.read_ibnetdiscover()
    pprint(net)
    stop_time = time.time() + rch_global_dict['timeout']
    while ( stop_time - time.time() > 0):
        acm_to_disconnect = random.choice(rch_global_dict['acm_nodes'])
        delay = random.randint(0, rch_global_dict['max_interval'] )
        (rc, out) = ssa_tools_utils.execute_on_remote('/usr/sbin/ibstat|grep Node |grep GUID', acm_to_disconnect)
        print out
        print acm_to_disconnect
        print net
        print out.split()
        print 'Disconnecting %s %s ' % (acm_to_disconnect, net[out.split()[-1]])
        SW_GID = net[out.split()[-1]][0]
        SW_PORT = net[out.split()[-1]][1]

        time.sleep(delay)
        print 'Report: Reconnect node %s after %d ' % ( acm_to_disconnect, delay)
        o = commands.getoutput('/usr/sbin/ibportstate -G %s %s disable' % ( SW_GID, SW_PORT ))
        print '[%s] /usr/sbin/ibportstate -G %s %s disable\n%s' % ( time.strftime("%b %d %H:%M:%S"), SW_GID, SW_PORT, o )
        delay = random.randint(rch_global_dict['min_interval'], rch_global_dict['max_interval'] )
        time.sleep(delay)
        o = commands.getoutput('/usr/sbin/ibportstate -G %s %s enable' % ( SW_GID, SW_PORT ))
        print '[%s] /usr/sbin/ibportstate -G %s %s enable\n%s'  % ( time.strftime("%b %d %H:%M:%S"), SW_GID, SW_PORT, o )
    
    for cmd in ['%s/maintain.py -t %s --setup status > %s/ssa_status.log' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['log_dir']) ,
                '%s/maintain.py -t %s -e > %s/ssa_errors.log' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['log_dir']),
                '%s/logcollector.py -t %s  -o %s' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['log_dir']) ]:
        print cmd
        o = commands.getoutput(cmd)

    o = commands.getoutput("cat  %s/ssa_errors.log" %  rch_global_dict['log_dir'])
    o_status = commands.getoutput("cat %s/ssa_status.log" %  rch_global_dict['log_dir'])
    if o.find('Found errors found on') >= 0 or o_status.find('STOP') >= 0:
        print 'There are errors in %s/ssa_errors.log' %  rch_global_dict['log_dir']
        status = 1
    
    print 'See logs in %s' % rch_global_dict['log_dir']
    test_report(test_header, phase, status)
    return status
Exemple #18
0
def get_distribution_tree(master_sm_node):
    t = {}
    timeout = 300
    start_time = time.time()
    while (time.time() - start_time <= timeout):
        print "%s>#cat %s |grep -v %s" % (
            master_sm_node, ssa_tools_utils.CFG_FILES['plugin_logfile'],
            time.strftime("%b"))
        (rc, out) = ssa_tools_utils.execute_on_remote(
            "cat %s |grep -v %s" %
            (ssa_tools_utils.CFG_FILES['plugin_logfile'], time.strftime("%b")),
            master_sm_node)
        tree = out.encode('ascii', 'ignore').split(
            'General SSA distribution tree info')[-1].split('\n')
        print '[%s] Unfiltered Distribution tree looks like\n%s' % (
            master_sm_node, tree)
        if len(tree) > 1:
            return tree
    return tree
Exemple #19
0
def node_port_bounce(node, delay=0):
    if node == commands.getoutput('hostname'):
        print 'Unable to execute port bounce on local node %s' % node
        return 1

    net = read_ibnetdiscover()
    (rc, out) = ssa_tools_utils.execute_on_remote(
        '/usr/sbin/ibstat |grep Node |grep GUID', node)

    SW_GID = net[out.split()[-1]][0]
    SW_PORT = net[out.split()[-1]][1]

    o = commands.getoutput('ibportstate -G %s %s disable' % (SW_GID, SW_PORT))
    print '[%s] ibportstate -G %s %s disable\n%s' % (
        time.strftime("%b %d %H:%M:%S"), SW_GID, SW_PORT, o)
    time.sleep(delay)
    o = commands.getoutput('ibportstate -G %s %s enable' % (SW_GID, SW_PORT))
    print '[%s] ibportstate -G %s %s enable\n%s' % (
        time.strftime("%b %d %H:%M:%S"), SW_GID, SW_PORT, o)
    return 0
Exemple #20
0
def ib_acme_query(addr_format, dest, src, additional_flags, node):
    """Run an ib_acme address query on ``node``.

    Builds ``<ib_acme> -f <addr_format> -d <dest> -s <src> <flags>`` and
    returns whatever ``ssa_tools_utils.execute_on_remote`` returns for it.
    """
    pieces = [
        ib_acme,
        ' -f ', addr_format,
        ' -d ', str(dest),
        ' -s ', str(src),
        ' ', additional_flags,
    ]
    return ssa_tools_utils.execute_on_remote(''.join(pieces), node)
Exemple #21
0
def test_0_0_0(ibmsnet):
    """Reboot test: repeatedly reboot a random ACM node until the timeout.

    Until ``rch_global_dict['timeout']`` seconds have passed, a random node
    is rebooted, polled with ``uptime`` until it answers, SSA is started on
    it and ib_acme is launched.  Afterwards ib_acme is stopped and the
    maintain / logcollector scripts are run.

    Args:
        ibmsnet: not used by this function.

    Returns:
        1 when a node did not come back within ``boot_retries`` polls or
        ib_acme could not be started, otherwise 0.
    """
    # The running function's own name is used as the test identifier.
    test_header = inspect.getframeinfo(inspect.currentframe()).function
    phase = 'Reboot Test'
    rch_global_dict['test_description'][test_header] = phase
    status = 0
    # List of single-entry dicts {node_name: node_type}.
    nodes = []
    for typ in [
            'acm',
    ]:
        #for typ in [ 'core', 'distrib', 'access', 'acm' ]:
        for node in rch_global_dict['%s_nodes' % typ]:
            nodes.append({node: typ})
    random.shuffle(nodes)
    stop_time = time.time() + rch_global_dict['timeout']
    while (stop_time - time.time() > 0):
        n = random.choice(nodes)
        # Each entry has exactly one key: the node name.
        node = n.keys()[0]
        # NOTE(review): this delay is only printed; no sleep precedes the
        # reboot below.
        delay = random.randint(rch_global_dict['min_interval'],
                               rch_global_dict['max_interval'])
        print 'Restarting %s after %d s' % (str(n), delay)
        (s, o) = ssa_tools_utils.execute_on_remote('reboot', node)
        # Fixed grace period before the first liveness probe.
        time.sleep(120)

        #Wait for boot
        r = rch_global_dict['boot_retries']
        print 'Waiting for %s to boot\n%s\n%s' % (node, s, o)
        (s, _) = ssa_tools_utils.execute_on_remote('uptime', node)
        # Probe every 10 s until 'uptime' returns 0 or the retries run out.
        # NOTE(review): ``o`` printed in this loop is still the output of the
        # reboot command, not of the latest probe.
        while (s > 0):
            print 'Waiting for %s to boot\n%s\n%s' % (node, s, o)
            (s, _) = ssa_tools_utils.execute_on_remote('uptime', node)
            time.sleep(10)
            r = r - 1
            if (r <= 0):
                print 'ERROR %s still down' % node
                status = 1
                test_report(test_header, phase, status)
                return status

        #Start SSA
        # The returned value is overwritten by the pgrep call below.
        s = ssa_tools_utils._ssa_action(node, 'start', n[node])

        #Start ib_acme
        # A positive pgrep code is treated as "stress script not running".
        (s, o) = ssa_tools_utils.execute_on_remote('pgrep ib_stress.sh', node)
        if s > 0:
            if ssa_tools_utils.start_ib_acme(
                    node, rch_global_dict['ib_acme_delay'],
                    rch_global_dict['ib_acme_num']) > 0:
                print 'ERROR to execute ib_acme on %s' % node
                status = 1
                break

        else:
            # NOTE(review): the loop ends here without changing ``status``.
            print 'ERROR ib_acme already running on %s' % node
            break

        delay = random.randint(rch_global_dict['min_interval'],
                               rch_global_dict['max_interval'])
        time.sleep(delay)

    # NOTE(review): ``node.keys()`` is a one-element list, not a node name;
    # confirm that stop_ib_acme accepts a list.
    for node in nodes:
        ssa_tools_utils.stop_ib_acme(node.keys())

    # Status dump, error scan and log collection; outputs are discarded.
    for cmd in [
            '%s/maintain.py -t %s --setup status > %s/ssa_status.log' %
        (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'],
         rch_global_dict['log_dir']),
            '%s/maintain.py -t %s -e > %s/ssa_errors.log' %
        (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'],
         rch_global_dict['log_dir']),
            '%s/logcollector.py -t %s  -o %s' %
        (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'],
         rch_global_dict['log_dir'])
    ]:
        print cmd
        o = commands.getoutput(cmd)


# Disabled: evaluation of the collected logs (the result does not depend on
# them while this stays commented out).
#    o = commands.getoutput("cat  %s/ssa_errors.log" %  rch_global_dict['log_dir'])
#    o_status = commands.getoutput("cat %s/ssa_status.log" %  rch_global_dict['log_dir'])
#    if o.find('Found errors found on') >= 0 or o_status.find('STOP') >= 0:
#        print 'There are errors in %s/ssa_errors.log' %  rch_global_dict['log_dir']
#        status = 1

    print 'See logs in %s' % rch_global_dict['log_dir']
    test_report(test_header, phase, status)

    return status
Exemple #22
0
# Hard-coded list of node names this script works on.
nodes = ['ko0003','ko0006','ko0011','ko0013','ko0018','ko0026','ko0027','ko0028','ko0031','ko0033','ko0034','ko0036','ko0040','ko0043','ko0048','ko0050','ko0051','ko0053','ko0055','ko0057','ko0059','ko0060','ko0061','ko0063','ko0067','ko0069','ko0070','ko0074','ko0076','ko0079','ko0080','ko0082','ko0085','ko0087','ko0088','ko0090','ko0096','ko0098','ko0099','ko0101','ko0103','ko0107','ko0111','ko0114','ko0116','ko0125','ko0128','ko0129','ko0134','ko0141','ko0144','ko0145','ko0148','ko0149','ko0150','ko0152','ko0154','ko0156','ko0157','ko0158','ko0162','ko0164','ko0166','ko0168','ko0170','ko0174','ko0178','ko0181','ko0185','ko0190','ko0192','ko0195','ko0197','ko0200','ko0203','ko0205','ko0207','ko0209','ko0210','ko0211','ko0213','ko0214','ko0217','ko0218','ko0223','ko0228','ko0229','ko0231','ko0235','ko0237','ko0239','ko0242','ko0249','ko0250','ko0252','ko0253','ko0255','ko0258','ko0261','ko0265','ko0268','ko0272','ko0274','ko0275','ko0277','ko0278','ko0281','ko0282','ko0283','ko0285','ko0286','ko0288','ko0289','ko0291','ko0294','ko0295','ko0297','ko0298','ko0300','ko0302','ko0304','ko0305','ko0306','ko0307','ko0309','ko0315','ko0319','ko0320','ko0322','ko0323','ko0324','ko0327','ko0328','ko0331','ko0332','ko0333','ko0335','ko0337','ko0339','ko0347','ko0351','ko0355','ko0357','ko0358','ko0359','ko0362','ko0364','ko0365','ko0366','ko0367','ko0368','ko0369','ko0370','ko0371','ko0373','ko0379','ko0380','ko0382','ko0383','ko0387','ko0392','ko0395','ko0397','ko0399','ko0402','ko0406','ko0411','ko0413','ko0419','ko0422','ko0425','ko0430','ko0436','ko0440','ko0442','ko0443','ko0444','ko0445','ko0449','ko0451','ko0454','ko0458','ko0462','ko0470','ko0471','ko0474','ko0475','ko0478','ko0479','ko0482','ko0483','ko0485','ko0487','ko0489','ko0490','ko0492']

# Nodes that returned no device information or raised during the query.
errors=[]
# Per-node summary.  The first entry acts as a column legend.
# NOTE(review): the name ``sum`` hides the builtin of the same name.
sum={'ko000000' : ['virtual_name', 'node_guid','sys_image_guid' ,'port_lid'] }

# Seed the summary from the remote node_list script: the first word of each
# output line is the key, the second word the first stored value.
o = commands.getoutput('ssh lennyb@ko-ops "/proj/SSA/Mellanox/scripts/node_list -P -e ssa,ssauniversal"')
for n in o.split('\n'):
    l = n.split()
    if len(l) == 0:
        continue
    # NOTE(review): a line with a single word raises IndexError on l[1].
    sum[l[0]] = []
    sum[l[0]].append(l[1])

# Append the ibv_devinfo values of every node as one list per node.
for node in nodes:
    try:
            (_, o) = ssa_tools_utils.execute_on_remote("ibv_devinfo | egrep \"node_guid|sys_image_guid|port_lid\"|awk '{print $2}'", node)
            o = o.encode('ascii','ignore').rstrip('\n')
            if len(o) == 0:
                errors.append(node)
            else:
                # A node missing from ``sum`` raises KeyError here, which the
                # bare except below records as a failure.
                sum[node].append(o.split('\n'))
    # NOTE(review): the bare except also catches KeyboardInterrupt/SystemExit.
    except:
        print 'Failed on %s' % node
        errors.append(node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'ulimit -c unlimited\'>> /root/.bashrc"', node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_cm\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_addr\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_ucm\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo `hostname` mthca0 > /sys/class/infiniband/mthca0/node_desc"'','node)
Exemple #23
0
def check_setup(global_dict):
    report = {}
    status = 0

    for typ in ['core', 'acm', 'distrib', 'access']:
        for nodes in ssa_tools_utils.devide_list_chunks(
                global_dict['%s_nodes' % typ], ssa_tools_utils.MAX_THREAD):
            for node in nodes:
                if node == '':
                    continue
                c = eval('ssa_tools_utils.%s' % typ)(node)
                try:
                    rc = c.get_status()
                except:
                    rc = 1
                report[node] = [typ]
                if int(rc) == 0:
                    report[node].append('RUNNING')
                else:
                    report[node].append('STOPPED')
                    status = 1

                try:
                    if typ == 'acm':
                        (rc, version) = ssa_tools_utils.execute_on_remote(
                            'ibacm -v', node)
                        if rc != 0:
                            version = 'unknown'
                    else:
                        (_, version) = ssa_tools_utils.execute_on_remote(
                            'ibssa -v', node)
                    version = version.split()[-1]
                except:
                    version = 'unknown'
                try:
                    (rc, lid) = ssa_tools_utils.execute_on_remote(
                        "/usr/sbin/ibstat|egrep -a5 \"Act|Initializing\"|grep Base| awk '{print $NF}'",
                        node)
                except:
                    lid = 'None'
                report[node].append(lid.rstrip('\n').encode('ascii', 'ignore'))
                (rc, gid) = ssa_tools_utils.execute_on_remote(
                    "/usr/sbin/ibaddr|awk '{print $2}'", node)
                report[node].append(gid.rstrip('\n').encode('ascii', 'ignore'))
                report[node].append(version)

                (a, b) = ssa_tools_utils.execute_on_remote(
                    "/usr/sbin/ibstat|egrep -a5 \"Act|Initializing\"|grep Base| awk '{print $NF}';/usr/sbin/ibaddr|awk '{print $2}';ibssa -v;ibacm -v",
                    node)

    print "*************  check_setup ********************"
    print "node: [ssa_type, status, lid, gid, version]"
    sum = {}
    for n in sorted(report.keys()):
        print '%s: %s' % (n, str(report[n]))
        if not sum.has_key(report[n][0]):
            sum[report[n][0]] = 1
        else:
            sum[report[n][0]] += 1
    print 'Running %s summary' % global_dict['topology']
    pprint(sum)

    status_file = '/tmp/%s_%s_status.json' % (time.strftime("%Y%M%d_%H%M%S"),
                                              global_dict['topology'])
    f = open(status_file, 'w')
    json.dump(report, f)
    print "Saved under %s \n***********************************************" % status_file

    return status
Exemple #24
0
def ib_acme_get_counters(node):
    """Run ``<ib_acme> -P`` on ``node`` and return the remote call's result."""
    return ssa_tools_utils.execute_on_remote('%s -P ' % ib_acme, node)
Exemple #25
0
# Pipe the text held in ``line`` into ``sudo crontab -`` through rm_exec for
# ``nodes``.  Both names are defined outside the lines visible here.
ssa_tools_utils.rm_exec('echo "%s" | sudo crontab -' % line, nodes)
#ssa_tools_utils.rm_exec('echo "" | sudo -u lennyb crontab -' ,  nodes)
#ssa_tools_utils.rm_exec("sed -i 's/tcsh/bash/g' /etc/passwd", nodes)
#ssa_tools_utils.rm_exec('-- sh -c "echo kernel.core_pattern=/tmp/core.%e.%p.%h.%t >> /etc/sysctl.conf"' , nodes)
#ssa_tools_utils.rm_exec('-- sh -c "echo \'StrictHostKeyChecking no\'>> /etc/ssh/ssh_config"', nodes)
#ssa_tools_utils.rm_exec('sysctl -p /etc/sysctl.conf', nodes)
# NOTE(review): this exit is unconditional, so nothing below it runs, and
# the script ends with exit code 1.
sys.exit(1)


# With exactly one command-line argument it is handed to rm_exec (as a
# one-element list); otherwise a fixed ssh_config tweak is applied node by
# node.
if len(sys.argv) == 2:
    print 'Execute on parallel'
    ssa_tools_utils.rm_exec(sys.argv[1:], nodes)
else:
    for node in nodes:
        try:
            (rc, o) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'StrictHostKeyChecking no\'>> /etc/ssh/ssh_config"', node)
            #(rc, o) = ssa_tools_utils.execute_on_remote('date', node)
            #(rc, o) = ssa_tools_utils.execute_on_remote(sys.argv[1:], node)
            print '[%s] %s' % (node, o)
        # NOTE(review): the bare except also catches KeyboardInterrupt and
        # SystemExit; ``errors`` is defined outside the visible lines.
        except:
            print 'Failed on %s' % node
            errors.append(node)

#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'ulimit -c unlimited\'>> /root/.bashrc"', node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_cm\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_addr\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe rdma_ucm\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'modprobe ib_uverbs\'>> /etc/profile"'',' node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo `hostname` mthca0 > /sys/class/infiniband/mthca0/node_desc"'','node)
#    (rc',' out) = ssa_tools_utils.execute_on_remote('-- sh -c "echo \'echo `hostname` mthca0 > /sys/class/infiniband/mthca0/node_desc\'>>/etc/profile "'','node)
Exemple #26
0
    acms = [
        hostname,
    ]
    sample_gids = gids
    sample_lids = lids
else:
    sample_gids = random.sample(gids, min(len(gids), sample_size))
    sample_lids = random.sample(lids, min(len(lids), sample_size))

for node in acms:
    slid = data[node][LID]
    print '%s %s' % (node, slid)

for node in acms:
    if node == '': continue
    (_, sgid) = ssa_tools_utils.execute_on_remote(
        "/usr/sbin/ibaddr |awk '{print $2}'", node)
    slid = data[node][LID]

    (_, _) = ssa_tools_utils.execute_on_remote(
        '%s -f g -d %s -s %s -c -v' % (ib_acme, osmgid, sgid), node)
    (_, _) = ssa_tools_utils.execute_on_remote(
        '%s -f l -d %s -s %s -c -v' % (ib_acme, osmlid, slid), node)
    time.sleep(10)

    print 'Testing %s with %d GIDs' % (node, len(sample_gids))
    (rc, out0) = ssa_tools_utils.execute_on_remote('%s -P ' % ib_acme, node)
    print 'Before GID test\n', out0
    for gid in sample_gids:
        print '%s#  %s -f g -d %s -s %s -c -v' % (node, ib_acme, gid, sgid)
        (rc, out0) = ssa_tools_utils.execute_on_remote('%s -P ' % ib_acme,
                                                       node)