Beispiel #1
0
    def create_newdata(header, x):
        """Drop the attributes ranked at or beyond ``keepcnt`` and rebuild derived data.

        Names that are not parameters or locals (``V``, ``options``, ``table``,
        ``keepcnt``, ``lost``, ``iteration``, ``y``) are read from the
        surrounding scope.

        Every row of ``table`` at position ``keepcnt`` or later is treated as
        removed: its name (``row[0]``) is recorded in ``lost`` with the value
        ``iteration + 1`` and its column is deleted from both ``header`` and
        ``x``.  NOTE(review): ``table`` is presumably ordered best-first, so
        the tail holds the lowest-scoring attributes -- confirm at the caller.

        Args:
            header: attribute names; must support ``.index`` (a list).
            x: 2-D data array whose columns line up with ``header``.

        Returns:
            Tuple ``(header, x, attr, var, distArray, lost)`` where ``header``
            and ``x`` are the reduced versions, ``var`` / ``attr`` are
            recomputed from them, and ``distArray`` is the new distance array.

        Raises:
            ValueError: if a name taken from ``table`` is not in ``header``.
        """
        if V:
            print('Reducing attributes by ' + str(options['turfpct']) + '%')
            sys.stdout.flush()

        # Collect the column index of every attribute that falls past the
        # keep threshold, recording when it was dropped.
        dlist = []
        for rank, row in enumerate(table):
            if rank < keepcnt:
                continue
            name = row[0]
            lost[name] = iteration + 1
            dlist.append(header.index(name))

        header = np.delete(header, dlist).tolist()  # remove orphans from header
        x = np.delete(x, dlist, axis=1)  # remove orphaned attributes from data
        x = np.ascontiguousarray(x, dtype=np.double)

        if V:
            print('Getting new variables, attributes and distance array')
            sys.stdout.flush()

        # The data survey has to be redone because the column set changed.
        var = cmn.getVariables(header, x, y, options)
        attr = cmn.getAttributeInfo(header, x, var, options)

        if V:
            print("---------------  Parameters  ---------------")
            print("datatype:   " + var['dataType'])
            print("attributes: " + str(var['NumAttributes']))

            if var['dataType'] == 'mixed':
                print("    continuous: " + str(var['cpct'][1]))
                print("    discrete:   " + str(var['dpct'][1]))
            if var['mdcnt'] > 0:
                print("missing:    " + str(var['mdcnt']))
            print("--------------------------------------------")
            sys.stdout.flush()

        begin = tm.time()
        diffs, cidx, didx = cmn.dtypeArray(header, attr, var)
        if var['mdcnt'] > 0:
            # Imported lazily: only needed when the data has missing values.
            import mmDistance as md
            distArray = md.getDistances(x[:, cidx], x[:, didx], var,
                                        diffs[cidx])
            disttype = "missing"
        else:
            distArray = cmn.getDistances(x, attr, var, cidx, didx)
            disttype = "discrete/continuous/mixed"

        if V:
            print(disttype + " distance array elapsed time(sec) = " +
                  str(tm.time() - begin))
            sys.stdout.flush()

        return header, x, attr, var, distArray, lost
    if (var['classType'] == 'multiclass'):
        yset = var['phenoTypeList']
        print("  classes:  " + str(len(yset)))
    print("classname:  " + var['phenoTypeName'])
    print("algorithm:  " + options['algorithm'])
    print("--------------------------------------------")
    sys.stdout.flush()
#-----------------------------------------------------------------------------#
# create distance array and remove intermediate data
# if the data contains missing values use mmDistance.getDistances, otherwise cmn.getDistances
#
# Time the construction of the pairwise distance array.
begin = tm.time()
diffs, cidx, didx = cmn.dtypeArray(header, attr, var)

# Any missing values (mdcnt > 0) route through mmDistance; otherwise the
# common implementation handles the data.
if var['mdcnt'] > 0:
    import mmDistance as md
    distArray = md.getDistances(
        x[:, cidx], x[:, didx], var, diffs[cidx])
    disttype = "missing"
else:
    distArray = cmn.getDistances(
        x, attr, var, cidx, didx, cheader)
    disttype = "discrete/continuous/mixed"

# Verbose mode: report which distance routine ran and how long it took.
if V:
    ctime = "".join(["[", tm.strftime("%H:%M:%S"), "]"])
    print("".join([ctime, " ", disttype, " distance array time(sec) = ",
                   str(tm.time() - begin)]))
    sys.stdout.flush()
#############################################################################


###################################################################################################################################################
def test_relieff_Multiplexer():
    """ Test ReliefF on 6-bit Multiplexer"""
Beispiel #3
0
        yset = var['phenoTypeList']
        print("  classes:  " + str(len(yset)))
    print("datatype:  " + var['dataType'])
    print("classname:  " + var['phenoTypeName'])
    print("algorithm:  " + options['algorithm'])
    print("--------------------------------------------")
    sys.stdout.flush()
#-----------------------------------------------------------------------------#
# create distance array and remove intermediate data
# if the data contains missing values use mmDistance.getDistances, otherwise cmn.getDistances
#
# Start the clock before any distance-related work.
begin = tm.time()
diffs, cidx, didx = cmn.dtypeArray(header, attr, var)

if var['mdcnt'] > 0:
    # Data with missing values uses the mmDistance module.
    import mmDistance as md
    distArray = md.getDistances(x[:, cidx], x[:, didx], var, diffs[cidx])
    disttype = "missing"
else:
    # No missing values: use the common distance routine.
    distArray = cmn.getDistances(x, attr, var, cidx, didx, cheader)
    disttype = "discrete/continuous/mixed"

if V:
    # Timestamped progress line, flushed immediately.
    ctime = "[" + tm.strftime("%H:%M:%S") + "]"
    print(
        ctime + " " + disttype + " distance array time(sec) = " +
        str(tm.time() - begin)
    )
    sys.stdout.flush()

#-----------------------------------------------------------------------------#
# get Scores based on algorithm selected (-a and -t)
#
if(turfpct > 0):  # Use TURF
    import Turf as T
Beispiel #4
0
    def create_newdata(header, x):
        """Prune the lowest-scoring features and recompute what depends on them.

        Reads ``V``, ``options``, ``table``, ``keepcnt``, ``lost``,
        ``iteration`` and ``y`` from the surrounding scope.  ``table`` lists
        features by decreasing score; every entry at position ``keepcnt`` or
        later is removed from ``header`` and ``x`` and recorded in ``lost``
        with the value ``iteration + 1``.

        Args:
            header: feature names; must support ``.index`` (a list).
            x: 2-D data array whose columns line up with ``header``.

        Returns:
            Tuple ``(header, x, attr, var, distArray, lost)`` built from the
            reduced data.
        """
        if V:
            print('Reducing attributes by ' + str(options['turfpct']) + '%')
            sys.stdout.flush()

        # Walk the ranked table; everything past the keep threshold is dropped.
        drop_cols = []
        for position, entry in enumerate(table):
            if position < keepcnt:
                continue
            feature = entry[0]
            lost[feature] = iteration + 1
            drop_cols.append(header.index(feature))  # column to delete

        # Apply the removal to the names and to the data matrix alike.
        header = np.delete(header, drop_cols).tolist()
        x = np.ascontiguousarray(np.delete(x, drop_cols, axis=1),
                                 dtype=np.double)

        if V:
            print('Getting new variables, attributes and distance array')
            sys.stdout.flush()

        # Re-survey the reduced data set.
        var = cmn.getVariables(header, x, y, options)
        attr = cmn.getAttributeInfo(header, x, var, options)

        # Names of the features the survey classified as continuous.
        continuous_names = [name for name in header
                            if attr[name][0] == 'continuous']

        if V:
            print("---------------  Parameters  ---------------")
            print("datatype:   " + var['dataType'])
            print("attributes: " + str(var['NumAttributes']))

            if var['dataType'] == 'mixed':
                print("    continuous: " + str(var['cpct'][1]))
                print("    discrete:   " + str(var['dpct'][1]))
            if var['mdcnt'] > 0:
                print("missing:    " + str(var['mdcnt']))
            print("--------------------------------------------")
            sys.stdout.flush()

        started = tm.time()
        diffs, cont_idx, disc_idx = cmn.dtypeArray(header, attr, var)

        # Pick the distance routine based on whether any values are missing.
        if var['mdcnt'] > 0:
            import mmDistance as md
            distArray = md.getDistances(x[:, cont_idx], x[:, disc_idx], var,
                                        diffs[cont_idx])
            label = "missing"
        else:
            distArray = cmn.getDistances(x, attr, var, cont_idx, disc_idx,
                                         continuous_names)
            label = "discrete/continuous/mixed"

        if V:
            print(label + " distance array elapsed time(sec) = "
                  + str(tm.time() - started))
            sys.stdout.flush()

        return header, x, attr, var, distArray, lost