Beispiel #1
0
 def get_vector(self,line,labels,fids):
     """
     Parse one line of a feature file into a FAST_VECTOR.

     The line is stripped and split on ATTRIBUTE_DELIMITTER. If the first
     token does not contain VALUE_DELIMITTER it is taken as the label:

     * a token that parses as a whole number becomes an int,
     * any other finite number becomes a float,
     * everything else (including "nan", "inf" and values that overflow
       to infinity) is kept as a stripped string.

     Each remaining token must consist of exactly two parts separated by
     VALUE_DELIMITTER (feature id, then value); tokens of any other shape
     are skipped. An all-digit feature id becomes an int, any other id a
     stripped string. The value is converted with float(), so a
     non-numeric value raises ValueError.

     labels and fids are updated in place: the parsed label and every
     parsed feature id are stored as keys with the value 1.

     Returns the populated FAST_VECTOR.
     """
     v = FAST_VECTOR()
     p = line.strip().split(ATTRIBUTE_DELIMITTER)
     # The first element can be the label.
     startPos = 0
     if VALUE_DELIMITTER not in p[0]:
         try:
             # If float() succeeds we have a numeric label.
             lbl = float(p[0])
             # Collapse whole-valued floats to int. int() raises
             # ValueError for NaN and OverflowError for infinity; both
             # fall through to the string-label branch below.
             if int(lbl) == lbl:
                 lbl = int(lbl)
         except (ValueError, OverflowError):
             # The label is not a finite number; keep it as a string.
             lbl = p[0].strip()

         v.label = lbl
         labels[v.label] = 1
         startPos = 1
     for ele in p[startPos:]:
         fele = ele.split(VALUE_DELIMITTER)
         if len(fele) != 2:
             # Not of the form "id<VALUE_DELIMITTER>value"; ignore it.
             continue
         featidStr, featvalStr = fele
         if featidStr.isdigit():
             fid = int(featidStr)
         else:
             fid = featidStr.strip()
         fval = float(featvalStr)
         fids[fid] = 1
         v.add(fid, fval)
     return v
Beispiel #2
0
    def get_vector(self, line, labels, fids):
        """
        Parse one line of a feature file into a FAST_VECTOR.

        The line is stripped and split on ATTRIBUTE_DELIMITTER. If the first
        token does not contain VALUE_DELIMITTER it is taken as the label:

        * a token that parses as a whole number becomes an int,
        * any other finite number becomes a float,
        * everything else (including "nan", "inf" and values that overflow
          to infinity) is kept as a stripped string.

        Each remaining token is split at its LAST VALUE_DELIMITTER into a
        feature id and a value, so the id itself may contain the delimiter.
        Tokens with no VALUE_DELIMITTER at all (for example the empty tokens
        produced by repeated attribute delimiters) are skipped. An all-digit
        feature id becomes an int, any other id a stripped string. The value
        is converted with float(), so a non-numeric value raises ValueError.

        labels and fids are updated in place: the parsed label and every
        parsed feature id are stored as keys with the value 1.

        Returns the populated FAST_VECTOR.
        """
        v = FAST_VECTOR()
        p = line.strip().split(ATTRIBUTE_DELIMITTER)
        # The first element can be the label.
        startPos = 0
        if VALUE_DELIMITTER not in p[0]:
            try:
                # If float() succeeds we have a numeric label.
                lbl = float(p[0])
                # Collapse whole-valued floats to int. int() raises
                # ValueError for NaN and OverflowError for infinity; both
                # fall through to the string-label branch below.
                if int(lbl) == lbl:
                    lbl = int(lbl)
            except (ValueError, OverflowError):
                # The label is not a finite number; keep it as a string.
                lbl = p[0].strip()

            v.label = lbl
            labels[v.label] = 1
            startPos = 1
        for ele in p[startPos:]:
            featidStr, sep, featvalStr = ele.rpartition(VALUE_DELIMITTER)
            if not sep:
                # No id/value separator: there is nothing to parse here.
                # Without this guard float() would fail on the token or an
                # empty-string feature id would be registered.
                continue
            if featidStr.isdigit():
                fid = int(featidStr)
            else:
                fid = featidStr.strip()
            fval = float(featvalStr)
            fids[fid] = 1
            v.add(fid, fval)
        return v
Beispiel #3
0
def oneSidedUnderSampling(trainFileName, sampleFileName):
    """
    Perform one-sided undersampling of the binary data in trainFileName and
    write the selected sample to sampleFileName.

    Instances whose label equals 1 are treated as positive, all others as
    negative. The class with strictly more instances is the majority class;
    on a tie the negative class is treated as the majority.

    The sample starts with every minority instance plus the one majority
    instance whose index is returned by getCenter. Each remaining majority
    instance is then classified with the 1-NN rule against the current
    sample, using numpy.linalg.norm of the difference as the distance.
    Misclassified instances are appended to the sample immediately, so
    later instances are classified against the grown sample.

    Progress and summary information is printed to standard output.
    """
    (labels, vects) = convertToArray(trainFileName)
    assert (len(labels) == len(vects))
    d = vects[0].size
    print("Dimensionality of feature vectors = %d" % d)

    # Split the instances by class to determine the minority class.
    posVects = []
    negVects = []
    for (label, vect) in zip(labels, vects):
        if label == 1:
            posVects.append((label, vect))
        else:
            negVects.append((label, vect))
    print("No. of positive instances = %d" % len(posVects))
    print("No. of negative instances = %d" % len(negVects))

    if len(posVects) > len(negVects):
        majorityClass = 1
        minorityClass = -1
        majorityInstances = posVects
        minorityInstances = negVects
    else:
        majorityClass = -1
        minorityClass = 1
        majorityInstances = negVects
        minorityInstances = posVects
    print("Majority class is %d" % majorityClass)
    print("Minority class is %d" % minorityClass)

    # The sample L is a list of (label, vect) tuples. Seed it with all
    # minority instances and one instance moved out of the majority class.
    L = list(minorityInstances)
    centIndex = getCenter(majorityInstances)
    L.append(majorityInstances.pop(centIndex))
    print("Index of the majority class selected for the initial sample = %d" % centIndex)

    # Use 1-NN classification to classify majority class instances using L.
    total = len(majorityInstances)
    # At least 1, so the modulo below cannot divide by zero when fewer
    # than ten majority instances remain.
    stepSize = max(1, total // 10)
    for (count, (label, vect)) in enumerate(majorityInstances, 1):
        if count % stepSize == 0:
            print("%d percent completed..." % ((100 * count) // total))
        minDist = float('infinity')
        minLabel = 0
        for (lbl, v) in L:
            dist = numpy.linalg.norm(v - vect)
            if dist < minDist:
                minDist = dist
                minLabel = lbl
        if minLabel != label:
            L.append((label, vect))

    # Write the selected sample to file.
    print("No. of instances in the sample = %d" % len(L))
    sampleFile = SEQUENTIAL_FILE_WRITER(sampleFileName)
    try:
        for (lbl, v) in L:
            vx = FAST_VECTOR()
            vx.createFromArray(lbl, v)
            sampleFile.writeVector(vx, WriteLabel=True)
    finally:
        # Close the writer even if writing one of the vectors fails.
        sampleFile.close()
Beispiel #4
0
def oneSidedUnderSampling(trainFileName, sampleFileName):
    """
    Perform one-sided undersampling of the binary data in trainFileName and
    write the selected sample to sampleFileName.

    Steps:
      1. Load (labels, vects) with convertToArray and split the instances
         into positives (label equal to 1) and negatives (any other label).
      2. Pick the class with strictly more instances as the majority class;
         a tie makes the negative class the majority.
      3. Seed the sample with all minority instances and the majority
         instance at the index returned by getCenter.
      4. Classify every remaining majority instance with the 1-NN rule
         against the current sample (distance is numpy.linalg.norm of the
         difference) and append each misclassified instance to the sample
         right away, so it takes part in later classifications.
      5. Write the sample, with labels, through SEQUENTIAL_FILE_WRITER.

    Progress and summary information is printed to standard output.
    """
    (labels, vects) = convertToArray(trainFileName)
    assert(len(labels) == len(vects))
    d = vects[0].size
    print("Dimensionality of feature vectors = %d" % d)

    # Determine the minority class.
    posVects = []
    negVects = []
    for (label, vect) in zip(labels, vects):
        bucket = posVects if label == 1 else negVects
        bucket.append((label, vect))
    print("No. of positive instances = %d" % len(posVects))
    print("No. of negative instances = %d" % len(negVects))

    if len(posVects) > len(negVects):
        majorityClass, minorityClass = 1, -1
        majorityInstances, minorityInstances = posVects, negVects
    else:
        majorityClass, minorityClass = -1, 1
        majorityInstances, minorityInstances = negVects, posVects
    print("Majority class is %d" % majorityClass)
    print("Minority class is %d" % minorityClass)

    # The sample is the list L of tuples of the format (label, vect):
    # all minority instances plus one instance taken from the majority.
    L = []
    L.extend(minorityInstances)
    centIndex = getCenter(majorityInstances)
    L.append(majorityInstances[centIndex])
    del majorityInstances[centIndex]
    print("Index of the majority class selected for the initial sample = %d" % centIndex)

    # Use 1-NN classification to classify majority class instances using L.
    remaining = len(majorityInstances)
    # Report roughly every tenth of the work. The step is clamped to 1 so
    # that fewer than ten remaining instances cannot cause a modulo by zero.
    stepSize = max(1, remaining // 10)
    count = 0
    for (label, vect) in majorityInstances:
        count += 1
        if (count % stepSize) == 0:
            print("%d percent completed..." % ((100 * count) // remaining))
        minDist = float('infinity')
        minLabel = 0
        for (lbl, v) in L:
            dist = numpy.linalg.norm(v - vect)
            if dist < minDist:
                minDist = dist
                minLabel = lbl
        if minLabel != label:
            L.append((label, vect))

    # Write the selected sample to file.
    print("No. of instances in the sample = %d" % len(L))
    sampleFile = SEQUENTIAL_FILE_WRITER(sampleFileName)
    try:
        for (lbl, v) in L:
            vx = FAST_VECTOR()
            vx.createFromArray(lbl, v)
            sampleFile.writeVector(vx, WriteLabel=True)
    finally:
        # Make sure the writer is closed even when a write raises.
        sampleFile.close()