Example #1
0
def oversample(args):
    """Oversample the input data set with SMOTE and emit the result.

    Reads samples via read_input(), converts them to an int32 matrix,
    generates synthetic samples with smote.SMOTE, and hands the result
    to output_new_files().  Progress is reported on stderr.

    args must carry:
        smote_amount -- SMOTE oversampling amount
        neighbors    -- k used for the nearest-neighbour step
    """
    print("+++ Reading input", file=sys.stderr)
    file_hashes, raw_samples = read_input()

    print("+++ Converting to numpy array", file=sys.stderr)
    sample_matrix = np.array(raw_samples, np.int32)

    print("+++ Performing SMOTE", file=sys.stderr)
    synthetic_samples = smote.SMOTE(sample_matrix, args.smote_amount,
                                    args.neighbors)

    print("+++ Outputting new files", file=sys.stderr)
    output_new_files(file_hashes, synthetic_samples, args)
Example #2
0
    def csv2py(self, filename, _smote=False, duplicate=False):
        """Convert a csv file to a model file.

        filename  -- path of the csv file to load via table()
        _smote    -- when True, rebalance the table with SMOTE first
        duplicate -- forwarded to SMOTE as its ``resample`` flag

        Returns a self.data(...) instance with independent / dependent
        column names and numeric rows.
        """
        tbl = table(filename)
        if _smote:
            tbl = smote.SMOTE(tbl,
                              atleast=50,
                              atmost=101,
                              bugIndx=1,
                              resample=duplicate)
        self.str2num(tbl)
        # Map symbolic cell values to their numeric codes; numbers pass through.
        tonum = lambda x: self.translate[x] if isinstance(x, str) else x

        # table.py does not separate dependent and independent variables, so
        # drop every independent column whose name also appears among the
        # dependent ones.  The original code popped from tbl.indep while
        # enumerating it, which skips the element that follows each removal;
        # filtering into a fresh list removes every duplicate reliably.
        depen_names = {d.name for d in tbl.depen}
        tbl.indep = [k for k in tbl.indep if k.name not in depen_names]

        return self.data(indep=[i.name for i in tbl.indep],
                         less=[i.name for i in tbl.depen],
                         _rows=map(lambda x: [tonum(xx) for xx in x.cells],
                                   tbl._rows))
Example #3
0
    # Load the negative test split, skipping the CSV header row.
    # NOTE(review): the file handle is never closed; a `with open(...)`
    # block would be safer — confirm before changing.
    negTestLines = open("mohsen/80-20-hsa1/20_expression_neg.csv",
                        'r').readlines()[1:]

    # Parse each CSV line into a float feature vector, dropping the last
    # column (presumably the class label — verify against the data files).
    posTraining = np.array([[float(y) for y in x.split(',')[:-1]]
                            for x in posTrainLines])
    negTraining = np.array([[float(y) for y in x.split(',')[:-1]]
                            for x in negTrainLines])
    posTest = np.array([[float(y) for y in x.split(',')[:-1]]
                        for x in posTestLines])
    negTest = np.array([[float(y) for y in x.split(',')[:-1]]
                        for x in negTestLines])

    # NOTE(review): pList is never used in this visible span — confirm it
    # is needed further down before removing.
    pList = []

    # Use SMOTE to deal with class imbalance: grow the positive class to
    # roughly the size of the negative class (second argument is the
    # oversampling amount as a percentage, third is k neighbours).
    # NOTE(review): under Python 2 semantics len(neg)/len(pos) is integer
    # division, which truncates the ratio — confirm the target interpreter.
    posTraining = smote.SMOTE(posTraining,
                              100 * (len(negTraining) / len(posTraining)), 5)
    # Build a single negative+positive training set
    trainingArray = np.concatenate((posTraining, negTraining))
    # Labels aligned with the concatenation order above; lengths are taken
    # after SMOTE, so synthetic positives are labelled '1' too.
    trainingClasses = np.array(['1'] * len(posTraining) +
                               ['0'] * len(negTraining))
    # Build a single negative+positive test set
    testArray = np.concatenate((posTest, negTest))
    testClasses = np.array(['1'] * len(posTest) + ['0'] * len(negTest))
    # Build classifier on training data
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(trainingArray, trainingClasses)

    # joblib.dump(rf, "smirpdeep_model.pkl")
    # Get results for test set: per-class probabilities with the true label
    # appended as a final column.
    predictions = rf.predict_proba(testArray)
    predictions = np.hstack((predictions, np.atleast_2d(testClasses).T))
Example #4
0
# Observed class distribution of the data set:
#{0.0: 10, 1.0: 417, 2.0: 7, 3.0: 9, 4.0: 106 }
'''
Formula for the number of synthetic samples:
    no_of_samples_you_want = x * no_of_neighbors / 100
You must provide x; x must be < 100 or a multiple of 100.
'''
print "smoting time for level 0 "
## SMOTE pass for level (class) 0
# Collect the records belonging to class 0.
classVal = float(0)
records_per_class_0 = smote_utility.getRecordsPeClass(classVal, the_data_set)
#print records_
array_shaped_record = np.array(records_per_class_0)
print "original datatset ", array_shaped_record.shape
count_extra_synthetic_samples = 3200  ## chosen per the formula above, based on the number of neighbors
nearest_nieghbors = 10 ### Expected n_neighbors <= n_samples; level 0 has 10 samples
# NOTE(review): this SMOTE variant takes the array shape as its first
# positional argument, unlike other call sites — confirm against the
# project's smote module before changing.
smoted_dataset_0 = smote.SMOTE(array_shaped_record.shape, array_shaped_record, count_extra_synthetic_samples, nearest_nieghbors)
print "smoted dataset shape: level-0::", smoted_dataset_0.shape 
print "-----"


print "smoting time for level 1 "
## SMOTE pass for level (class) 1
# Collect the records belonging to class 1.
classVal = float(1)
records_per_class_1 = smote_utility.getRecordsPeClass(classVal, the_data_set)
#print records_
array_shaped_record = np.array(records_per_class_1)
print "original datatset ", array_shaped_record.shape
count_extra_synthetic_samples = 50
nearest_nieghbors = 10 ### Expected n_neighbors <= n_samples; class 1 has 417 samples, going with 10
smoted_dataset_2 = smote.SMOTE(array_shaped_record.shape, array_shaped_record, count_extra_synthetic_samples, nearest_nieghbors)