def oversample(args):
    print("+++ Reading input", file=sys.stderr)
    hashes, samples = read_input()
    print("+++ Converting to numpy array", file=sys.stderr)
    np_samples = np.array(samples, np.int32)
    print("+++ Performing SMOTE", file=sys.stderr)
    new_samples = smote.SMOTE(np_samples, args.smote_amount, args.neighbors)
    print("+++ Outputting new files", file=sys.stderr)
    output_new_files(hashes, new_samples, args)
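# For context, a minimal sketch of how oversample() might be driven from the
# command line. The flag names --smote-amount and --neighbors are assumptions,
# not the original project's CLI; read_input() and output_new_files() are
# assumed to come from the surrounding module.
import argparse
import sys

def main():
    parser = argparse.ArgumentParser(description="SMOTE-oversample input samples")
    parser.add_argument("--smote-amount", dest="smote_amount", type=int, default=100,
                        help="percentage of synthetic samples to generate")
    parser.add_argument("--neighbors", type=int, default=5,
                        help="k nearest neighbors used by SMOTE")
    oversample(parser.parse_args())

if __name__ == "__main__":
    main()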
def csv2py(self, filename, _smote=False, duplicate=False):
    "Convert a csv file to a model file"
    tbl = table(filename)
    if _smote:
        tbl = smote.SMOTE(tbl, atleast=50, atmost=101, bugIndx=1,
                          resample=duplicate)
    self.str2num(tbl)
    tonum = lambda x: self.translate[x] if isinstance(x, str) else x

    # Work around a bug in table.py that fails to separate dependent and
    # independent variables: drop every independent column whose name also
    # appears among the dependent columns. (The original popped from
    # tbl.indep while enumerating it, which can skip entries.)
    depen_names = {l.name for l in tbl.depen}
    tbl.indep = [k for k in tbl.indep if k.name not in depen_names]

    return self.data(indep=[i.name for i in tbl.indep],
                     less=[i.name for i in tbl.depen],
                     _rows=[[tonum(xx) for xx in x.cells] for x in tbl._rows])
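# Why the rewrite above avoids pop-while-enumerate: removing an element shifts
# the rest of the list left, so enumerate() silently skips the next item. A
# self-contained illustration (the names here are illustrative only):
cols = ["a", "dup", "dup", "b"]
for indx, name in enumerate(cols):
    if name == "dup":
        cols.pop(indx)            # skips the second "dup"
print(cols)                       # ['a', 'dup', 'b'] -- one duplicate survives

cols = ["a", "dup", "dup", "b"]
cols = [c for c in cols if c != "dup"]
print(cols)                       # ['a', 'b'] -- the filter removes both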
negTestLines = open("mohsen/80-20-hsa1/20_expression_neg.csv", 'r').readlines()[1:]

posTraining = np.array([[float(y) for y in x.split(',')[:-1]] for x in posTrainLines])
negTraining = np.array([[float(y) for y in x.split(',')[:-1]] for x in negTrainLines])
posTest = np.array([[float(y) for y in x.split(',')[:-1]] for x in posTestLines])
negTest = np.array([[float(y) for y in x.split(',')[:-1]] for x in negTestLines])

pList = []

# Use SMOTE to deal with class imbalance; the amount is a percentage of
# synthetic minority samples, so integer-divide to keep it a whole multiple
# of 100 (plain / would yield a float under Python 3)
posTraining = smote.SMOTE(posTraining, 100 * (len(negTraining) // len(posTraining)), 5)

# Build a single negative+positive training set
trainingArray = np.concatenate((posTraining, negTraining))
trainingClasses = np.array(['1'] * len(posTraining) + ['0'] * len(negTraining))

# Build a single negative+positive test set
testArray = np.concatenate((posTest, negTest))
testClasses = np.array(['1'] * len(posTest) + ['0'] * len(negTest))

# Build classifier on training data
rf = RandomForestClassifier(n_estimators=500)
rf.fit(trainingArray, trainingClasses)
# joblib.dump(rf, "smirpdeep_model.pkl")

# Get results for test set
predictions = rf.predict_proba(testArray)
predictions = np.hstack((predictions, np.atleast_2d(testClasses).T))
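# A possible follow-up, not part of the original script: one way to score the
# probabilistic predictions above. roc_auc_score is standard sklearn API, and
# column 1 of predict_proba corresponds to the positive class because
# rf.classes_ sorts the string labels as ['0', '1'].
from sklearn.metrics import roc_auc_score

posProbs = rf.predict_proba(testArray)[:, 1]
print("test AUC:", roc_auc_score(testClasses.astype(int), posProbs))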
# class distribution: {0.0: 10, 1.0: 417, 2.0: 7, 3.0: 9, 4.0: 106}
'''
Formula for the number of synthetic samples:
    no_of_samples_generated = x * no_of_minority_samples / 100
You provide x; x must be < 100 or a multiple of 100.
'''
print("smoting time for level 0")
# get the records for class 0
classVal = float(0)
records_per_class_0 = smote_utility.getRecordsPeClass(classVal, the_data_set)
#print records_
array_shaped_record = np.array(records_per_class_0)
print("original dataset", array_shaped_record.shape)

count_extra_synthetic_samples = 3200
# fix the sample count based on the number of neighbors
nearest_neighbors = 10  # expected n_neighbors <= n_samples; level 0 has 10 samples
smoted_dataset_0 = smote.SMOTE(array_shaped_record.shape, array_shaped_record,
                               count_extra_synthetic_samples, nearest_neighbors)
print("smoted dataset shape: level-0::", smoted_dataset_0.shape)
print("-----")

print("smoting time for level 1")
# get the records for class 1
classVal = float(1)
records_per_class_1 = smote_utility.getRecordsPeClass(classVal, the_data_set)
#print records_
array_shaped_record = np.array(records_per_class_1)
print("original dataset", array_shaped_record.shape)

count_extra_synthetic_samples = 50
nearest_neighbors = 10  # expected n_neighbors <= n_samples; level 1 has 417 samples, going with 10
smoted_dataset_1 = smote.SMOTE(array_shaped_record.shape, array_shaped_record,
                               count_extra_synthetic_samples, nearest_neighbors)
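# The custom smote.SMOTE used above isn't shown, so as a point of reference
# here is a minimal sketch of the classic SMOTE algorithm (Chawla et al.,
# 2002) with the same x-percent convention: for x >= 100, each minority
# sample spawns x/100 synthetic neighbors; for x < 100, a random x% subset is
# SMOTEd once each. The function name and parameters are illustrative only.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_sketch(samples, x_percent, k, rng=None):
    rng = rng or np.random.default_rng(0)
    samples = np.asarray(samples, dtype=float)
    if x_percent < 100:
        # SMOTE only a random x% subset, one synthetic sample each
        keep = rng.choice(len(samples), int(len(samples) * x_percent / 100),
                          replace=False)
        samples, per_sample = samples[keep], 1
    else:
        per_sample = x_percent // 100
    nn = NearestNeighbors(n_neighbors=k + 1).fit(samples)
    _, idx = nn.kneighbors(samples)           # idx[:, 0] is the sample itself
    synthetic = []
    for i, row in enumerate(samples):
        for _ in range(per_sample):
            neighbor = samples[rng.choice(idx[i, 1:])]
            gap = rng.random()                # interpolate a random fraction
            synthetic.append(row + gap * (neighbor - row))
    return np.array(synthetic)

# Worked check against the class distribution above: level 0 has 10 samples,
# so x = 3200 yields 32 * 10 = 320 synthetics; level 1 has 417 samples, so
# x = 50 SMOTEs a random 50% subset (208 samples), yielding 208 synthetics.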