def averageTrainTest():
    """Average the two values returned by ``trainandtest`` over 500 runs.

    The training metrics and bug labels are read from the semicolon-separated
    CSV files under ``data/``, oversampled with regular SMOTE (ratio 3), passed
    through the project's ``interquantileRange`` step, and then handed to
    ``trainandtest`` 500 times.

    Returns
    -------
    tuple
        ``(mean of result[0], mean of result[1])`` across the 500 runs.
    """
    # Both files are separated at semicolons instead of commas.
    features = pd.read_csv('data/source-code-metrics_train.csv', ';')
    bug_labels = pd.read_csv('data/bugs_train.csv', ';')
    for frame in (features, bug_labels):
        frame.set_index('classid', inplace=True)

    # Section: SMOTE for class balance
    from unbalanced_dataset import SMOTE  # , TomekLinks
    feature_names = list(features)
    oversampler = SMOTE(ratio=3, verbose=False, kind='regular')
    balanced_x, balanced_y = oversampler.fit_transform(
        features.as_matrix(), bug_labels.as_matrix().ravel())
    features = pd.DataFrame(balanced_x, columns=feature_names)
    bug_labels = pd.DataFrame(balanced_y, columns=['bugs'])

    # Section: outlier detection
    from myOutlierDetection import interquantileRange
    # The return value is not used here.
    # NOTE(review): presumably this filters `features` in place -- confirm.
    interquantileRange(features, perFeature=False)

    runs = [trainandtest(features, bug_labels) for _ in range(500)]
    run_count = len(runs)
    first_mean = sum(run[0] for run in runs) / run_count
    second_mean = sum(run[1] for run in runs) / run_count
    return (first_mean, second_mean)
def runCrossValidation(runSMOTE = True, runIQR = True):
    """Load the training set, optionally rebalance/clean it, and cross-validate.

    Parameters
    ----------
    runSMOTE : bool
        When true, oversample with regular SMOTE (ratio 3) before validating.
    runIQR : bool
        When true, call the project's ``interquantileRange`` on the features.

    Returns
    -------
    Whatever ``crossvalidate`` returns for the feature matrix and the
    flattened label vector.
    """
    # Both files are separated at semicolons instead of commas.
    features = pd.read_csv('data/source-code-metrics_train.csv', ';')
    bug_labels = pd.read_csv('data/bugs_train.csv', ';')
    for frame in (features, bug_labels):
        frame.set_index('classid', inplace=True)

    if runSMOTE:
        # Section: SMOTE for class balance
        from unbalanced_dataset import SMOTE  # , TomekLinks
        feature_names = list(features)
        oversampler = SMOTE(ratio=3, verbose=False, kind='regular')
        balanced_x, balanced_y = oversampler.fit_transform(
            features.as_matrix(), bug_labels.as_matrix().ravel())
        features = pd.DataFrame(balanced_x, columns=feature_names)
        bug_labels = pd.DataFrame(balanced_y, columns=['bugs'])

    if runIQR:
        # Section: outlier detection
        from myOutlierDetection import interquantileRange
        # The return value is not used here.
        # NOTE(review): presumably this filters `features` in place -- confirm.
        interquantileRange(features, perFeature=False)

    feature_matrix = features.as_matrix()
    label_vector = bug_labels.as_matrix().ravel()
    return crossvalidate(feature_matrix, label_vector)
def test_smote(x, y):
    """Run each SMOTE variant once on ``(x, y)``, printing a banner per variant.

    The resampled output is discarded; the function only checks that every
    variant can be fitted. Uses the module-level ``verbose`` flag.
    """
    # (banner, SMOTE kind, extra keyword arguments) in execution order.
    variants = (
        ('SMOTE', 'regular', {}),
        ('SMOTE bordeline 1', 'borderline1', {}),
        ('SMOTE bordeline 2', 'borderline2', {}),
        ('SMOTE SVM', 'svm', {'class_weight': 'auto'}),
    )
    for banner, kind, extra_kwargs in variants:
        print(banner)
        sampler = SMOTE(kind=kind, verbose=verbose, **extra_kwargs)
        # Unpack so a result that is not a two-item pair still raises here.
        _resampled_x, _resampled_y = sampler.fit_transform(x, y)
def sampling():
    """Resample the training split with every sampler and evaluate each result.

    Reads the module-level ``data_train1`` / ``target_train1`` arrays. For each
    sampler, in order (random over-sampling, the four SMOTE kinds, SMOTE+Tomek,
    SMOTE+ENN, EasyEnsemble, BalanceCascade), the split is resampled and the
    result is passed to ``random_methods``.

    The borderline-1 step used to fit on ``data_train`` / ``target_train``
    while the ratio and every other step use ``data_train1`` /
    ``target_train1``; it now uses the same split as the rest.
    """
    verbose = False

    class_counts = np.bincount(target_train1)
    print(class_counts)
    # Ratio handed to the samplers: count of label 2 over count of label 1.
    ratio = float(class_counts[2]) / float(class_counts[1])

    def evaluate(sampler):
        # Resample the shared training split and score the resampled data.
        resampled_x, resampled_y = sampler.fit_transform(
            data_train1, target_train1)
        random_methods(resampled_x, resampled_y)

    # 'Random over-sampling'
    evaluate(OverSampler(ratio=ratio, verbose=verbose))
    # 'SMOTE'
    evaluate(SMOTE(ratio=ratio, verbose=verbose, kind='regular'))
    # 'SMOTE borderline 1'
    evaluate(SMOTE(ratio=ratio, verbose=verbose, kind='borderline1'))
    # 'SMOTE borderline 2'
    evaluate(SMOTE(ratio=ratio, verbose=verbose, kind='borderline2'))
    # 'SMOTE SVM'
    svm_args = {'class_weight': 'auto'}
    evaluate(SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args))
    # 'SMOTE Tomek links'
    evaluate(SMOTETomek(ratio=ratio, verbose=verbose))
    # 'SMOTE ENN'
    evaluate(SMOTEENN(ratio=ratio, verbose=verbose))
    # 'EasyEnsemble'
    evaluate(EasyEnsemble(verbose=verbose))
    # 'BalanceCascade'
    evaluate(BalanceCascade(verbose=verbose))
def resample(self, X, y, t, fold):
    """Oversample one (timestep, fold) training set and store the result.

    Parameters
    ----------
    X : feature matrix for this timestep/fold.
    y : label container; only ``y[0]`` is passed to the resampler.
    t : timestep index, used for logging and as the storage key.
    fold : fold index, used for logging and as the storage key.

    Returns
    -------
    tuple
        ``(X, y)`` unchanged when resampling is disabled or the configured
        method is not implemented; otherwise ``(Xsmote, ysmote_tuple)`` where
        ``ysmote_tuple`` comes from ``self.build_smoted_label_tuple``.

    Notes
    -----
    Only ``resample_method == 'over'`` with ``oversample_method == 'smote'``
    (case-insensitive) is implemented. Any other ``resample_method`` used to
    fall through to an unbound ``resampler`` (NameError); it is now reported
    and the input is returned unchanged, like the other unimplemented cases.
    """
    if not self.resample_method:
        return X, y

    start = time.time()
    if self.verbose:
        ptf('> Resampling for timestep %d, fold %d' % (t, fold), self.logfile)

    # create resampler
    if self.resample_method == 'under':
        print('UNDER SAMPLING is not implemented yet')
        return X, y
    if (self.resample_method != 'over'
            or self.oversample_method.lower() != 'smote'):
        print('Your resampling method is not implemented yet')
        return X, y
    resampler = SMOTE(**self.oversample_arguments)

    # Single-argument print keeps the space-separated output unchanged.
    print('%s %s' % (type(X), type(y)))
    print('%s %s' % (X.shape, y[0].shape))

    Xsmote, ysmote = resampler.fit_transform(X, y[0])

    # resample
    ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)

    if self.debug:
        # Class counts before and after resampling.
        print('%s %s' % (np.sum(y[0] == 0), np.sum(ysmote == 0)))
        print('%s %s' % (np.sum(y[0] == 1), np.sum(ysmote == 1)))

    if self.on_disk:
        self.pickle_time_step(ysmote_tuple, 'trigger_resample_labels',
                              fold=fold, t=t)
        self.pickle_time_step(Xsmote, 'trigger_resample_features',
                              fold=fold, t=t)
    else:
        self.trigger_resample_labels[fold][t] = ysmote_tuple
        self.trigger_resample_features[fold][t] = Xsmote

    end = time.time()
    if self.verbose:
        ptf('... %d s' % (end - start), self.logfile)
    return Xsmote, ysmote_tuple
def CrossValidateSMOTE(data, labels, clf, folds=10, runSMOTE=True):
    """Stratified k-fold cross-validation with optional per-fold SMOTE.

    SMOTE (borderline1) is applied to the training part of each fold only;
    the test part is never resampled.

    Parameters
    ----------
    data : numpy array or pandas object with ``as_matrix()``
        Feature matrix.
    labels : numpy array or pandas object with ``as_matrix()``
        Labels; summed to count positives, so they are treated as 0/1.
    clf : estimator with ``fit`` and ``predict``.
    folds : int
        Number of stratified folds.
    runSMOTE : bool
        Whether to oversample each training fold.

    Returns
    -------
    acc, fmeasure, recall, precision : list
        One score per fold.
    cm : numpy array, shape (2, 2)
        Confusion matrices summed over all folds.

    Notes
    -----
    The metric functions take ``(y_true, y_pred)``. They were previously called
    with the arguments reversed, which swapped the reported recall and
    precision; the true labels are now passed first.
    """
    from unbalanced_dataset import SMOTE
    from sklearn.metrics import confusion_matrix as confmat
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    if not isinstance(data, np.ndarray):
        data = data.as_matrix()
    if not isinstance(labels, np.ndarray):
        labels = labels.as_matrix().ravel()

    skf = StratifiedKFold(labels, n_folds=folds, shuffle=False)

    acc = []
    fmeasure = []
    recall = []
    precision = []
    cm = np.zeros((2, 2), dtype=int)

    for train_idx, test_idx in skf:
        data_train = data[train_idx]
        labels_train = labels[train_idx]
        data_test = data[test_idx]
        labels_test = labels[test_idx]

        # Negatives-to-positives ratio of this training fold.
        bugs = sum(labels_train)
        ratio = float(len(labels_train) - bugs) / bugs

        if runSMOTE:
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            data_train, labels_train = smote.fit_transform(data_train, labels_train)

        clf.fit(data_train, labels_train)
        hypot = clf.predict(data_test)

        # scikit-learn metrics expect (y_true, y_pred).
        acc.append(accuracy_score(labels_test, hypot))
        fmeasure.append(f1_score(labels_test, hypot))
        recall.append(recall_score(labels_test, hypot))
        precision.append(precision_score(labels_test, hypot))
        cm += confmat(labels_test, hypot)

    return acc, fmeasure, recall, precision, cm
def resample(self, X, y, t, fold):
    """Oversample one (timestep, fold) training set and store the result.

    Parameters
    ----------
    X : feature matrix for this timestep/fold.
    y : label container; only ``y[0]`` is passed to the resampler.
    t : timestep index, used for logging and as the storage key.
    fold : fold index, used for logging and as the storage key.

    Returns
    -------
    tuple
        ``(X, y)`` unchanged when resampling is disabled or the configured
        method is not implemented; otherwise ``(Xsmote, ysmote_tuple)`` where
        ``ysmote_tuple`` comes from ``self.build_smoted_label_tuple``.

    Notes
    -----
    Only ``resample_method == 'over'`` with ``oversample_method == 'smote'``
    (case-insensitive) is implemented. Any other ``resample_method`` used to
    fall through to an unbound ``resampler`` (NameError); it is now reported
    and the input is returned unchanged, like the other unimplemented cases.
    """
    if not self.resample_method:
        return X, y

    start = time.time()
    if self.verbose:
        ptf('> Resampling for timestep %d, fold %d' % (t, fold), self.logfile)

    # create resampler
    if self.resample_method == 'under':
        print('UNDER SAMPLING is not implemented yet')
        return X, y
    if (self.resample_method != 'over'
            or self.oversample_method.lower() != 'smote'):
        print('Your resampling method is not implemented yet')
        return X, y
    resampler = SMOTE(**self.oversample_arguments)

    # Single-argument print keeps the space-separated output unchanged.
    print('%s %s' % (type(X), type(y)))
    print('%s %s' % (X.shape, y[0].shape))

    Xsmote, ysmote = resampler.fit_transform(X, y[0])

    # resample
    ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)

    if self.debug:
        # Class counts before and after resampling.
        print('%s %s' % (np.sum(y[0] == 0), np.sum(ysmote == 0)))
        print('%s %s' % (np.sum(y[0] == 1), np.sum(ysmote == 1)))

    if self.on_disk:
        self.pickle_time_step(ysmote_tuple, 'trigger_resample_labels',
                              fold=fold, t=t)
        self.pickle_time_step(Xsmote, 'trigger_resample_features',
                              fold=fold, t=t)
    else:
        self.trigger_resample_labels[fold][t] = ysmote_tuple
        self.trigger_resample_features[fold][t] = Xsmote

    end = time.time()
    if self.verbose:
        ptf('... %d s' % (end - start), self.logfile)
    return Xsmote, ysmote_tuple
def train_with_kmeans(X,Y,W):
    """Split the targets into two groups and fit a classifier plus one regressor per group.

    Steps, as implemented:
      1. Cluster the target values ``Y`` into two KMeans clusters.
      2. Take the cluster with the lower mean target, re-cluster it into two,
         and move its higher-mean sub-cluster over to the other group.
      3. Oversample ``(X, group)`` with regular SMOTE at half the
         majority/minority count ratio and fit a 300-estimator AdaBoost
         classifier on it to predict the group.
      4. Fit one ``LinearRegression`` per group on the original (not
         oversampled) rows, passing the matching slice of ``W`` as the third
         positional argument of ``fit``.

    Returns
    -------
    (classifier, regressors) where ``regressors[i]`` belongs to group ``i``.
    """
    n_groups = 2
    clusterer = KMeans(n_groups)
    groups = clusterer.fit_predict(np.reshape(Y, [len(Y), 1]))

    # Group whose targets have the lower mean (ties resolve to group 0).
    mean_zero, mean_one = np.mean(Y[groups == 0]), np.mean(Y[groups == 1])
    low_group = 1 if mean_zero > mean_one else 0

    low_rows = np.where(groups == low_group)[0]
    low_targets = Y[groups == low_group]
    sub_groups = clusterer.fit_predict(
        np.reshape(low_targets, [len(low_targets), 1]))

    # Sub-cluster of the low group with the higher mean (ties resolve to 1).
    sub_mean_zero = np.mean(low_targets[sub_groups == 0])
    sub_mean_one = np.mean(low_targets[sub_groups == 1])
    moved_sub_group = 0 if sub_mean_zero > sub_mean_one else 1
    groups[low_rows[sub_groups == moved_sub_group]] = abs(1 - low_group)

    zeros = float(sum(groups == 0))
    ones = float(sum(groups == 1))
    # Larger count over smaller count.
    imbalance = zeros / ones if zeros > ones else ones / zeros

    oversampler = SMOTE(ratio=imbalance / 2, kind='regular')
    balanced_x, balanced_groups = oversampler.fit_transform(X, groups)
    classifier = AdaBoostClassifier(n_estimators=300)
    classifier.fit(balanced_x, balanced_groups)

    group_ids = range(n_groups)
    x_parts = [X[groups == g] for g in group_ids]
    y_parts = [Y[groups == g] for g in group_ids]
    w_parts = [W[groups == g] for g in group_ids]

    regressors = []
    for part_x, part_y, part_w in zip(x_parts, y_parts, w_parts):
        model = LinearRegression()
        model.fit(part_x, part_y, part_w)
        regressors.append(model)
    return classifier, regressors
def _sample_values(X, y, method=None, ratio=1, verbose=False): """Perform any kind of sampling(over and under). Parameters ---------- X : array, shape = [n_samples, n_features] Data. y : array, shape = [n_samples] Target. method : str, optional default: None Over or under smapling method. ratio: float Unbalanced class ratio. Returns ------- X, y : tuple Sampled X and y. """ if method == 'SMOTE': sampler = SMOTE(ratio=ratio, verbose=verbose) elif method == 'SMOTEENN': ratio = ratio * 0.3 sampler = SMOTEENN(ratio=ratio, verbose=verbose) elif method == 'random_over_sample': sampler = OverSampler(ratio=ratio, verbose=verbose) elif method == 'random_under_sample': sampler = UnderSampler(verbose=verbose) elif method == 'TomekLinks': sampler = TomekLinks(verbose=verbose) return sampler.fit_transform(X, y)
# Load every training archive given on the command line, optionally
# oversample each one with SMOTE, and stack them into `targets` / `m`.
# NOTE(review): the original indentation was lost; the extent of the loop and
# of the if/else bodies below is a best reading -- confirm against the file.
for f in args.read_train:
    npzfile = np.load(f)
    print "Read %d instances from %s" \
        % (npzfile['feature_matrix'].shape[0], f.name)
    # One target per feature row.
    assert npzfile['targets'].size == npzfile['feature_matrix'].shape[0]
    tgt, fm = npzfile['targets'], npzfile['feature_matrix']
    print "target size: ", tgt.shape
    print "positive examples: ", sum(sum(tgt))
    # Flatten the targets to one dimension.
    tgt = tgt.reshape(tgt.size)
    if args.smote:
        # Ratio of 0-labelled to 1-labelled instances in this archive.
        ratio = float(np.count_nonzero(tgt == 0)) / \
            float(np.count_nonzero(tgt == 1))
        OS = SMOTE(ratio=ratio, kind='regular')
        fm, tgt = OS.fit_transform(fm, tgt)
    if targets is None:
        # First archive: start the accumulators.
        targets = tgt
        m = fm
    else:
        print "Before concat: ", targets.shape, tgt.shape
        targets = np.concatenate((targets, tgt), axis=0)
        m = np.concatenate((m, fm), axis=0)
        print "After concat: ", targets.shape, tgt.shape
    assert targets.size == m.shape[0]
    assert m.shape[0] == targets.shape[0]
print "Sum of targets: ", sum(targets)
print "Instances x features: ", m.shape
# Script: oversample a libsvm-format training split with SMOTE, train a
# decision tree on it, and evaluate on the held-out split.
from sklearn import tree
from sklearn.datasets import load_svmlight_file
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import numpy as np
from unbalanced_dataset import SMOTE

# Load the original data in libsvm format.
data,label=load_svmlight_file("/home/hadoop/input/libsvm.data")

# Hold out 30% of the original data set as the test set.
x_train,x_test,y_train,y_test=cross_validation.train_test_split(data,label,test_size=0.3,random_state=0)

# Apply the SMOTE algorithm to the training data only.
verbose = False
# Ratio of 0-labelled to 1-labelled training samples.
ratio = float(np.count_nonzero(y_train==0)) / float(np.count_nonzero(y_train==1))
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
# The training features are converted with toarray() before resampling.
smox, smoy = smote.fit_transform(x_train.toarray(), y_train)
# Class counts after resampling.
print np.count_nonzero(smoy==1)
print np.count_nonzero(smoy==0)

# Train a decision tree model on the resampled data.
clf=tree.DecisionTreeClassifier()
clf=clf.fit(smox,smoy)
#score=clf.score(x_test,y_test)

# Predict on the test data.
y_pred=clf.predict(x_test)
print y_pred

# Model evaluation.
confusion=confusion_matrix(y_test,y_pred)
print confusion
accruacy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
x = numpy.delete(x, [3 + i * 11 for i in xrange(6)], axis=1) #print x[0] #x = normalize(x, axis=0) #x = scale(x, axis=0) y = df[:, -1] y = map(lambda x: int(x), y) def f6(x): if x == 6: return 2 elif x == 7: return 6 else: return x y = map(f6, y) y = numpy.array(y) sm = SMOTE(kind='regular') for i in xrange(10): x, y = sm.fit_transform(x, y) clf = RandomForestClassifier(n_estimators=100, class_weight='auto') pr = cross_validation.cross_val_predict(clf, x, y, cv=10) #clf.fit(x,y) print metrics.accuracy_score(y, pr) print metrics.confusion_matrix(y, pr) #joblib.dump(clf, 'rand_forest_model_1.pkl')
edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) axes[2, 2].scatter(ncrx_vis[ncry==1, 0], ncrx_vis[ncry==1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) axes[2, 2].set_title('Neighboorhood cleaning rule', fontsize=fs) plt.show() # Generate the new dataset using under-sampling method verbose = False ratio = float(np.count_nonzero(y==1)) / float(np.count_nonzero(y==0)) # 'Random over-sampling' OS = OverSampler(ratio=ratio, verbose=verbose) osx, osy = OS.fit_transform(x, y) # 'SMOTE' smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular') smox, smoy = smote.fit_transform(x, y) # 'SMOTE bordeline 1' bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1') bs1x, bs1y = bsmote1.fit_transform(x, y) # 'SMOTE bordeline 2' bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2') bs2x, bs2y = bsmote2.fit_transform(x, y) # 'SMOTE SVM' svm_args={'class_weight' : 'auto'} svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args) svsx, svsy = svmsmote.fit_transform(x, y) # 'SMOTE Tomek links' STK = SMOTETomek(ratio=ratio, verbose=verbose) stkx, stky = STK.fit_transform(x, y) # 'SMOTE ENN' SENN = SMOTEENN(ratio=ratio, verbose=verbose)
def smote_boderline2(self):
    """Oversample ``self.x`` / ``self.y`` with borderline-2 SMOTE.

    Returns the resampled ``(x, y)`` pair.
    """
    sampler = SMOTE(kind='borderline2', ratio=self._ratio,
                    verbose=self.verbose)
    resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
    return resampled_x, resampled_y
def smote_svm(self):
    """Oversample ``self.x`` / ``self.y`` with SVM-based SMOTE.

    ``class_weight='auto'`` is passed to SMOTE as an extra keyword argument.
    Returns the resampled ``(x, y)`` pair.
    """
    sampler = SMOTE(kind='svm', ratio=self._ratio, verbose=self.verbose,
                    class_weight='auto')
    resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
    return resampled_x, resampled_y
def smote(self):
    """Oversample ``self.x`` / ``self.y`` with regular SMOTE.

    Returns the resampled ``(x, y)`` pair.
    """
    sampler = SMOTE(kind='regular', ratio=self._ratio, verbose=self.verbose)
    resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
    return resampled_x, resampled_y
# Script: embed the defect data with locally linear embedding, oversample the
# training split with SMOTE, train an SVM and print evaluation output.
import numpy as np
train=pd.read_csv('./cmv.csv')
# Encode the 'Defective' column: 'Y' -> 1, 'N' -> 0.
train['Defective']=train['Defective'].map({'Y':1,'N':0})
print type(train.values)
train=train.values
print train[0:1]
# Embed every column except the last into 4 components.
# NOTE(review): the embedding is computed on all rows before the train/test
# split below -- confirm that is intended.
X_r,err=manifold.locally_linear_embedding(train[:,0:-1],n_neighbors=12,n_components=4)
print("Done. Reconstruction error: %g" % err)
data=X_r
# The last column is the label.
label=train[:,-1]
#print label
x_train,x_test,y_train,y_test=cross_validation.train_test_split(data,label,test_size=0.3,random_state=0)
verbose = False
# Ratio of 0-labelled to 1-labelled training samples.
ratio = float(np.count_nonzero(y_train==0)) / float(np.count_nonzero(y_train==1))
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
smox, smoy = smote.fit_transform(x_train, y_train)
# Class counts after resampling.
print np.count_nonzero(smoy==1)
print np.count_nonzero(smoy==0)
clf=svm.SVC(C=10000,gamma=0.0078125)
#print y_train.astype(int)
clf.fit(smox,smoy)
y_pred=clf.predict(x_test)
print y_test
print y_pred
confusion=confusion_matrix(y_test,y_pred)
print confusion
# Cross-validation here uses the original (not oversampled) training split.
score = cross_val_score(clf, x_train, y_train)
print score.mean()
print score.std()
# Script fragment: oversample repeatedly with SMOTE, drop the rows a first
# cross-validated random forest confuses (true 7, predicted 2), then report
# accuracy of a second cross-validated run.
# NOTE(review): the original indentation was lost; the `for` loop is read as
# containing only the fit_transform call -- confirm against the file.
def f6(x):
    # Map label 7 to 2 and shift every label above 7 down by one.
    if x == 7:
        return 2
    elif x > 7:
        return x - 1
    else:
        return x
# The relabelling above is currently disabled.
#y = map(f6, y)
y = numpy.array(y)
sm = SMOTE(kind='regular')
# Twenty successive SMOTE passes, each applied to the previous pass's output.
for i in xrange(20):
    x_metrics, y = sm.fit_transform(x_metrics, y)
clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
#print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)
# Rows whose true label is 7 but which were predicted as 2.
delete_rows_indexes = [i for i, y_i in enumerate(pr) if y_i == 2 and y[i] == 7]
x_metrics = numpy.delete(x_metrics, delete_rows_indexes, axis=0)
y = numpy.delete(y, delete_rows_indexes, axis=0)
#clf.fit(x_metrics,y)
#joblib.dump(clf, 'rand_forest_model_3.pkl')
# Second pass on the filtered data.
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)
# Notebook export: standardise the features, oversample with SMOTE, split,
# and fit a linear SVC.
from sklearn.preprocessing import StandardScaler

# Feature columns are taken with df.ix[:, 1:10]; the target is "Class".
X = df.ix[:, 1:10].values
y = df["Class"]


# In[467]:

xn = StandardScaler().fit_transform(X)
Xn = pd.DataFrame(xn, columns=df.ix[:, 1:10].columns)
OS = SMOTE(ratio=0.85, verbose=True)
osx, osy = OS.fit_transform(Xn.values, y)


# In[468]:

# NOTE(review): the split is taken after oversampling, so synthetic samples
# can land in the test part -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(osx, osy, test_size=0.2, random_state=1)


# In[469]:

from sklearn.svm import SVC


# In[470]:

# NOTE(review): the model is fitted on the full oversampled set (osx, osy),
# not on X_train / y_train -- confirm this is intended.
modelOS = SVC(kernel="linear", C=1).fit(osx, osy)
def fit(self, data, labels):
    """
    Train every classifier in the ensemble and then the meta-classifier.

    Training (fitting) the meta-classifier requires training each individual
    classifier in the ensemble and using testing data that has not been used
    in the classifiers' training set to train the meta-classifier. To do this,
    we use 10-fold Stratified Cross-Validation to produce a training set for
    the meta-classifier equal to the one provided: every sample gets one
    out-of-fold prediction per classifier.

    When ``self.useSMOTE`` is set, borderline-1 SMOTE balances the training
    part of each fold, the full training set used for the final refit of each
    classifier, and finally the fusion data itself (which also replaces
    ``self.labels_train`` with the oversampled labels).

    Arguments
    ---------
        data:       array-like (N,d) with data to be trained on
        labels:     array-like with N labels for training data; summed to
                    count positives, so they are treated as 0/1

    Fixes relative to the previous version
    --------------------------------------
    * Out-of-fold predictions were concatenated in fold order, which is not
      the sample order for stratified folds, so rows of the fusion data did
      not line up with ``labels_train``. They are now written back to each
      sample's own position.
    * The full-data refit only had its training data defined when SMOTE was
      enabled; it now falls back to the unmodified training set.
    """
    from unbalanced_dataset import SMOTE
    from sklearn.cross_validation import StratifiedKFold

    def balance(x, y):
        # Borderline-1 SMOTE at the negatives-to-positives ratio of `y`.
        bugs = sum(y)
        ratio = float(len(y) - bugs) / bugs
        smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
        return smote.fit_transform(x, y)

    self.data_train = np.copy(data)
    self.labels_train = np.copy(labels)

    # training data for metaclassifier (results of each classifier in ensemble)
    self.fusion_data = pd.DataFrame()  # (fusion_labels = labels_train!)

    skf = StratifiedKFold(self.labels_train, n_folds=10)
    sets = [{'train': train, 'test': test} for train, test in skf]
    # Original row position of every out-of-fold prediction, in fold order.
    test_order = np.concatenate([fold['test'] for fold in sets])

    for count, clf in enumerate(self.ensemble):
        fold_predictions = []
        for fold in sets:
            # separate training/testing set for fold, use SMOTE if asked to
            data_train_fold = self.data_train[fold['train']]
            labels_train_fold = self.labels_train[fold['train']]
            if self.useSMOTE:
                data_train_fold, labels_train_fold = balance(
                    data_train_fold, labels_train_fold)

            # fit the classifier with the training data of current fold
            clf.fit(data_train_fold, labels_train_fold)

            # make a prediction with the testing data of current fold
            data_test_fold = self.data_train[fold['test']]
            fold_predictions.append(clf.predict(data_test_fold))

        # Put each prediction back at its sample's position so the column
        # lines up row-for-row with labels_train.
        stacked = np.concatenate(fold_predictions)
        hypothesis = np.empty_like(stacked)
        hypothesis[test_order] = stacked

        # re-train the model using the entire available data (better performance)
        data_train_clf, labels_train_clf = self.data_train, self.labels_train
        if self.useSMOTE:
            data_train_clf, labels_train_clf = balance(
                self.data_train, self.labels_train)
        clf.fit(data_train_clf, labels_train_clf)

        # new column of metaclassifier training data (this classifier's hypothesis)
        self.fusion_data['classifier_' + str(count)] = hypothesis

    # perform smote on the fusion data to even out the classes
    if self.useSMOTE:
        columns = list(self.fusion_data)
        fused, self.labels_train = balance(
            self.fusion_data.as_matrix(), self.labels_train)
        self.fusion_data = pd.DataFrame(fused, columns=columns)

    # train the aggregator using the fusion set created earlier
    self.aggregator.fit(self.fusion_data.as_matrix(), self.labels_train)
    return
header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8]) y = y['violation'].values # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0) main_x = X.values main_y = y verbose = False ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0)) # 'SMOTE bordeline 2' bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2') x, y = bsmote2.fit_transform(main_x, main_y) ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0)) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import cross_val_score clf = RandomForestClassifier(n_estimators=10) scores = cross_val_score(clf, X_test, y_test) y_pred = clf.fit(X_train, y_train).predict(X_test) y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
#Ratio output #print(ratio) #2.33 #Set verbose as false to show less information verbose = False #Create SMOTE object #smote = SMOTE(ratio = ratio, verbose = False, kind = 'regular') #Don't use #Another way - leave this way smote = SMOTE(ratio = 1.335, verbose = False, kind = 'regular') #Fit data and transform X_mod = X.as_matrix() Y_mod = np.array(Y) #Create new dataset smox, smoy = smote.fit_transform(X_mod, Y_mod) #Check ratio of good and bad creditors #Convert matrix to dataframe y_data = pd.DataFrame(smoy, columns = ['classification']) #check work y_data['classification'].value_counts() #New visualizations #Convert matrix to dataframe to plot numeric columns #Create list of column names col_names = ['Status checking_A11', 'Status checking_A12', 'Status checking_A13', 'Status checking_A14', 'Credit history_A30', 'Credit history_A31', 'Credit history_A32', 'Credit history_A33', 'Credit history_A34', 'Purpose_A40', 'Purpose_A41', 'Purpose_A410', 'Purpose_A42',
def smote_boderline1(self):
    """Oversample ``self.x`` / ``self.y`` with borderline-1 SMOTE.

    Returns the resampled ``(x, y)`` pair.
    """
    sampler = SMOTE(kind='borderline1', ratio=self._ratio,
                    verbose=self.verbose)
    resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
    return resampled_x, resampled_y
header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8]) y = y['violation'].values # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0) main_x = X.values main_y = y verbose = False ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0)) # 'SMOTE' smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular') x, y = smote.fit_transform(main_x, main_y) ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0)) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import cross_val_score clf = RandomForestClassifier(n_estimators=10) scores = cross_val_score(clf, X_test, y_test) y_pred = clf.fit(X_train, y_train).predict(X_test) y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
# Script fragment: oversample repeatedly with SMOTE, drop the rows a first
# cross-validated random forest confuses (true 7, predicted 2), then run the
# cross-validated prediction again on the filtered data.
# NOTE(review): the original indentation was lost; the `for` loop is read as
# containing only the fit_transform call -- confirm against the file.
def f6(x):
    # Map label 7 to 2 and shift every label above 7 down by one.
    if x == 7:
        return 2
    elif x > 7:
        return x - 1
    else:
        return x
# The relabelling above is currently disabled.
#y = map(f6, y)
y = numpy.array(y)
sm = SMOTE(kind='regular')
# Twenty successive SMOTE passes, each applied to the previous pass's output.
for i in xrange(20):
    x_metrics, y = sm.fit_transform(x_metrics, y)
clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
#print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)
# Rows whose true label is 7 but which were predicted as 2.
delete_rows_indexes = [i for i, y_i in enumerate(pr) if y_i == 2 and y[i] == 7]
x_metrics = numpy.delete(x_metrics, delete_rows_indexes, axis=0)
y = numpy.delete(y, delete_rows_indexes, axis=0)
#clf.fit(x_metrics,y)
#joblib.dump(clf, 'rand_forest_model_3.pkl')
# Second pass on the filtered data.
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)