def undersampling(x, y, ratio=15, xlab=None, xtest=None, ytest=None,
                  ylab=None, lab=None, prefix='', stype="under"):
    """Under-sample ``(x, y)``, run the tuning helpers and optionally evaluate.

    The last column of ``x`` is not treated as a feature: it is stripped
    before training/evaluation and, when ``xlab`` is not supplied, its
    under-sampled values are used as ``xlab``.

    :param x: 2-D array; every column but the last is used as a feature.
    :param y: Targets passed to the sampler together with ``x``.
    :param ratio: Forwarded to ``UnderSampler(ratio=...)``.
    :param xlab: Per-row values handed to the tuning helpers; when ``None``
        (or empty) the last column of the under-sampled ``x`` is used.
    :param xtest: Test features; evaluation is skipped when ``None``.
    :param ytest: Test targets passed to ``test_and_print``.
    :param ylab: Third argument of ``test_and_print`` for the test set.
    :param lab: Third argument of ``test_and_print`` for the full input set.
    :param prefix: Prepended to the ``title`` of every tuning call.
    :param stype: Unused; kept so existing callers keep working.
    :return: The ``Classifier`` instance (trained only when ``xtest`` is given).
    """
    # 'Random under-sampling'
    parameters = {'random_state': 12345, 'max_features': None,
                  'oob_score': True}  # , 'class_weight': 'balanced'}
    c = Classifier('rf', parameters, False)
    xtrain = x[:, :-1]

    # undersampler
    US = UnderSampler(ratio=ratio, verbose=False, random_state=12345)
    usx, usy = US.fit_transform(x, y)

    # ``not xlab`` raises ValueError for a NumPy array with more than one
    # element, so test for "missing" explicitly.
    if xlab is None or len(xlab) == 0:
        xlab = usx[:, -1]
    # NOTE(review): a caller-supplied xlab is not re-sampled here, so it must
    # already line up with the under-sampled rows - confirm with callers.
    usx = usx[:, :-1]

    seuil = threshold_tuning(usx, usy, xlab, title=prefix + 'undersample')
    # The remaining tuning calls are kept only for whatever they emit under
    # their title; their return values were never used.
    threshold_tuning(usx, usy, xlab, metric=roc_auc_score,
                     title=prefix + 'undersample_roc')
    threshold_tuning(usx, usy, xlab, metric=accuracy_score,
                     title=prefix + 'undersample_acc')
    # NOTE(review): the title says "OverSampler" although this is the
    # under-sampling path; left untouched because it may name an output file.
    estimator_tree_tuning(usx, usy, pvalidator=xlab,
                          title=prefix + 'OverSampler_trees')

    if xtest is not None:
        c.train(usx, usy)
        print('====> undersample')
        print("------------Result on yeast")
        test_and_print(xtest, ytest, ylab, c, seuil)
        print("\n------------Result on metazoa")
        test_and_print(xtrain, y, lab, c, seuil)
    return c
def read_subpop_data(one_hot=True, fake_data=False, test_size=0.2,
                     undersample=False):
    """Load the labeled/unlabeled data and wrap them in a ``DataSets`` holder.

    :param one_hot: When true, ``y_train`` and ``y_test`` are passed through
        ``convert_to_one_hot`` before being stored.
    :param fake_data: Unused; kept for interface compatibility.
    :param test_size: Forwarded to ``split_train_test``.
    :param undersample: When true, the training split is resampled with
        ``UnderSampler`` before anything else is done with it.
    :return: Object with ``test`` (a ``DataSet``) and ``train`` (a
        ``SemiDataSet`` built from the unlabeled data plus the training split).
    """
    labeled_dic = convert_txt_to_npy(LABELED_RL_PATH)
    unlabeled_dic = convert_txt_to_npy(UNLABELED_RL_PATH, labeled=False)
    X_train, X_test, y_train, y_test = split_train_test(
        labeled_dic, test_size=test_size)

    class DataSets(object):
        pass

    if undersample:
        # Imported lazily so the dependency is only needed when requested.
        from unbalanced_dataset import UnderSampler
        US = UnderSampler(verbose=True)
        X_train, y_train = US.fit_transform(X_train, y_train)

    # Baseline score printed for reference, computed on the raw (not yet
    # one-hot) labels.
    lda = LDA()
    lda.fit(X_train, y_train)
    predicted = lda.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred).
    score = metrics.accuracy_score(y_test, predicted)
    print("Baseline LDA: %f " % score)

    if one_hot:
        y_train = convert_to_one_hot(y_train)
        y_test = convert_to_one_hot(y_test)

    # Single instantiation; the original created an extra instance earlier
    # that was immediately discarded.
    data_sets = DataSets()
    data_sets.test = DataSet(X_test, y_test)
    data_sets.train = SemiDataSet(unlabeled_dic['data'], X_train, y_train)
    return data_sets
def downsample(d, response, random_state=None, preserve_index=False, verbose=True):
    """
    Downsample data frame

    :param d: Data frame to be downsampled
    :param response: Field within data frame to use for downsampling (must contain
        only two unique values; an error will be thrown otherwise)
    :param random_state: Random state to use for downsampling
    :param preserve_index: Determines whether or not the index associated with
        the given data frame will be reattached to the downsampled result; if true,
        the names of all index fields must be non-null
    :param verbose: Flag indicating whether or not summaries of class frequencies should be printed
    :return: Data frame identical to "d" with some rows removed, and the values in "response"
        occurring with an equal frequency
    :raises AssertionError: If ``preserve_index`` is set and an index name is null, or
        if ``response`` is not a column of the (index-reset) frame
    """
    idx = None

    # If index preservation is requested, store the index field names before resetting it
    # on the input data frame (and make sure none of the names are null)
    if preserve_index:
        assert not np.any(pd.isnull(d.index.names)), \
            'When downsampling with "preserve_index=True", index field names must all be non-null.  ' \
            'At least one name was null for the given index.  Index names given: {}'.format(d.index.names)
        idx = list(d.index.names)
        d = d.reset_index()

    # Capture original data frame types and column names
    dtypes = d.dtypes.to_dict()
    cols = d.columns

    # Ensure that the field to be used for downsampling is present
    assert response in cols, \
        'Given response to use for downsampling "{}" was not found in dataset to be downsampled'.format(response)

    # The sampler is imported and built only after the inputs have been
    # validated, so bad arguments are reported before the optional
    # dependency is touched.
    from unbalanced_dataset import UnderSampler
    sampler = UnderSampler(random_state=random_state, replacement=False, verbose=verbose)

    # Downsample dataset (as numpy arrays)
    ds, _ = sampler.fit_transform(d.values, d[response].values)

    # Re-conform resampled frame to original (add cols + index); the round
    # trip through ``d.values`` loses per-column dtypes, so restore them.
    d = pd.DataFrame(ds, columns=cols)
    for c in d:
        d[c] = d[c].astype(dtypes[c])
    if preserve_index:
        d = d.set_index(idx)

    # Return result
    return d
def test_rest(x, y):
    """Run every resampler once on ``(x, y)``, announcing each by name.

    Each sampler is built with the module-level ``verbose`` flag and its
    ``fit_transform`` output is discarded; nothing is returned.
    """
    # Factories are lambdas so that every sampler is constructed right before
    # it runs, in the listed order.
    stages = (
        ('Random under-sampling', lambda: UnderSampler(verbose=verbose)),
        ('Tomek links', lambda: TomekLinks(verbose=verbose)),
        ('Clustering centroids', lambda: ClusterCentroids(verbose=verbose)),
        ('NearMiss-1', lambda: NearMiss(version=1, verbose=verbose)),
        ('NearMiss-2', lambda: NearMiss(version=2, verbose=verbose)),
        ('NearMiss-3', lambda: NearMiss(version=3, verbose=verbose)),
        ('Neighboorhood Cleaning Rule',
         lambda: NeighbourhoodCleaningRule(verbose=verbose)),
        ('Random over-sampling', lambda: OverSampler(verbose=verbose)),
        ('SMOTE Tomek links', lambda: SMOTETomek(verbose=verbose)),
        ('SMOTE ENN', lambda: SMOTEENN(verbose=verbose)),
        ('EasyEnsemble', lambda: EasyEnsemble(verbose=verbose)),
    )
    for banner, make_sampler in stages:
        print(banner)
        make_sampler().fit_transform(x, y)
# Build the training matrices and report how long it took.
start = time.time()
Xtrain, Ytrain = GetXY(tableTrain)
end = time.time()
# Single-argument print(...) behaves the same on Python 2 and 3; the double
# space reproduces the separator the old print statement inserted.
print("Get Train XY Over:  %s" % (end - start))

# model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=200)
model = GradientBoostingClassifier(n_estimators=300)
# model = AdaBoostClassifier()

# Under-sample the training data before fitting.
start = time.time()
US = UnderSampler(ratio=8.)
# US = ClusterCentroids(ratio=5.)
Xtrain1, Ytrain1 = US.fit_transform(Xtrain, Ytrain)
end = time.time()
print("Data decimation time:  %s" % (end - start))

# Fit on the reduced data and persist the model (the reported time covers
# both the fit and the dump).
start = time.time()
model.fit(Xtrain1, Ytrain1)
joblib.dump(model, modelFilePath + modelFileName)
end = time.time()
print("model train time:  %s" % (end - start))

# print metrics.classification_report(model.predict(Xtrain), Ytrain)
# Score the full (not under-sampled) training set and binarise at 0.4.
pYtrain = model.predict_proba(Xtrain)[:, 1]
# Real lists, not lazy map/zip objects: pYtrain is consumed twice below and
# Yzip needs list.count().
pYtrain = [1 if p > 0.4 else 0 for p in pYtrain]
submitNum = sum(pYtrain)
allPosNum = sum(Ytrain)
Yzip = list(zip(Ytrain, pYtrain))
TPNum = Yzip.count((1, 1))
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from unbalanced_dataset import UnderSampler
from mpl_toolkits.mplot3d import Axes3D

# Command line: <pickled reduced matrix> <dataset file> <output image>.
# NOTE: pickle.load can execute arbitrary code - only pass trusted files.
# The context manager closes the file, which the bare open() call never did.
with open(sys.argv[1], "rb") as reduced_file:
    X_reduced = pickle.load(reduced_file)

fileName = sys.argv[2]
X, Y = datasets.load_data(fileName)

# Generate the new dataset using under-sampling method
verbose = False
# 'Random under-sampling'
# ratio of majority elements to sample with respect to the number of minority cases.
US = UnderSampler(ratio=1., verbose=verbose)
X_reduced, Y = US.fit_transform(X_reduced, Y)

# To get a better understanding of the interaction of the dimensions,
# plot the first three tsne dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y,
           cmap=plt.cm.Paired)
ax.set_title("First three tsne directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])

outFile = sys.argv[3]  # "pic/tsne_3_t"
fig.savefig(outFile)
# Project the data with the PCA object built earlier in the script.
X_reduced = pca.fit_transform(X)

# plt.figure(1, figsize=(4, 3))
# plt.clf()
# plt.axes([.2, .2, .7, .7])
# plt.plot(pca.explained_variance_, linewidth=2)
# plt.axis('tight')
# plt.xlabel('n_components')
# plt.ylabel('explained_variance_')

# Generate the new dataset using under-sampling method
verbose = False
# 'Random under-sampling'
# ratio of majority elements to sample with respect to the number of minority cases.
US = UnderSampler(ratio=1., verbose=verbose)
X_reduced, Y = US.fit_transform(X_reduced, Y)

# Scatter the first three components, coloured by class.
ax.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    X_reduced[:, 2],
    c=Y,
    cmap=plt.cm.Paired,
)
ax.set_title("First three PCA directions")

# Label each axis and hide its tick labels, in x, y, z order.
for apply_label, axis, caption in (
        (ax.set_xlabel, ax.w_xaxis, "1st eigenvector"),
        (ax.set_ylabel, ax.w_yaxis, "2nd eigenvector"),
        (ax.set_zlabel, ax.w_zaxis, "3rd eigenvector"),
):
    apply_label(caption)
    axis.set_ticklabels([])

# outFile3D=sys.argv[2]
# fig.savefig(outFile3D)
def under_sampling(self):
    """Run ``UnderSampler`` over ``self.x`` / ``self.y``.

    The sampler is built with ``self.verbose``. A confirmation line is
    printed once the transform has finished.

    :return: Tuple ``(usx, usy)`` as returned by ``fit_transform``.
    """
    US = UnderSampler(verbose=self.verbose)
    usx, usy = US.fit_transform(self.x, self.y)
    # Call form prints the same text on Python 2 and 3; the old print
    # statement is a SyntaxError on Python 3.
    print("Under Sampling Transformed")
    return usx, usy
'mem_requested', 'disk', 'violation'] tain_path = r'/home/askrey/Dropbox/Project_step_by_step/3_create_database/csvs/frull_db_2.csv' X = pd.read_csv(tain_path, header = None, index_col = False ,names = colnames, skiprows = [0], usecols = [3,4,5,6,7]) y = pd.read_csv(tain_path, header = None, index_col = False ,names = colnames, skiprows = [0], usecols = [8]) y = y['violation'].values # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0) main_x = X.values main_y = y verbose = False # 'Random under-sampling' US = UnderSampler(verbose=verbose) x, y = US.fit_transform(main_x, main_y) ratio = float(np.count_nonzero(y==1)) / float(np.count_nonzero(y==0)) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import cross_val_score clf = RandomForestClassifier(n_estimators=10) scores = cross_val_score(clf, X_test, y_test) y_pred = clf.fit(X_train, y_train).predict(X_test) y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:,1] prediction, bias, contributions = ti.predict(clf, X_test)
# Plot the original data
# One scatter call per class, class 0 first, in the 2-D ``x_vis`` view.
for class_value, class_label, face in (
        (0, "Class #0", 'red'),
        (1, "Class #1", 'blue'),
):
    members = y == class_value
    plt.scatter(
        x_vis[members, 0],
        x_vis[members, 1],
        label=class_label,
        alpha=0.5,
        edgecolor=almost_black,
        facecolor=face,
        linewidth=0.15,
    )

plt.legend()
plt.show()

# Generate the new dataset using under-sampling method
verbose = False

# 'Random under-sampling'
US = UnderSampler(verbose=verbose)
usx, usy = US.fit_transform(x, y)

# 'Tomek links'
TL = TomekLinks(verbose=verbose)
tlx, tly = TL.fit_transform(x, y)

# 'Clustering centroids'
CC = ClusterCentroids(verbose=verbose)
ccx, ccy = CC.fit_transform(x, y)

# 'NearMiss-1'
NM1 = NearMiss(version=1, verbose=verbose)
nm1x, nm1y = NM1.fit_transform(x, y)

# 'NearMiss-2'
NM2 = NearMiss(version=2, verbose=verbose)
nm2x, nm2y = NM2.fit_transform(x, y)

# 'NearMiss-3'
NM3 = NearMiss(version=3, verbose=verbose)
nm3x, nm3y = NM3.fit_transform(x, y)
# Build the training matrices and report how long it took.
start = time.time()
Xtrain, Ytrain = GetXY(tableTrain)
end = time.time()
# Single-argument print(...) behaves the same on Python 2 and 3; the double
# space reproduces the separator the old print statement inserted.
print("Get Train XY Over:  %s" % (end - start))

# model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=200)
model = GradientBoostingClassifier(n_estimators=300)
# model = AdaBoostClassifier()

# Under-sample the training data before fitting.
start = time.time()
US = UnderSampler(ratio=8.)
# US = ClusterCentroids(ratio=5.)
Xtrain1, Ytrain1 = US.fit_transform(Xtrain, Ytrain)
end = time.time()
print("Data decimation time:  %s" % (end - start))

# Fit on the reduced data and persist the model (the reported time covers
# both the fit and the dump).
start = time.time()
model.fit(Xtrain1, Ytrain1)
joblib.dump(model, modelFilePath + modelFileName)
end = time.time()
print("model train time:  %s" % (end - start))

# print metrics.classification_report(model.predict(Xtrain), Ytrain)
# Score the full (not under-sampled) training set and binarise at 0.4.
pYtrain = model.predict_proba(Xtrain)[:, 1]
# Real lists, not lazy map/zip objects: pYtrain is consumed twice below and
# Yzip needs list.count().
pYtrain = [1 if p > 0.4 else 0 for p in pYtrain]
submitNum = sum(pYtrain)
allPosNum = sum(Ytrain)
Yzip = list(zip(Ytrain, pYtrain))
TPNum = Yzip.count((1, 1))