def split(self, x_data, y_data):
    Xt, Yt, Xv, Yv = super(SMOTESplitter, self).split(x_data, y_data)
    # Rebalance only the training fold: SMOTE over-sampling followed by
    # random under-sampling. The validation fold is left untouched.
    Xt_smote, Yt_smote = SMOTE(**self._smote_params).fit_transform(
        Xt.as_matrix(), Yt.as_matrix())
    Xt_smote, Yt_smote = UnderSampler(
        ratio=self._under_sample).fit_transform(Xt_smote, Yt_smote)
    return Xt_smote, Yt_smote, Xv, Yv
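A hedged usage sketch of the splitter above; only the attributes _smote_params and _under_sample are visible in the method, so the constructor signature here is an illustrative assumption, not taken from the source.

# Hypothetical usage; keyword arguments are assumptions for illustration.
splitter = SMOTESplitter(smote_params={'ratio': 1.0, 'kind': 'regular'},
                         under_sample=1.0)
Xt, Yt, Xv, Yv = splitter.split(x_data, y_data)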
def read_subpop_data(one_hot=True, fake_data=False, test_size=0.2,
                     undersample=False):
    labeled_dic = convert_txt_to_npy(LABELED_RL_PATH)
    unlabeled_dic = convert_txt_to_npy(UNLABELED_RL_PATH, labeled=False)
    X_train, X_test, y_train, y_test = split_train_test(labeled_dic,
                                                        test_size=test_size)

    class DataSets(object):
        pass

    # Optionally balance the training set by random under-sampling.
    if undersample:
        from unbalanced_dataset import UnderSampler
        US = UnderSampler(verbose=True)
        X_train, y_train = US.fit_transform(X_train, y_train)

    # Baseline: LDA accuracy on the held-out test set.
    lda = LDA()
    lda.fit(X_train, y_train)
    score = metrics.accuracy_score(lda.predict(X_test), y_test)
    print("Baseline LDA: %f" % score)

    if one_hot:
        y_train = convert_to_one_hot(y_train)
        y_test = convert_to_one_hot(y_test)

    data_sets = DataSets()
    data_sets.test = DataSet(X_test, y_test)
    data_sets.train = SemiDataSet(unlabeled_dic['data'], X_train, y_train)
    return data_sets
def split(self, x_data, y_data):
    Xt, Yt, Xv, Yv = super(OverUnderSplitter, self).split(x_data, y_data)
    # Rebalance only the training fold: random over-sampling of the minority
    # class followed by random under-sampling of the majority class.
    Xt_res, Yt_res = OverSampler(
        ratio=self._over_sample).fit_transform(Xt.as_matrix(), Yt.as_matrix())
    Xt_res, Yt_res = UnderSampler(
        ratio=self._under_sample).fit_transform(Xt_res, Yt_res)
    return Xt_res, Yt_res, Xv, Yv
def _parallel_build_trees(tree, forest, X, y):
    # Choose the resampling strategy for this tree's training set.
    if forest.sampling is None:
        sampler = BootstrapSampler(random_state=tree.random_state)
    elif forest.sampling == 'up':
        sampler = OverSampler(random_state=tree.random_state, verbose=False)
    elif forest.sampling == 'down':
        sampler = UnderSampler(random_state=tree.random_state, verbose=False)
    else:
        raise ValueError("Unknown sampling strategy: %r" % forest.sampling)
    X_sample, y_sample = sampler.fit_transform(X, y)
    tree.fit(X_sample, y_sample, check_input=False)
    return tree
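A minimal sketch of how this per-tree builder might be dispatched across a forest, assuming forest.estimators_ holds unfitted trees and forest.sampling is one of None, 'up', or 'down'; the fit_forest helper and the joblib wiring are illustrative, not from the source.

# Hypothetical dispatch of _parallel_build_trees over a forest's trees.
from joblib import Parallel, delayed

def fit_forest(forest, X, y, n_jobs=-1):
    # Each tree gets its own resampled view of (X, y), as defined above.
    forest.estimators_ = Parallel(n_jobs=n_jobs)(
        delayed(_parallel_build_trees)(tree, forest, X, y)
        for tree in forest.estimators_)
    return forest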
def apply_sampling(X_data, Y_data, sampling, n_states, maxlen):
    # Ratio of minority (label 1) to majority (label 0) samples.
    ratio = float(np.count_nonzero(Y_data == 1)) / \
        float(np.count_nonzero(Y_data == 0))
    # Flatten each sequence into a single feature vector.
    X_data = np.reshape(X_data, (len(X_data), n_states * maxlen))
    if sampling == 'OverSampler':          # random over-sampling
        sampler = OverSampler(ratio=ratio, verbose=True)
    elif sampling == 'UnderSampler':       # random under-sampling
        sampler = UnderSampler(verbose=True)
    elif sampling == 'TomekLinks':         # Tomek-link under-sampling
        sampler = TomekLinks(verbose=True)
    elif sampling == 'SMOTE':              # synthetic over-sampling
        sampler = SMOTE(ratio=1, verbose=True, kind='regular')
    elif sampling == 'SMOTETomek':         # over- then under-sampling
        sampler = SMOTETomek(ratio=ratio, verbose=True)
    elif sampling == 'OneSidedSelection':  # under-sampling
        sampler = OneSidedSelection(verbose=True)
    elif sampling == 'CondensedNearestNeighbour':  # under-sampling
        sampler = CondensedNearestNeighbour(verbose=True)
    elif sampling == 'NearMiss':           # under-sampling
        sampler = NearMiss(version=1, verbose=True)
    elif sampling == 'NeighbourhoodCleaningRule':  # under-sampling
        sampler = NeighbourhoodCleaningRule(verbose=True)
    else:
        print('Unknown sampling method "%s". Exiting...' % sampling)
        sys.exit()
    X_data, Y_data = sampler.fit_transform(X_data, Y_data)
    return X_data, Y_data
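A hedged usage example for apply_sampling; the array shapes, random labels, and the 'SMOTE' choice below are illustrative assumptions.

# Illustrative call: X_seq is assumed to hold 200 sequences of shape
# (maxlen, n_states); apply_sampling flattens them before resampling.
import numpy as np

n_states, maxlen = 4, 50
X_seq = np.random.rand(200, maxlen, n_states)
Y = np.random.randint(0, 2, size=200)
X_flat, Y_res = apply_sampling(X_seq, Y, 'SMOTE', n_states, maxlen)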
def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform over- or under-sampling.

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.
    y : array, shape = [n_samples]
        Target.
    method : str, optional (default: None)
        Over- or under-sampling method.
    ratio : float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)
    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)
    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)
    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)
    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)
    else:
        raise ValueError('Unknown sampling method: %r' % method)
    return sampler.fit_transform(X, y)
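A hedged example of calling _sample_values on a skewed binary problem; the data and the ratio computation are illustrative guesses, since the exact ratio semantics vary by sampler.

# Illustrative call: class 1 is the minority; this ratio asks the sampler
# to roughly balance the classes (exact semantics depend on the method).
import numpy as np

X = np.random.rand(500, 10)
y = np.array([0] * 450 + [1] * 50)
ratio = float(np.count_nonzero(y == 0) - np.count_nonzero(y == 1)) \
    / np.count_nonzero(y == 1)
X_res, y_res = _sample_values(X, y, method='SMOTE', ratio=ratio, verbose=True)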
pca = PCA()
X_reduced = pca.fit_transform(X)

# Optional scree plot of the explained variance:
# plt.figure(1, figsize=(4, 3))
# plt.clf()
# plt.axes([.2, .2, .7, .7])
# plt.plot(pca.explained_variance_, linewidth=2)
# plt.axis('tight')
# plt.xlabel('n_components')
# plt.ylabel('explained_variance_')

# Rebalance the reduced data by random under-sampling. `ratio` is the
# number of majority samples to draw with respect to the number of
# minority samples, so ratio=1. yields a balanced set.
verbose = False
US = UnderSampler(ratio=1., verbose=verbose)
X_reduced, Y = US.fit_transform(X_reduced, Y)

# Plot the first three PCA directions.
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y,
           cmap=plt.cm.Paired)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
# outFile3D=sys.argv[2]
colnames = ['old_index', 'job_id', 'task_idx', 'sched_cls', 'priority',
            'cpu_requested', 'mem_requested', 'disk', 'violation']
train_path = r'/home/askrey/Dropbox/Project_step_by_step/3_create_database/csvs/frull_db_2.csv'
# Features: sched_cls, priority, cpu_requested, mem_requested, disk.
X = pd.read_csv(train_path, header=None, index_col=False, names=colnames,
                skiprows=[0], usecols=[3, 4, 5, 6, 7])
# Target: violation.
y = pd.read_csv(train_path, header=None, index_col=False, names=colnames,
                skiprows=[0], usecols=[8])
y = y['violation'].values

main_x = X.values
main_y = y

# Random under-sampling to balance the classes before splitting.
verbose = False
US = UnderSampler(verbose=verbose)
x, y = US.fit_transform(main_x, main_y)
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)[:, 1]
prediction, bias, contributions = ti.predict(clf, X_test)
def main(argv):
    X = np.load('numdata/epochFeats.npy')
    Y = np.load('numdata/epochLabels.npy')
    labels = np.load('numdata/LOO.npy')
    print(X.shape, Y.shape)
    # Trim over-represented classes to reduce imbalance.
    X, Y = deleteClass(X, Y, 330, 2)
    X, Y = deleteClass(X, Y, 70, 1)

    if sys.argv[1] == '-first':
        print(X.shape, Y.shape, labels.shape)
        folds = 10
        # Pipeline components.
        forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
        scaler = preprocessing.StandardScaler()
        lolo = LeaveOneLabelOut(labels)
        print(lolo, len(lolo))
        acc = 0
        us = UnderSampler(verbose=True)
        # X, Y = us.fit_transform(X, Y)
        kf = KFold(Y.shape[0], n_folds=folds)
        for train_index, test_index in lolo:
            print(len(train_index), len(test_index))
            Xtrain, Xtest = X[train_index], X[test_index]
            ytrain, ytest = Y[train_index], Y[test_index]
            forest.fit(Xtrain, ytrain)
            scores = forest.predict(Xtest)
            # acc += tolAcc(ytest, scores)
        print(acc / folds)

    # Ensemble: Random Forest regressors stacked with a tree classifier.
    elif sys.argv[1] == '-ensemble':
        RF = []
        outputRF = []
        outRFtest = []
        us = UnderSampler(verbose=True)
        cc = ClusterCentroids(verbose=True)
        # X, Y = cc.fit_transform(X, Y)
        print(X.shape, Y.shape)

        # Split the features into categories for ensemble training.
        activityData = X[:, 0:3]
        screenData = X[:, 3:14]
        conversationData = X[:, 14:20]
        colocationData = X[:, 20:26]
        audioData = X[:, 26:X.shape[1]]

        # Custom nested cross-validation: indexes splits the dataset
        # 50/30/20. NOTE: a 30/30/40 split produced very similar results.
        indexes = np.array([i for i in range(X.shape[0])])
        np.random.shuffle(indexes)
        lolo = LeaveOneLabelOut(labels)
        # print(lolo, len(lolo))

        # Three subsets:
        # 1) train the regressors,
        # 2) collect regressor outputs to train the combiner,
        # 3) test the combiner's output on the rest.
        train_index = indexes[0:int(0.5 * X.shape[0])]
        train_index2 = indexes[int(0.5 * X.shape[0]):int(0.8 * X.shape[0])]
        test_index = indexes[int(0.8 * X.shape[0]):X.shape[0]]
        print(len(train_index), len(train_index2), len(test_index))

        # Train five regressors, one per feature category.
        i = 0
        for data in [activityData, screenData, conversationData,
                     colocationData, audioData]:
            RF.append(RandomForestRegressor(n_estimators=300,
                                            max_features=data.shape[1],
                                            n_jobs=-1))
            RF[i].fit(data[train_index], Y[train_index])
            outputRF.append(RF[i].predict(data[train_index2]))
            outRFtest.append(RF[i].predict(data[test_index]))
            i += 1
        middleTrainMat = np.transpose(np.array(outputRF))
        testMat = np.transpose(np.array(outRFtest))

        # Tree classifier combines the regressor outputs.
        class_weights = {0: 1, 1: 0.5, 2: 0.1, 3: 0.6, 4: 1}
        print(class_weights)
        rfr = ExtraTreesClassifier(n_estimators=300,
                                   class_weight=class_weights, n_jobs=-1)
        rfr.fit(middleTrainMat, Y[train_index2])
        print(middleTrainMat.shape)
        pred = rfr.predict(testMat)
        # Print mean error and tolerance score.
        print(tolAcc(Y[test_index], pred, testMat))
while True:
    scores = []
    for train_index, test_index in skf:
        X, X_cv = orig_X[train_index], orig_X[test_index]
        y, y_cv = orig_y[train_index], orig_y[test_index]
        sampled_X, sampled_y = X, y

        # Over-sample the minority class with SMOTE.
        if P['is_smote']:
            sampled_X, sampled_y = SMOTE(
                k=P['k'], m=P['m'], ratio=P['ratio'], verbose=False,
                kind='regular').fit_transform(sampled_X, sampled_y)
        # Under-sample the majority class; the argument is the fraction of
        # majority samples to draw with respect to the minority class.
        sampled_X, sampled_y = UnderSampler(1.0).fit_transform(sampled_X,
                                                               sampled_y)

        # Fit a scaler on the sampled data only.
        scaler = Scaler(sampled_X, sampled_y)
        sampled_X = scaler.getOriginalTransformedData()

        # Alternative models kept for reference:
        # model = RandomForestClassifier(n_estimators=100).fit(sampled_X, sampled_y)
        # model = RandomForestClassifier(
        #     n_estimators=P['n_estimators'], criterion=P['criterion'],
        #     max_depth=P['max_depth'], min_samples_split=P['min_samples_split'],
        #     min_samples_leaf=P['min_samples_leaf'],
        #     min_weight_fraction_leaf=P['min_weight_fraction_leaf'],
        #     max_features=P['max_features'], max_leaf_nodes=P['max_leaf_nodes'],
        #     bootstrap=P['bootstrap'], oob_score=P['oob_score'], n_jobs=8,
        #     random_state=None, verbose=0, warm_start=False,
        #     class_weight=None).fit(sampled_X, sampled_y)
        # model = xgb.XGBClassifier(
        #     max_depth=P['max_depth'], n_estimators=P['n_estimators'],
        #     learning_rate=P['learning_rate'], nthread=8,
        #     subsample=P['subsample'],
        #     colsample_bylevel=P['colsample_bylevel']).fit(
        #         sampled_X, sampled_y, eval_metric=P['eval_metric'])
        model = MLPClassifier(
            activation=P['activation'], algorithm=P['algorithm'],
            alpha=P['alpha'], hidden_layer_sizes=P['layer'],
            learning_rate=P['learning_rate'], tol=P['tol'],
            random_state=1).fit(sampled_X, sampled_y)

        # Evaluate on the untouched CV fold, scaled with the same scaler.
        prediction_cv = model.predict_proba(scaler.transform(X_cv))
        auc_score = roc_auc_score(y_cv, prediction_cv[:, 1])
        scores.append(auc_score)
        log("***roc_auc_score:%f" % auc_score)
    avg = np.average(scores)
    var = np.var(scores)
modelFileName = 'GBDT300Dec8M1011UL1F12.pkl'
print 'modelFileName: ', modelFileName

start = time.time()
Xtrain, Ytrain = GetXY(tableTrain)
end = time.time()
print "Get Train XY Over: ", end - start

# Alternative models kept for reference:
# model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=200)
# model = AdaBoostClassifier()
model = GradientBoostingClassifier(n_estimators=300)

# Decimate the majority class: draw 8 majority samples per minority sample.
start = time.time()
US = UnderSampler(ratio=8.)
# US = ClusterCentroids(ratio=5.)
Xtrain1, Ytrain1 = US.fit_transform(Xtrain, Ytrain)
end = time.time()
print "Data decimation time: ", end - start

start = time.time()
model.fit(Xtrain1, Ytrain1)
joblib.dump(model, modelFilePath + modelFileName)
end = time.time()
print "model train time: ", end - start

# print metrics.classification_report(model.predict(Xtrain), Ytrain)
# Threshold the positive-class probability at 0.4 on the full training set.
pYtrain = model.predict_proba(Xtrain)[:, 1]
pYtrain = map(lambda x: 1 if x > 0.4 else 0, pYtrain)
submitNum = sum(pYtrain)
allPosNum = sum(Ytrain)
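As a hedged follow-up, precision and recall of the thresholded predictions could be derived from the counts above; truePos is an illustrative helper name, not from the source.

# Illustrative only: precision/recall of the 0.4-threshold predictions,
# reusing pYtrain, Ytrain, submitNum and allPosNum from the snippet above.
truePos = sum(1 for p, t in zip(pYtrain, Ytrain) if p == 1 and t == 1)
precision = float(truePos) / submitNum if submitNum else 0.
recall = float(truePos) / allPosNum if allPosNum else 0.
print "precision: %.3f, recall: %.3f" % (precision, recall)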