Example #1
def read_subpop_data(one_hot=True, fake_data=False, test_size=0.2, undersample=False):

    labeled_dic = convert_txt_to_npy(LABELED_RL_PATH)
    unlabeled_dic = convert_txt_to_npy(UNLABELED_RL_PATH, labeled=False)
    X_train, X_test, y_train, y_test = split_train_test(labeled_dic, test_size=test_size)

    class DataSets(object):
        pass
    data_sets = DataSets()
    
    if undersample:
        from unbalanced_dataset import UnderSampler 
        US = UnderSampler(verbose=True)
        X_train, y_train = US.fit_transform(X_train, y_train)
        
    lda = LDA()
    lda.fit(X_train, y_train)
    score = metrics.accuracy_score(lda.predict(X_test), y_test)
    print("Baseline LDA: %f " % score)

    if one_hot:
        y_train = convert_to_one_hot(y_train)
        y_test = convert_to_one_hot(y_test)

    data_sets.test = DataSet(X_test, y_test)
    data_sets.train = SemiDataSet(unlabeled_dic['data'], X_train, y_train)

    return data_sets
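
Note: convert_to_one_hot is a project helper that is not shown in this example. A minimal sketch of what it presumably does, assuming integer class labels starting at 0:

import numpy as np

def convert_to_one_hot(labels, num_classes=None):
    # One row per sample, one column per class; a 1 marks the sample's class.
    labels = np.asarray(labels, dtype=np.int64)
    if num_classes is None:
        num_classes = labels.max() + 1
    one_hot = np.zeros((labels.shape[0], num_classes))
    one_hot[np.arange(labels.shape[0]), labels] = 1
    return one_hot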
Example #2
def undersampling(x, y, ratio=15, xlab=None, xtest=None, ytest=None, ylab=None, lab=None, prefix='', stype="under"):
    # 'Random under-sampling'

    parameters = {'random_state': 12345, 'max_features': None,
                  'oob_score': True}  # , 'class_weight': 'balanced'}
    c = Classifier('rf', parameters, False)
    xtrain = x[:, :-1]
    # undersampler
    US = UnderSampler(ratio=ratio, verbose=False, random_state=12345)
    usx, usy = US.fit_transform(x, y)
    if xlab is None:
        xlab = usx[:, -1]
        usx = usx[:, :-1]
    seuil = threshold_tuning(usx, usy, xlab, title=prefix + 'undersample')
    s2 = threshold_tuning(usx, usy, xlab, metric=roc_auc_score,
                          title=prefix + 'undersample_roc')
    s3 = threshold_tuning(usx, usy, xlab, metric=accuracy_score,
                          title=prefix + 'undersample_acc')
    tree = estimator_tree_tuning(
        usx, usy, pvalidator=xlab, title=prefix + 'UnderSampler_trees')
    if xtest is not None:
        c.train(usx, usy)
        print('====> undersample')
        print("------------Result on yeast")
        test_and_print(xtest, ytest, ylab, c, seuil)
        print("\n------------Result on metazoa")
        test_and_print(xtrain, y, lab, c, seuil)
    return c
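
threshold_tuning is a project-specific helper whose implementation is not included; judging by the calls above, it searches for the decision threshold ("seuil") that maximizes a metric on held-out data. A generic sketch of that idea, under assumed names and signature:

import numpy as np
from sklearn.metrics import f1_score

def tune_threshold(clf, X_val, y_val, metric=f1_score):
    # Sweep candidate thresholds over the positive-class probabilities
    # and keep the one that maximizes the metric on validation data.
    probas = clf.predict_proba(X_val)[:, 1]
    thresholds = np.linspace(0.05, 0.95, 19)
    scores = [metric(y_val, (probas >= t).astype(int)) for t in thresholds]
    best = thresholds[int(np.argmax(scores))]
    return best, max(scores)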
Example #3
def split(self, x_data, y_data):
    Xt, Yt, Xv, Yv = super(SMOTESplitter, self).split(x_data, y_data)
    Xt_smote, Yt_smote = SMOTE(**self._smote_params).fit_transform(
        Xt.as_matrix(), Yt.as_matrix())
    Xt_smote, Yt_smote = UnderSampler(
        ratio=self._under_sample).fit_transform(Xt_smote, Yt_smote)
    return Xt_smote, Yt_smote, Xv, Yv
Example #4
def split(self, x_data, y_data):
    Xt, Yt, Xv, Yv = super(OverUnderSplitter, self).split(x_data, y_data)
    Xt_smote, Yt_smote = OverSampler(
        ratio=self._over_sample).fit_transform(Xt.as_matrix(),
                                               Yt.as_matrix())
    Xt_smote, Yt_smote = UnderSampler(
        ratio=self._under_sample).fit_transform(Xt_smote, Yt_smote)
    return Xt_smote, Yt_smote, Xv, Yv
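
In this legacy API, the ratio argument of UnderSampler is the number of majority samples to draw per minority sample (see the comment in Example #10), so ratio=1. yields balanced classes while ratio=8. keeps eight majority samples per minority one. A quick way to sanity-check any of these samplers is to compare class counts before and after:

from collections import Counter
from sklearn.datasets import make_classification
from unbalanced_dataset import UnderSampler

# Synthetic 90/10 imbalanced data, for illustration only.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
print(Counter(y))
usx, usy = UnderSampler(ratio=1., verbose=False).fit_transform(X, y)
print(Counter(usy))  # roughly balanced after under-sampling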
Example #5
def downsample(d, response, random_state=None, preserve_index=False, verbose=True):
    """ Downsample data frame
    :param d: Data frame to be downsampled
    :param response: Field within data frame to use for downsampling (must contain
        only two unique values; an error will be thrown otherwise)
    :param random_state: Random state to use for downsampling
    :param preserve_index: Determines whether or not the index associated with the given
        data frame will be reattached to the downsampled result; if true, the names of all
        index fields must be non-null
    :param verbose: Flag indicating whether or not summaries of class frequencies should be printed
    :return: Data frame identical to "d" with some rows removed, and the values in "response"
        occurring with equal frequency
    """
    from unbalanced_dataset import UnderSampler
    sampler = UnderSampler(random_state=random_state, replacement=False, verbose=verbose)
    idx = None

    # If index preservation is requested, store the index field names before resetting the index
    # on the input data frame (and make sure none of the names are null)
    if preserve_index:
        assert not np.any(pd.isnull(d.index.names)), \
            'When downsampling with "preserve_index=True", index field names must all be non-null.  ' \
            'At least one name was null for the given index.  Index names given: {}'.format(d.index.names)
        idx = list(d.index.names)
        d = d.reset_index()

    # Capture original data frame types and column names
    dtypes = d.dtypes.to_dict()
    cols = d.columns

    # Ensure that the field to be used for downsampling is present
    assert response in cols, \
        'Given response to use for downsampling "{}" was not found in dataset to be downsampled'.format(response)

    # Downsample dataset (as numpy arrays)
    ds, _ = sampler.fit_transform(d.values, d[response].values)

    # Re-conform resampled frame to original (add cols + index)
    d = pd.DataFrame(ds, columns=cols)
    for c in d:
        d[c] = d[c].astype(dtypes[c])
    if preserve_index:
        d = d.set_index(idx)

    # Return result
    return d
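
A hypothetical usage of downsample on a toy frame, assuming numpy and pandas are imported as np and pd:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.random.randn(100),
                   'cls': [0] * 90 + [1] * 10})
balanced = downsample(df, 'cls', random_state=0, verbose=False)
print(balanced['cls'].value_counts())  # 10 rows of each class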
Example #6
def test_rest(x, y, verbose=False):

    print('Random under-sampling')
    US = UnderSampler(verbose=verbose)
    usx, usy = US.fit_transform(x, y)

    print('Tomek links')
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)

    print('Clustering centroids')
    CC = ClusterCentroids(verbose=verbose)
    ccx, ccy = CC.fit_transform(x, y)

    print('NearMiss-1')
    NM1 = NearMiss(version=1, verbose=verbose)
    nm1x, nm1y = NM1.fit_transform(x, y)

    print('NearMiss-2')
    NM2 = NearMiss(version=2, verbose=verbose)
    nm2x, nm2y = NM2.fit_transform(x, y)

    print('NearMiss-3')
    NM3 = NearMiss(version=3, verbose=verbose)
    nm3x, nm3y = NM3.fit_transform(x, y)

    print('Neighbourhood Cleaning Rule')
    NCR = NeighbourhoodCleaningRule(verbose=verbose)
    ncrx, ncry = NCR.fit_transform(x, y)

    print('Random over-sampling')
    OS = OverSampler(verbose=verbose)
    ox, oy = OS.fit_transform(x, y)

    print('SMOTE Tomek links')
    STK = SMOTETomek(verbose=verbose)
    stkx, stky = STK.fit_transform(x, y)

    print('SMOTE ENN')
    SENN = SMOTEENN(verbose=verbose)
    sennx, senny = SENN.fit_transform(x, y)

    print('EasyEnsemble')
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(x, y)
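
All of the samplers above come from the long-unmaintained unbalanced_dataset package, which later became imbalanced-learn; there the fit_transform/verbose interface was replaced by fit_resample. For comparison, the random under-sampling step in today's API would look roughly like:

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

usx, usy = RandomUnderSampler(random_state=0).fit_resample(x, y)
print(Counter(usy))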
Example #7
def _parallel_build_trees(tree, forest, X, y):
    if forest.sampling is None:
        sampler = BootstrapSampler(random_state=tree.random_state)
    elif forest.sampling == 'up':
        sampler = OverSampler(random_state=tree.random_state, verbose=False)
    elif forest.sampling == 'down':
        sampler = UnderSampler(random_state=tree.random_state, verbose=False)
    else:
        raise ValueError('Unknown sampling option: %r' % forest.sampling)

    X_sample, y_sample = sampler.fit_transform(X, y)
    tree.fit(X_sample, y_sample, check_input=False)
    return tree
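
BootstrapSampler is not part of unbalanced_dataset and presumably comes from the surrounding project. A minimal sketch matching the fit_transform interface used above (an assumption, not the original class):

import numpy as np

class BootstrapSampler(object):
    # Plain bootstrap: draw len(X) samples with replacement.
    # Assumes X and y are numpy arrays.
    def __init__(self, random_state=None):
        self.random_state = random_state

    def fit_transform(self, X, y):
        rng = np.random.RandomState(self.random_state)
        idx = rng.randint(0, len(X), size=len(X))
        return X[idx], y[idx]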
Example #8
def apply_sampling(X_data, Y_data, sampling, n_states, maxlen):
    ratio = float(np.count_nonzero(Y_data == 1)) / \
        float(np.count_nonzero(Y_data == 0))
    X_data = np.reshape(X_data, (len(X_data), n_states * maxlen))
    # 'Random over-sampling'
    if sampling == 'OverSampler':
        OS = OverSampler(ratio=ratio, verbose=True)
    # 'Random under-sampling'
    elif sampling == 'UnderSampler':
        OS = UnderSampler(verbose=True)
    # 'Tomek under-sampling'
    elif sampling == 'TomekLinks':
        OS = TomekLinks(verbose=True)
    # Oversampling
    elif sampling == 'SMOTE':
        OS = SMOTE(ratio=1, verbose=True, kind='regular')
    # Oversampling - Undersampling
    elif sampling == 'SMOTETomek':
        OS = SMOTETomek(ratio=ratio, verbose=True)
    # Undersampling
    elif sampling == 'OneSidedSelection':
        OS = OneSidedSelection(verbose=True)
    # Undersampling
    elif sampling == 'CondensedNearestNeighbour':
        OS = CondensedNearestNeighbour(verbose=True)
    # Undersampling
    elif sampling == 'NearMiss':
        OS = NearMiss(version=1, verbose=True)
    # Undersampling
    elif sampling == 'NeighbourhoodCleaningRule':
        OS = NeighbourhoodCleaningRule(verbose=True)
    # ERROR: WRONG SAMPLER, TERMINATE
    else:
        print('Unknown sampling method "%s"; exiting...' % sampling)
        sys.exit()
    # print('shape ' + str(X.shape))
    X_data, Y_data = OS.fit_transform(X_data, Y_data)
    return X_data, Y_data
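
A hypothetical invocation, with illustrative shapes only (the real X_data is whatever the pipeline produces before the reshape):

import numpy as np

X_data = np.random.rand(500, 8, 50)          # 500 sequences, 8 states, length 50
Y_data = np.random.randint(0, 2, size=500)
X_res, Y_res = apply_sampling(X_data, Y_data, 'UnderSampler', n_states=8, maxlen=50)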
Example #9
def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform any kind of sampling(over and under).

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.
    y : array, shape = [n_samples]
        Target.
    method : str, optional, default: None
        Over- or under-sampling method.
    ratio: float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)

    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)

    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)

    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)

    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)

    else:
        raise ValueError('Unknown sampling method: %r' % method)

    return sampler.fit_transform(X, y)
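
Typical usage, assuming X and y are the arrays to resample:

X_res, y_res = _sample_values(X, y, method='random_under_sample')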
Example #10
pca = PCA()
X_reduced = pca.fit_transform(X)

# plt.figure(1, figsize=(4, 3))
# plt.clf()
# plt.axes([.2, .2, .7, .7])
# plt.plot(pca.explained_variance_, linewidth=2)
# plt.axis('tight')
# plt.xlabel('n_components')
# plt.ylabel('explained_variance_')

# Generate the new dataset using under-sampling method
verbose = False
# 'Random under-sampling'
# ratio of majority elements to sample with respect to the number of minority cases.
US = UnderSampler(ratio=1., verbose=verbose)
X_reduced, Y = US.fit_transform(X_reduced, Y)

# The original snippet uses `ax` without creating it; a 3D axis is presumably
# set up first, as in Example #16:
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(X_reduced[:, 0],
           X_reduced[:, 1],
           X_reduced[:, 2],
           c=Y,
           cmap=plt.cm.Paired)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
# outFile3D=sys.argv[2]
Example #11
def under_sampling(self):
    US = UnderSampler(verbose=self.verbose)
    usx, usy = US.fit_transform(self.x, self.y)
    print "Under Sampling Transformed"
    return usx, usy
Example #12
colnames = ['old_index', 'job_id', 'task_idx', 'sched_cls', 'priority', 'cpu_requested',
            'mem_requested', 'disk', 'violation']

tain_path = r'/home/askrey/Dropbox/Project_step_by_step/3_create_database/csvs/frull_db_2.csv'

X = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[3, 4, 5, 6, 7])
y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False

# 'Random under-sampling'
US = UnderSampler(verbose=verbose)
x, y = US.fit_transform(main_x, main_y)

ratio = float(np.count_nonzero(y==1)) / float(np.count_nonzero(y==0))
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)

y_pred = clf.fit(X_train, y_train).predict(X_test)
y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]

prediction, bias, contributions = ti.predict(clf, X_test)
Example #13
while True:
  scores = []
  for train_index, test_index in skf:
    X, X_cv = orig_X[train_index], orig_X[test_index]
    y, y_cv = orig_y[train_index], orig_y[test_index]
    
    # Fraction of majority samples to draw with respect to samples of
    # minority class.
    sampled_X,sampled_y = X,y
    # Oversample data from the minority class.

    if P['is_smote']:
      sampled_X, sampled_y = SMOTE(k=P['k'], m=P['m'], ratio=P['ratio'], verbose=False, kind='regular').fit_transform(sampled_X, sampled_y)
      # Undersample samples from the majority class.
      sampled_X, sampled_y = UnderSampler(1.0).fit_transform(sampled_X, sampled_y)
    
    # Fit a scaler only for the sampled data.
    scaler = Scaler(sampled_X, sampled_y)
    sampled_X = scaler.getOriginalTransformedData()
    #model = RandomForestClassifier(n_estimators=100).fit(sampled_X, sampled_y)
    #model = RandomForestClassifier(n_estimators=P['n_estimators'], criterion=P['criterion'], max_depth=P['max_depth'], min_samples_split=P['min_samples_split'], min_samples_leaf=P['min_samples_leaf'], min_weight_fraction_leaf=P['min_weight_fraction_leaf'], max_features=P['max_features'], max_leaf_nodes=P['max_leaf_nodes'], bootstrap=P['bootstrap'], oob_score=P['oob_score'], n_jobs=8, random_state=None, verbose=0, warm_start=False, class_weight=None).fit(sampled_X, sampled_y)
    model = MLPClassifier(activation=P['activation'], algorithm=P['algorithm'], alpha=P['alpha'], hidden_layer_sizes=P['layer'], learning_rate=P['learning_rate'], tol=P['tol'], random_state=1).fit(sampled_X, sampled_y)
    #model = xgb.XGBClassifier(max_depth=P['max_depth'], n_estimators=P['n_estimators'], learning_rate=P['learning_rate'], nthread=8, subsample=P['subsample'], colsample_bylevel=P['colsample_bylevel']).fit(sampled_X, sampled_y, eval_metric=P['eval_metric'])
    prediction_cv = model.predict_proba(scaler.transform(X_cv))
    auc_score = roc_auc_score(y_cv, prediction_cv[:,1])
    scores.append(auc_score)
    log("***roc_auc_score:%f" % auc_score)
  
  avg = np.average(scores)
  var = np.var(scores)
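
Scaler here is project-specific and not shown; from its use above, it is fitted on the resampled training data and then reused to transform validation data. A minimal sketch consistent with that, assuming sklearn's StandardScaler underneath:

from sklearn.preprocessing import StandardScaler

class Scaler(object):
    def __init__(self, X, y=None):
        # Fit only on the (re)sampled training data.
        self._X = X
        self._scaler = StandardScaler().fit(X)

    def getOriginalTransformedData(self):
        return self._scaler.transform(self._X)

    def transform(self, X):
        return self._scaler.transform(X)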
Example #14
x_vis = pca.fit_transform(x)

# Plot the original data
# Plot the two classes
plt.scatter(x_vis[y==0, 0], x_vis[y==0, 1], label="Class #0", alpha=0.5, 
            edgecolor=almost_black, facecolor='red', linewidth=0.15)
plt.scatter(x_vis[y==1, 0], x_vis[y==1, 1], label="Class #1", alpha=0.5, 
            edgecolor=almost_black, facecolor='blue', linewidth=0.15)

plt.legend()
plt.show()

# Generate the new dataset using under-sampling method
verbose = False
# 'Random under-sampling'
US = UnderSampler(verbose=verbose)
usx, usy = US.fit_transform(x, y)
# 'Tomek links'
TL = TomekLinks(verbose=verbose)
tlx, tly = TL.fit_transform(x, y)
# 'Clustering centroids'
CC = ClusterCentroids(verbose=verbose)
ccx, ccy = CC.fit_transform(x, y)
# 'NearMiss-1'
NM1 = NearMiss(version=1, verbose=verbose)
nm1x, nm1y = NM1.fit_transform(x, y)
# 'NearMiss-2'
NM2 = NearMiss(version=2, verbose=verbose)
nm2x, nm2y = NM2.fit_transform(x, y)
# 'NearMiss-3'
NM3 = NearMiss(version=3, verbose=verbose)
nm3x, nm3y = NM3.fit_transform(x, y)
Example #15
def main(argv):
	X=np.load('numdata/epochFeats.npy')
	Y=np.load('numdata/epochLabels.npy')
	labels= np.load('numdata/LOO.npy')
	print(X.shape,Y.shape)
	X,Y = deleteClass(X,Y,330,2)
	X,Y = deleteClass(X,Y,70,1)



	if sys.argv[1]=='-first':
		print(X.shape, Y.shape, labels.shape)
		folds=10
		#Pipeline stuff 
		forest = RandomForestRegressor(n_estimators=100, n_jobs = -1)
		scaler = preprocessing.StandardScaler()

		lolo = LeaveOneLabelOut(labels)	
		print(lolo,len(lolo))
		acc = 0

		us = UnderSampler(verbose=True)

		#X,Y = us.fit_transform(X,Y)
		kf = KFold(Y.shape[0],n_folds=folds)
		for train_index,test_index in lolo:

			print(len(train_index),len(test_index))
			Xtrain,Xtest = X[train_index], X[test_index]
			ytrain,ytest = Y[train_index], Y[test_index]
			
			forest.fit(Xtrain,ytrain)


			scores = forest.predict(Xtest)
			#acc += tolAcc(ytest,scores)
			
		print(acc/folds)



	# Ensemble Random Forest Regressor stacked with Random Forest Classifier
	elif sys.argv[1]=='-ensemble':
		RF  = []
		outputRF = []
		outRFtest=[]
	
		us = UnderSampler(verbose=True)
		cc = ClusterCentroids(verbose=True)
		#X,Y = cc.fit_transform(X,Y)
		print(X.shape,Y.shape)

		# separating features into categories for Ensemble Training
		activityData = X[:,0:3 ]
		screenData = X[:,3:14]	
		conversationData = X[:,14:20 ]
		colocationData = X[:,20:26]
		audioData = X[:,26:X.shape[1]]

		# Custom Nested Cross-Validation
		# Indexes is used to split the dataset in a 40/40/20 manner
		# NOTE: 30/30/40 seemed to produce very similar results
		indexes = np.array([i for i in range(X.shape[0])])
		np.random.shuffle(indexes)

		lolo = LeaveOneLabelOut(labels)	
	#	print(lolo,len(lolo))
		# separating data to 3 subsets: 
		# 1) Train RF
		# 2) Get RF outputs with which train NN
		# 3) Test NN output on the rest
		train_index = indexes[0: int(0.5*X.shape[0])]
		train_index2 =  indexes[int(0.5*X.shape[0]):int(0.8*X.shape[0])]
		test_index = indexes[int(0.8*X.shape[0]):X.shape[0]]
		print(len(train_index), len(train_index2), len(test_index))
		# Training 5 regressors on 5 types of features
		i=0
		for data in [activityData,screenData,conversationData,colocationData,audioData]:
			RF.append(RandomForestRegressor(n_estimators=300,max_features=data.shape[1],n_jobs=-1))
			RF[i].fit(data[train_index],Y[train_index])
			outputRF.append( RF[i].predict(data[train_index2]) )
			outRFtest.append(RF[i].predict(data[test_index]))
			i += 1

		middleTrainMat = np.transpose(np.array(outputRF))
		testMat = np.transpose(np.array(outRFtest))
	

		# RF classifier to combine regressors
		class_weights = {0: 1, 1: 0.5, 2: 0.1, 3: 0.6, 4: 1}
		print(class_weights)
		rfr= ExtraTreesClassifier(n_estimators=300,class_weight=class_weights,n_jobs=-1)
		rfr.fit(middleTrainMat,Y[train_index2])
		print(middleTrainMat.shape)

		
		pred = rfr.predict(testMat)
		# Print to screen mean error and Tolerance Score
		print(tolAcc(Y[test_index],pred,testMat))
Example #16
import sys
import pickle
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from unbalanced_dataset import UnderSampler
from mpl_toolkits.mplot3d import Axes3D

X_reduced=pickle.load(open(sys.argv[1], "rb"))
fileName = sys.argv[2]
X, Y = datasets.load_data(fileName)

# Generate the new dataset using under-sampling method
verbose = False
# 'Random under-sampling'
# ratio of majority elements to sample with respect to the number of minority cases.
US = UnderSampler(ratio=1.,verbose=verbose)
X_reduced, Y = US.fit_transform(X_reduced, Y)

# To get a better understanding of the interaction of the dimensions,
# plot the first three t-SNE dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y, cmap=plt.cm.Paired)
ax.set_title("First three tsne directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
outFile = sys.argv[3]  # "pic/tsne_3_t"
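
The snippet stops after computing outFile; with the Agg backend selected above, the figure is presumably written to disk rather than shown, e.g.:

plt.savefig(outFile)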
Example #17
    modelFileName = 'GBDT300Dec8M1011UL1F12.pkl'
    print 'modelFileName: ', modelFileName

    start = time.time()
    Xtrain, Ytrain = GetXY(tableTrain)
    end = time.time()
    print "Get Train XY Over: ", end - start

    # model = LogisticRegression()
    # model = RandomForestClassifier(n_estimators=200)
    model = GradientBoostingClassifier(n_estimators=300)
    # model = AdaBoostClassifier()

    start = time.time()

    US = UnderSampler(ratio=8.)
    # US = ClusterCentroids(ratio=5.)
    Xtrain1, Ytrain1 = US.fit_transform(Xtrain, Ytrain)
    end = time.time()
    print "Data decimation time: ", end - start

    start = time.time()
    model.fit(Xtrain1, Ytrain1)
    joblib.dump(model, modelFilePath + modelFileName)
    end = time.time()
    print "model train time: ", end - start
    # print metrics.classification_report(model.predict(Xtrain), Ytrain)
    pYtrain = model.predict_proba(Xtrain)[:, 1]
    pYtrain = map(lambda x: 1 if x > 0.4 else 0, pYtrain)
    submitNum = sum(pYtrain)
    allPosNum = sum(Ytrain)