def test_incremental_basic(scheduler, xy_classification):
    """Compare ``Incremental.fit`` with feeding each chunk to ``partial_fit``.

    An ``SGDClassifier`` wrapped in ``Incremental`` is fitted on ``X, y``
    while a clone of it receives one ``partial_fit`` call per slice yielded
    by ``da.core.slices_from_chunks``.  The learned coefficients, the other
    estimator attributes, the predictions and the score of both models are
    then compared.
    """
    X, y = xy_classification

    with scheduler() as (s, [a, b]):
        wrapped_sgd = SGDClassifier(random_state=0, tol=1e-3)
        reference_sgd = clone(wrapped_sgd)

        clf = Incremental(wrapped_sgd)
        fitted = clf.fit(X, y, classes=[0, 1])

        # Reference model: trained by hand, one slice of the data at a time.
        for chunk_slice in da.core.slices_from_chunks(X.chunks):
            row_slice = chunk_slice[0]
            reference_sgd.partial_fit(
                X[chunk_slice], y[row_slice], classes=[0, 1]
            )

        # fit() hands back the wrapper itself.
        assert fitted is clf

        learned_coef = fitted.estimator.coef_
        assert isinstance(learned_coef, np.ndarray)
        np.testing.assert_array_almost_equal(learned_coef, reference_sgd.coef_)
        assert_estimator_equal(
            clf.estimator, reference_sgd, exclude=['loss_function_']
        )

        # Predictions come back as a dask array with the reference's values.
        predicted = clf.predict(X)
        assert isinstance(predicted, da.Array)
        assert_eq(predicted, reference_sgd.predict(X))

        # Only the value of the score is compared; the check that it is a
        # dask array is disabled.
        wrapper_score = clf.score(X, y)
        assert_eq(wrapper_score, reference_sgd.score(X, y))

        # Calling partial_fit on a fresh wrapper must yield the same estimator.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        clf.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(
            clf.estimator, reference_sgd, exclude=['loss_function_']
        )
def test_incremental_basic(scheduler, dataframes):
    """Check ``Incremental`` against a hand-fed ``SGDClassifier`` benchmark.

    Synthetic, linearly separable data is generated as dask arrays and, when
    ``dataframes`` is truthy, converted to dask dataframes before fitting.
    The wrapped model is compared with a clone trained through explicit
    ``partial_fit`` calls using loose relative-error thresholds.
    """
    # Create observations that we know linear models can recover.
    n_samples, n_features = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n_samples, n_features), chunks=30)
    coef_star = rng.uniform(size=n_features, chunks=n_features)
    # Map the sign of the linear response from {-1, 1} onto {0, 1}.
    y = (da.sign(X.dot(coef_star)) + 1) / 2

    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)

    with scheduler() as (s, [_, _]):
        wrapped_sgd = SGDClassifier(random_state=0, tol=1e-3, average=True)
        benchmark_sgd = clone(wrapped_sgd)

        clf = Incremental(wrapped_sgd, random_state=0)
        fitted = clf.fit(X, y, classes=[0, 1])
        assert fitted is clf

        # The benchmark is a plain sklearn estimator, so from here on the
        # data is handled as dask arrays with known chunk lengths.
        if dataframes:
            X = X.to_dask_array(lengths=True)
            y = y.to_dask_array(lengths=True)

        for chunk_slice in da.core.slices_from_chunks(X.chunks):
            features = X[chunk_slice].compute()
            targets = y[chunk_slice[0]].compute()
            benchmark_sgd.partial_fit(features, targets, classes=[0, 1])

        assert isinstance(fitted.estimator_.coef_, np.ndarray)
        coef_error = (
            np.linalg.norm(clf.coef_ - benchmark_sgd.coef_)
            / np.linalg.norm(clf.coef_)
        )
        assert coef_error < 0.9

        assert set(dir(clf.estimator_)) == set(dir(benchmark_sgd))

        # Predict
        predicted = clf.predict(X)
        expected_labels = benchmark_sgd.predict(X)
        assert isinstance(predicted, da.Array)
        if dataframes:
            # Compute is needed because chunk sizes of this array are unknown.
            predicted = predicted.compute()
        predict_error = (
            np.linalg.norm(predicted - expected_labels)
            / np.linalg.norm(expected_labels)
        )
        assert predict_error < 0.3

        # Score
        wrapper_score = clf.score(X, y)
        benchmark_score = benchmark_sgd.score(*dask.compute(X, y))
        assert abs(wrapper_score - benchmark_score) < 0.1

        # partial_fit on a fresh wrapper exposes the same attribute names.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3, average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(benchmark_sgd))
def run_on_blobs():
    """Fit a streaming random forest on a large synthetic blobs dataset.

    Generates 100 million samples around 3 centers in chunks of 100 thousand
    rows, converts features and labels to dask dataframes, prints the row
    count and fits a ``StreamingRFC`` wrapped in ``Incremental``.

    Returns:
        None. The fitted wrapper is not returned.
    """
    # Sample and chunk counts are integers, so use int literals rather than
    # the float literals 1e8 / 1e5.
    x, y = dask_ml.datasets.make_blobs(
        n_samples=100_000_000,
        chunks=100_000,
        random_state=0,
        centers=3,
    )

    x = dd.dataframe.from_array(x)
    y = dd.dataframe.from_array(y)
    print(f"Rows: {x.shape[0].compute()}")

    ests_per_chunk = 4

    srfc = Incremental(
        StreamingRFC(
            n_estimators_per_chunk=ests_per_chunk,
            max_n_estimators=np.inf,
            verbose=1,
            n_jobs=4,
        )
    )
    # The class labels are computed eagerly and passed to fit() up front.
    srfc.fit(x, y, classes=y.unique().compute())
def test_incremental_basic(scheduler):
    """Check ``Incremental`` against a hand-fed ``SGDClassifier`` clone.

    Synthetic, linearly separable dask arrays are generated; the wrapped
    model is fitted through ``Incremental.fit`` while its clone receives one
    ``partial_fit`` call per slice of the data.  Coefficients, predictions
    and scores are compared using loose tolerances.
    """
    # Create observations that we know linear models can recover.
    n_samples, n_features = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n_samples, n_features), chunks=30)
    coef_star = rng.uniform(size=n_features, chunks=n_features)
    # Map the sign of the linear response from {-1, 1} onto {0, 1}.
    y = (da.sign(X.dot(coef_star)) + 1) / 2

    with scheduler() as (s, [_, _]):
        wrapped_sgd = SGDClassifier(random_state=0, tol=1e-3, average=True)
        reference_sgd = clone(wrapped_sgd)

        clf = Incremental(wrapped_sgd, random_state=0)
        fitted = clf.fit(X, y, classes=[0, 1])

        # Reference model: trained by hand, one slice of the data at a time.
        for chunk_slice in da.core.slices_from_chunks(X.chunks):
            row_slice = chunk_slice[0]
            reference_sgd.partial_fit(
                X[chunk_slice], y[row_slice], classes=[0, 1]
            )

        assert fitted is clf

        assert isinstance(fitted.estimator_.coef_, np.ndarray)
        coef_error = (
            np.linalg.norm(clf.coef_ - reference_sgd.coef_)
            / np.linalg.norm(clf.coef_)
        )
        assert coef_error < 0.9

        assert set(dir(clf.estimator_)) == set(dir(reference_sgd))

        # Predict
        predicted = clf.predict(X)
        expected_labels = reference_sgd.predict(X)
        assert isinstance(predicted, da.Array)
        predict_error = (
            np.linalg.norm(predicted - expected_labels)
            / np.linalg.norm(expected_labels)
        )
        assert predict_error < 0.2

        # Score
        wrapper_score = clf.score(X, y)
        reference_score = reference_sgd.score(X, y)
        assert abs(wrapper_score - reference_score) < 0.1

        # partial_fit on a fresh wrapper exposes the same attribute names.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3, average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(reference_sgd))
# Fit the one-hot encoder on the training features, then encode both the
# training and the test features with that fitted encoder.
encPD = enc_xtrainPD.fit(x_trainPD)
x_trainPD, x_testPD = [
    encPD.transform(features) for features in (x_trainPD, x_testPD)
]

# Create the scaler used for the MLP input and fit it on the encoded training
# data.  with_mean=False disables centering (presumably because the encoded
# features are sparse -- confirm).
scalerPD = StandardScaler(with_mean=False)
scalerPD.fit(x_trainPD)

# Scale both splits with the fitted scaler.
x_trainPDscaled, x_testPDscaled = [
    scalerPD.transform(features) for features in (x_trainPD, x_testPD)
]

# ---- MLP model ----
learnermlpPD.fit(
    x_trainPDscaled,
    y_trainPD,
    classes=numpy.unique(y_trainPD),
)
print('PD done training model')
# result = mlpPD.predict([[1,1,1,1]])
# prob_results = mlpPD.predict_proba([[1,1,1,1]])

# ---- Random Forest model (disabled) ----
# learnerrfPD.fit(x_trainPDscaled, y_trainPD, classes=numpy.unique(y_trainPD))
#
# forest_result = model.predict([[1,1,1,1]])
# forest_prob_result = model.predict_proba([[1,1,1,1]])

# ---- Testing ----
predictions_mlpPD = learnermlpPD.predict(x_testPDscaled)
# predictions_forestPD = learnerrfPD.predict(x_testPDscaled)
print('PD done predicting')
def test_scoring(scheduler, xy_classification,
                 scoring=dask_ml.metrics.accuracy_score):
    """Fitting with a metric function as ``scoring`` must raise ``ValueError``.

    ``scoring`` defaults to ``dask_ml.metrics.accuracy_score``; the test
    expects ``Incremental.fit`` to reject it with a message containing
    "metric function rather than a scorer".
    """
    X, y = xy_classification
    expected_message = "metric function rather than a scorer"

    with scheduler() as (s, [a, b]):
        base_estimator = SGDClassifier(tol=1e-3)
        clf = Incremental(base_estimator, scoring=scoring)

        with pytest.raises(ValueError, match=expected_message):
            clf.fit(X, y, classes=np.unique(y))
print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
# Recorded output: Logistic Regression Score : 0.70025

# ---------------------------------------------------------------------------
# Fitting the Naive Bayes classifier
# ---------------------------------------------------------------------------
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental

# Wrap the estimator in Incremental before fitting it on the training data.
nb = BernoulliNB()
parallel_nb = Incremental(nb)

with ProgressBar():
    # The class labels are computed eagerly and passed to fit().
    parallel_nb.fit(
        X_train,
        y_train,
        classes=np.unique(y_train.compute()),
    )

print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))
# Recorded output: Naive Bayes Classifier Score : 0.65

# ---------------------------------------------------------------------------
# Performing a grid search on the Logistic Regression classifier
# ---------------------------------------------------------------------------
from dask_ml.model_selection import GridSearchCV

# Candidate hyper-parameters: both penalties combined with three C values.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.5, 1, 2],
}

lr = LogisticRegression()
tuned_lr = GridSearchCV(lr, parameters)
def main():
    """Train and evaluate an incremental SGD classifier with cross-validation.

    Loads the patient scans and the chosen ground-truth masks, then for every
    cross-validation split:

    * fits an ``Incremental``-wrapped ``SGDClassifier`` on the training
      patients,
    * predicts the test patients and, per patient, sets the rows listed in
      ``zeroIndex`` to background, removes small areas, computes the DICE
      score and writes the predicted mask to ``predict<patientID>.nii``.

    Afterwards it prints timing information, saves the DICE scores through
    ``processResults.save_dice_scores`` and prints their mean, or a notice
    when no scores were collected.
    """
    t0 = time.time()

    basepath = "/home/eline/OneDrive/__NiiFormat1"  # Path to the patient folders.
    patientPaths, patientIDs = getData.GetPatients(basepath)
    # An array allows indexing with the index arrays from the cross-validator.
    patientIDs = np.array(patientIDs)

    # Choose which scans to include.
    t2 = ["T2"]
    dwi = [
        "DWI_b00",
        "DWI_b01",
        "DWI_b02",
        "DWI_b03",
        "DWI_b04",
        "DWI_b05",
        "DWI_b06",
    ]
    ffe = []
    t1t2sense = []
    scantypes = [t2, dwi, ffe, t1t2sense]
    # Keep only the scan groups that actually list at least one scan.
    scans = [scan_group for scan_group in scantypes if scan_group]

    # Choose the mask/ground truth.
    maskchoice = "union"  # an, shh, intersection or union

    # Create dictionaries to store patient image data and the masks.
    dataDict, groundTruthDict, imsizes = buildData.buildDataset(
        patientPaths, patientIDs, scans, maskchoice)

    # Choose cross-validator.
    crossvalidator = options.select_cross_validator(
        "leave-One-Out")  # K-fold or leave-One-Out

    loadtime = time.time()

    # Passed to get_data_for_test and read back per patient below; presumably
    # filled in by that call -- confirm against buildData.
    zeroIndex = {}
    dice = []

    # Train model.
    for train_index, test_index in crossvalidator.split(patientIDs):
        # First split the data and build dask arrays.
        trainingX, trainingY = buildData.get_data_for_training(
            dataDict, groundTruthDict, patientIDs[train_index])
        testX, testY = buildData.get_data_for_test(
            dataDict, groundTruthDict, patientIDs[test_index], zeroIndex)

        # Use incremental (out-of-core) learning because of the large amount
        # of data.  The estimator has to implement the partial_fit API.
        estimator = sklearn.linear_model.SGDClassifier()
        clf = Incremental(estimator, scoring='accuracy')
        clf.fit(trainingX, trainingY, classes=[True, False])

        data = clf.predict(testX)

        # Per patient predictions.
        index = 0
        for patientID in patientIDs[test_index]:
            # Get the voxels belonging to the patient.
            size = len(groundTruthDict[patientID])
            # compute() is needed to access the values in a Dask array.
            pred = data[index:index + size].compute()
            truth = testY[index:index + size].compute()

            # Set rows which contained at least one zero as background (0).
            for element in zeroIndex[patientID]:
                pred[element] = 0

            # Remove small areas/volumes from the predicted mask.
            pred = processResults.remove_small_areas2D(pred, imsizes[patientID])
            # pred = processResults.remove_small_areas3D(pred, imsizes[patientID])

            # Calculate the confusion matrix and the DICE score from it.
            confusionMatrix = confusion_matrix(truth, pred)
            diceScore = processResults.calculate_dice(confusionMatrix)
            dice.append([patientID, diceScore])

            # Save prediction as nifti file.
            filename = f'predict{patientID}.nii'
            predimage = processResults.array_to_image(pred, imsizes[patientID])
            sitk.WriteImage(predimage, filename)

            # Increase index to the starting index of the next patient.
            index += size

    t1 = time.time()
    print(f'loadtime: {loadtime - t0}')
    print(f'traintime: {t1 - loadtime}')
    print(f'runtime: {t1 - t0}')

    # Save the DICE scores in a text file.
    processResults.save_dice_scores(dice, "diceScores")

    # Calculate the mean DSC value.  The builtin sum() is used instead of a
    # local accumulator named ``sum``, and the empty case no longer divides
    # by zero.
    scores = [score for _, score in dice]
    if scores:
        print(sum(scores) / len(scores))
    else:
        print('No DICE scores were computed.')