Code Example #1
File: test_incremental.py  Project: laprej/dask-ml
def test_incremental_basic(scheduler, xy_classification):
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        est1 = SGDClassifier(random_state=0, tol=1e-3)
        est2 = clone(est1)

        clf = Incremental(est1)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        assert result is clf

        assert isinstance(result.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(result.estimator.coef_,
                                             est2.coef_)

        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        assert_eq(result, expected)

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        # assert isinstance(result, da.Array)
        assert_eq(result, expected)

        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        clf.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])
Code Example #2
def test_incremental_basic(scheduler, dataframes):
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2
    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)

    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        assert result is clf

        # est2 is a sklearn optimizer; this is just a benchmark
        if dataframes:
            X = X.to_dask_array(lengths=True)
            y = y.to_dask_array(lengths=True)

        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_].compute(),
                             y[slice_[0]].compute(),
                             classes=[0, 1])

        assert isinstance(result.estimator_.coef_, np.ndarray)
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9

        assert set(dir(clf.estimator_)) == set(dir(est2))

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        if dataframes:
            # Compute is needed because chunk sizes of this array are unknown
            result = result.compute()
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.3

        # score
        result = clf.score(X, y)
        expected = est2.score(*dask.compute(X, y))
        assert abs(result - expected) < 0.1

        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
Code Example #3
def run_on_blobs():
    x, y = dask_ml.datasets.make_blobs(n_samples=100_000_000,
                                       chunks=100_000,
                                       random_state=0,
                                       centers=3)

    x = dd.from_array(x)
    y = dd.from_array(y)

    print(f"Rows: {x.shape[0].compute()}")

    ests_per_chunk = 4
    chunks = len(x.divisions)

    srfc = Incremental(StreamingRFC(n_estimators_per_chunk=ests_per_chunk,
                                    max_n_estimators=np.inf,
                                    verbose=1,
                                    n_jobs=4))
    srfc.fit(x, y,
             classes=y.unique().compute())
Code Example #4
def test_incremental_basic(scheduler):
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2

    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        assert result is clf

        assert isinstance(result.estimator_.coef_, np.ndarray)
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9

        assert set(dir(clf.estimator_)) == set(dir(est2))

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.2

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        assert abs(result - expected) < 0.1

        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
Code Example #5
# Fit the OneHotEncoder created earlier in the script on the training data
encPD = enc_xtrainPD.fit(x_trainPD)

# Encode the X data with the fitted OneHotEncoder
x_trainPD = encPD.transform(x_trainPD)
x_testPD = encPD.transform(x_testPD)

## Create a StandardScaler instance and fit it on the training data for the MLP
scalerPD = StandardScaler(with_mean=False)
scalerPD.fit(x_trainPD)

# Scale the data using the fitted scaler
x_trainPDscaled = scalerPD.transform(x_trainPD)
x_testPDscaled = scalerPD.transform(x_testPD)

##### MLP Model
learnermlpPD.fit(x_trainPDscaled, y_trainPD, classes=numpy.unique(y_trainPD))
print('PD done training model')
#result = mlpPD.predict([[1,1,1,1]])
#prob_results = mlpPD.predict_proba([[1,1,1,1]])

##### Random Forest Model
##learnerrfPD.fit(x_trainPDscaled,y_trainPD, classes=numpy.unique(y_trainPD))
##
##forest_result = model.predict([[1,1,1,1]])
##forest_prob_result = model.predict_proba([[1,1,1,1]])

##### Testing
predictions_mlpPD = learnermlpPD.predict(x_testPDscaled)
##predictions_forestPD = learnerrfPD.predict(x_testPDscaled)
print('PD done predicting')
Code Example #6
def test_scoring(scheduler, xy_classification, scoring=dask_ml.metrics.accuracy_score):
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        with pytest.raises(ValueError, match="metric function rather than a scorer"):
            clf.fit(X, y, classes=np.unique(y))
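Example #6 checks that passing a raw metric function as scoring is rejected; the accepted form is a scorer, such as the string name 'accuracy' used in Example #8 below. A minimal sketch of that accepted usage (not taken from the original test suite, reusing the same fixture names):

# Sketch only: a scorer name such as 'accuracy' is accepted, unlike a bare
# metric function; X and y are assumed to be the same fixture data as above.
clf = Incremental(SGDClassifier(tol=1e-3), scoring='accuracy')
clf.fit(X, y, classes=np.unique(y))
print(clf.score(X, y))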
Code Example #7
print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
##### OUTPUT --------> Logistic Regression Score :  0.70025

#####################################################################################

# Fitting the Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental

nb = BernoulliNB()

parallel_nb = Incremental(nb)

with ProgressBar():
    parallel_nb.fit(X_train, y_train, classes=np.unique(y_train.compute()))

print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))
##### OUTPUT --------> Naive Bayes Classifier Score :  0.65

######################################################################################

# Performing GridSearch on the Logistic Regression Classifier
from dask_ml.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}

lr = LogisticRegression()

tuned_lr = GridSearchCV(lr, parameters)
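
The snippet above stops after constructing the grid search. A hypothetical continuation (not part of the original script) could fit and inspect it; it assumes the X_train / y_train collections from earlier in the script and passes solver='liblinear' so that both the 'l1' and 'l2' penalties in the grid are valid:

# Hypothetical continuation: fit the grid search and report the best candidate.
# Assumes X_train / y_train from earlier in the script.
lr = LogisticRegression(solver='liblinear')
tuned_lr = GridSearchCV(lr, parameters)

with ProgressBar():
    tuned_lr.fit(X_train, y_train)

print('Best parameters : ', tuned_lr.best_params_)
print('Best CV score   : ', tuned_lr.best_score_)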
Code Example #8
def main():
    t0 = time.time()

    basepath = "/home/eline/OneDrive/__NiiFormat1"  # Path to the patient folders.
    patientPaths, patientIDs = getData.GetPatients(basepath)
    patientIDs = np.array(patientIDs)

    # Choose which scans to include.
    t2 = ["T2"]
    dwi = [
        "DWI_b00", "DWI_b01", "DWI_b02", "DWI_b03", "DWI_b04", "DWI_b05",
        "DWI_b06"
    ]
    ffe = []
    t1t2sense = []

    scantypes = [t2, dwi, ffe, t1t2sense]
    scans = []
    for scantype in scantypes:
        if scantype:
            scans.append(scantype)

    # Choose the mask/ground truth.
    maskchoice = "union"  # an, shh, intersection or union

    # Creating dictionaries to store patient image data and the masks.
    dataDict, groundTruthDict, imsizes = buildData.buildDataset(
        patientPaths, patientIDs, scans, maskchoice)

    # Choose cross-validator.
    crossvalidator = options.select_cross_validator(
        "leave-One-Out")  # K-fold or leave-One-Out

    loadtime = time.time()

    zeroIndex = {}

    dice = []

    # Train model.
    for train_index, test_index in crossvalidator.split(patientIDs):
        # First splitting the data and building dask arrays.
        trainingX, trainingY = buildData.get_data_for_training(
            dataDict, groundTruthDict, patientIDs[train_index])
        testX, testY = buildData.get_data_for_test(dataDict, groundTruthDict,
                                                   patientIDs[test_index],
                                                   zeroIndex)

        # Using incremental (out-of-core) learning because of the large amount of data.
        # The estimator has to implement the partial_fit API.
        estimator = sklearn.linear_model.SGDClassifier()
        clf = Incremental(estimator, scoring='accuracy')
        clf.fit(trainingX, trainingY, classes=[True, False])
        data = clf.predict(testX)

        # Per patient predictions.
        index = 0
        for patientID in patientIDs[test_index]:
            # Get the voxels belonging to the patient.
            size = len(groundTruthDict[patientID])
            # compute() is needed to access the values in a Dask array.
            pred = data[index:index + size].compute()
            truth = testY[index:index + size].compute()

            # Set rows which contained at least one zero as background (0).
            for element in zeroIndex[patientID]:
                pred[element] = 0

            # Remove small areas/volumes from the predicted mask.
            pred = processResults.remove_small_areas2D(pred,
                                                       imsizes[patientID])
            #pred = processResults.remove_small_areas3D(pred, imsizes[patientID])

            # Calculate the confusion matrix.
            confusionMatrix = confusion_matrix(truth, pred)

            # Calculate the DICE score.
            diceScore = processResults.calculate_dice(confusionMatrix)
            dice.append([patientID, diceScore])

            # Save prediction as nifti file.
            filename = 'predict' + patientID + '.nii'
            predimage = processResults.array_to_image(pred, imsizes[patientID])
            sitk.WriteImage(predimage, filename)

            # Increase index to the starting index of the next patient.
            index += size

    t1 = time.time()
    print('loadtime: ' + str(loadtime - t0))
    print('traintime: ' + str(t1 - loadtime))
    print('runtime: ' + str(t1 - t0))

    # Save the DICE scores in a text file.
    processResults.save_dice_scores(dice, "diceScores")

    # Calculate the mean Dice score (DSC).
    mean_dice = sum(score for _, score in dice) / len(dice)
    print(mean_dice)