Example #1
    def ppf(self, pits, parameters=None):
        self._check_constraints(parameters)

        scalar = np.isscalar(pits)
        if scalar:
            pits = np.array([pits])
        eta, lam = parameters

        a = self.__const_a(parameters)
        b = self.__const_b(parameters)

        cond = pits < (1 - lam) / 2

        # slight speed up for really large problems
        icdf1 = t._ppf(pits[cond] / (1 - lam), eta)
        icdf2 = t._ppf(.5 + (pits[~cond] - (1 - lam) / 2) / (1 + lam), eta)
        icdf = -999.99 * np.ones_like(pits)
        icdf[cond] = icdf1
        icdf[~cond] = icdf2
        icdf = (icdf * (1 + np.sign(pits - (1 - lam) / 2) * lam) *
                (1 - 2 / eta)**.5 - a)
        icdf = icdf / b

        if scalar:
            icdf = icdf[0]
        return icdf
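
For reference, the piecewise transform above is the inverse CDF of Hansen's (1994) skewed Student's t. Below is a minimal standalone sketch of the same computation using only the public scipy API; the closed forms assumed for the standardizing constants a and b follow Hansen's paper, since the private __const_a/__const_b helpers are not shown here.

import numpy as np
from scipy.special import gamma
from scipy.stats import t

def skewt_ppf(pits, eta, lam):
    # Standalone sketch of the method above (constants assumed per Hansen 1994).
    pits = np.atleast_1d(np.asarray(pits, dtype=float))
    c = gamma((eta + 1) / 2) / (np.sqrt(np.pi * (eta - 2)) * gamma(eta / 2))
    a = 4 * lam * c * (eta - 2) / (eta - 1)
    b = np.sqrt(1 + 3 * lam ** 2 - a ** 2)
    cond = pits < (1 - lam) / 2
    # np.where evaluates both branches; out-of-range quantiles produce nan
    # in the branch that is discarded, which is harmless here.
    icdf = np.where(cond,
                    t.ppf(pits / (1 - lam), eta),
                    t.ppf(.5 + (pits - (1 - lam) / 2) / (1 + lam), eta))
    return (icdf * (1 + np.sign(pits - (1 - lam) / 2) * lam)
            * (1 - 2 / eta) ** .5 - a) / b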
Example #2
def mean_confidence_interval(data, confidence=0.95):
    a = 100.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * t._ppf((1+confidence)/2., n-1)
    m = np.round(m, 3)
    h = np.round(h, 3)
    return m, h
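
Nearly every example on this page computes the same quantity: the half-width h = se * t.ppf((1 + confidence) / 2, n - 1) of a two-sided Student-t confidence interval for the mean, i.e. (1 - confidence)/2 tail probability on each side. Note that the snippets call the private t._ppf, which skips argument validation; the public t.ppf returns the same values for valid inputs. A minimal self-contained sketch using only the public API:

import numpy as np
from scipy import stats

def mean_confidence_interval(data, confidence=0.95):
    a = np.asarray(data, dtype=float)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    # two-sided interval: (1 - confidence)/2 probability in each tail
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h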
Example #3
def confidenceinterval(x: List[float], conf_level=0.95):
    """
    获得置信区间
    """
    a = 1.0 * np.array(x)
    n = len(a)
    m, se = np.mean(a), sem(a)
    h = se * t._ppf((1 + conf_level) / 2., n - 1)
    return m - h, m + h
Example #4
def mean_confidence_interval(data, confidence=0.95):
    """
    Given a list or vector of data, this returns the mean and the lower and
    upper bounds of the confidence interval at the specified confidence level
    (default = 95%).
    """
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), sem(a)
    h = se * t._ppf((1+confidence)/2., n-1)
    return m, m-h, m+h
Example #5
def mean_error(config, id_algo, id_inst, seeds):
    x = []
    for kk in range(seeds):
        # shared_times is a global array; entries < 0 are skipped
        val = shared_times[id_algo, id_inst + kk * len(config.instances)]
        if val >= 0.0:
            x.append(val)
    n = len(x)
    if n >= 2:
        se = stats.sem(x)
        h = se * t._ppf((1 + 0.95) / 2., n - 1)
        return (np.mean(x), h, n)
    else:
        return (np.mean(x), -1, n)
Example #6
def mean_error_boxes(config, id_algo, id_inst, seeds):
    x = []
    for kk in range(seeds):
        # shared_boxes is a global array; entries < 0 are skipped
        val = shared_boxes[id_algo, id_inst + kk * len(config.instances)]
        if val >= 0.0:
            x.append(val)
    n = len(x)
    if n >= 2:
        se = stats.sem(x)
        h = se * t._ppf((1 + 0.95) / 2., n - 1)
        return (np.mean(x), h, n)
    else:
        return (np.mean(x), -1, n)
Example #7
def jackknife_bias_correct(pairs,confidence=None,return_all=False,
                           nan_remove=True,return_raw=False):
    '''
    Return jackknife-bias-corrected estimate from estimate-nsamples pairs
    Pairs can be either a list of (estimate, nsamples) tuples or an
    nestimates x 2 array.
    If 'confidence' is between 0 and 1, return the mean with lower and upper
    bounds at -/+ the confidence interval.
    If 'confidence' is None, return the mean and standard error.
    If 'return_all' is True, return the mean, standard error, number of points,
    and confidence interval size.
    '''
    
    data = asarray(pairs)
    
    if nan_remove:
        data = data[isfinite(data)[:,0],:]
    
    y = data[:,0]
    x = 1./data[:,1]
    n = len(x)
    
    # Compute linear regression and standard error of intercept
    (slope,intercept,r,p,slope_se) = linregress(x,y)
    intercept_se = slope_se * sqrt((x ** 2).sum() / n)  # sum of squares; scipy's old ss() was removed
    
    # Return mean and SE if no value is specified:
    if confidence is None:
        if return_all:
            if return_raw:
                n_vals = data[:,1]  # renamed from `np` to avoid shadowing numpy's alias
                max_n = max(n_vals)
                raw_mean = mean(data[n_vals==max_n,0])
                return intercept, intercept_se, n, raw_mean
            else:
                return intercept, intercept_se, n
        else:
            return intercept, intercept_se
    
    # Otherwise return intercept with confidence
    else:
        t_int = t._ppf((1+confidence)/2,n-2)
        intercept_int = t_int * intercept_se
        
        if return_all:
            if return_raw:
                n_vals = data[:,1]  # renamed from `np` to avoid shadowing numpy's alias
                max_n = max(n_vals)
                raw_mean = mean(data[n_vals==max_n,0])
                return intercept, intercept_se, n, intercept_int, raw_mean
            else:
                return intercept, intercept_se, n, intercept_int
        else:
            return intercept, intercept - intercept_int, intercept + intercept_int
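
A hypothetical call, assuming the star imports this snippet relies on (asarray, isfinite, linregress, sqrt, mean from numpy/scipy, plus scipy.stats' t) are in scope. The pairs below are made-up estimates computed at increasing sample sizes; the regression against 1/n extrapolates the estimate to infinite sample size:

# estimate-nsamples pairs; at least 3 are needed for the t interval (df = n - 2)
pairs = [(0.42, 50), (0.45, 100), (0.47, 200), (0.48, 400)]
est, lower, upper = jackknife_bias_correct(pairs, confidence=0.95)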
Example #8
def lowess(x, y, f=1./3., iter=3, confidence=0.95):
    """
    Performs Lowess smoothing

    Code adapted from: https://gist.github.com/agramfort/850437

    lowess(x, y, f=1./3., iter=3, confidence=0.95) -> (yest, lower, upper)

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y with lower and upper confidence bounds.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.

    .. todo:: double check that the confidence bounds are correct
    """
    n = len(x)
    r = int(np.ceil(f*n))
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w**3)**3
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iter):
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights*y), np.sum(weights*y*x)])
            A = np.array([[np.sum(weights), np.sum(weights*x)],
                          [np.sum(weights*x), np.sum(weights*x*x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1]*x[i]

        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta**2)**2

    h = np.zeros(n)
    for x_idx, x_val in enumerate(x):
        r2 = np.array([v*v for i, v in enumerate(residuals) if x[i] == x_val])
        n = len(r2)
        se = sqrt(mean(r2)) / sqrt(len(r2))
        h[x_idx] = se * t._ppf((1+confidence)/2., n-1)

    return yest, yest-h, yest+h
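
One caveat worth flagging next to the .. todo:: above: the confidence band groups residuals over identical x values, so each x must occur more than once or n - 1 is zero and t._ppf returns nan. A hypothetical call with replicated x values, assuming this snippet's imports (numpy as np, scipy's linalg, sqrt/mean, and scipy.stats' t):

import numpy as np
np.random.seed(0)
x = np.repeat(np.linspace(0, 2 * np.pi, 25), 4)  # 4 replicates per x value
y = np.sin(x) + 0.3 * np.random.randn(len(x))
yest, lower, upper = lowess(x, y, f=0.25, iter=3)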
Example #9
def mean_confidence_interval(a, confidence=0.95):
    """
    Helper function for calculating a mean confidence interval

    Athena uses a random forest from Scikit-Learn to select the best parameters. Use this constructor to create a Selection class.

    Parameters
    ----------
    a : ?
        Number of estimators (decision trees) the Random Forest will use.
    confidence : float
        Number of threads the Random Forest will utilize.
    """
    from numpy import mean
    from scipy.stats import sem, t
    n = len(a)
    m, se = mean(a), sem(a)
    h = se * t._ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h
Example #10
def crossvalidateGTR(data,
                     labels,
                     n_neighbors=1,
                     representation="modes",
                     niter=200,
                     k=0,
                     m=0,
                     doPCA=False,
                     n_components=-1,
                     missing=False,
                     missing_strategy='most_frequent',
                     random_state=1234,
                     l=-1,
                     s=-1,
                     n_folds=5,
                     n_repetitions=10):
    print("")
    print("k = sqrt(grid size), m = sqrt(radial basis function grid size), "
          "l = regularization, s = RBF width factor")
    print("")
    if k == 0:
        k = int(math.sqrt(5 * math.sqrt(data.shape[0]))) + 2
    if m == 0:
        m = int(math.sqrt(k))
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(pca.explained_variance_ratio_.cumsum(),
                                       0.8) + 1
        print(
            "Used number of components explaining 80%% of the variance = %s\n"
            % n_components)
    if l < 0.0:
        lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    else:
        lvec = [l]
    if s < 0.0:
        svec = [0.25, 0.5, 1.0, 1.50, 2.0]
    else:
        svec = [s]
    savemean = np.inf  # best (lowest) RMSE so far
    saveh = 0.0
    modelvec = ""
    savemeanr2 = 0.0
    savehr2 = 0.0
    print("k:m:s:l\tRMSE with CI\tR2 with CI\t")
    for s in svec:
        for l in lvec:
            modelstring = str(s) + ":" + str(l)
            rmsevec = []
            r2vec = []
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    prediction = ugtm_predictions.GTR(
                        train=train,
                        labels=labels[train_index],
                        test=test,
                        k=k,
                        m=m,
                        s=s,
                        l=l,
                        n_neighbors=n_neighbors,
                        niter=niter,
                        representation=representation,
                        doPCA=doPCA,
                        n_components=n_components,
                        random_state=random_state,
                        missing=missing,
                        missing_strategy=missing_strategy)
                    y_pred = np.append(y_pred, prediction)
                    y_true = np.append(y_true, labels[test_index])
                rmse = math.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)
                rmsevec = np.append(rmsevec, rmse)
                r2vec = np.append(r2vec, r2)
            mean, se = np.mean(rmsevec), st.sem(rmsevec)
            h = se * t._ppf((1.0 + 0.95) / 2., len(rmsevec) - 1)
            meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
            hr2 = ser2 * t._ppf((1.0 + 0.95) / 2., len(r2vec) - 1)
            if (mean < savemean):
                savemean = mean
                saveh = h
                modelvec = modelstring
                savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
                savehr2 = saveser2 * t._ppf((1 + 0.95) / 2., len(r2vec) - 1)
            print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" %
                  (str(k) + ':' + str(m) + ':' + modelstring, mean, h, meanr2,
                   hr2))
    print('')
    print("########best GTR model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" %
          (str(k) + ':' + str(m) + ':' + modelvec, savemean, saveh, savemeanr2,
           savehr2))
    print("")
Example #11
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * numpy.array(data)
    mean, se = numpy.mean(a), stats.sem(a)
    h = se * t._ppf((1 + confidence) / 2., len(a) - 1)
    return mean, mean - h, mean + h
Example #12
def mean_confidence_interval(x, confidence=0.95):
    a = np.array(x) * 1.0
    mu, se = np.mean(a), scipy.stats.sem(a)
    me = se * t._ppf((1 + confidence) / 2., len(a) - 1)
    return mu, mu - me, mu + me
Example #13
def crossvalidatePCAC(data,
                      labels,
                      doPCA=False,
                      n_components=-1,
                      missing=False,
                      missing_strategy='most_frequent',
                      random_state=1234,
                      n_neighbors=1,
                      n_folds=5,
                      n_repetitions=10,
                      maxneighbours=11):
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(pca.explained_variance_ratio_.cumsum(),
                                       0.8) + 1
        print("Used number of components "
              "explaining 80%% of the variance = %s\n" % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=k_for_kNN\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    if n_neighbors <= 0:
        Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32)
    else:
        Kvec = [n_neighbors]

    savemean = -np.inf  # best (highest) F1 so far
    nummodel = 0
    savemodel = ""
    for c in Kvec:
        nummodel += 1
        modelstring = str(c)
        recallvec = []
        precisionvec = []
        f1vec = []
        recallclassvec = np.array([])
        precisionclassvec = np.array([])
        f1classvec = np.array([])
        meanclass = np.zeros(nClasses)
        meanprecisionclass = np.zeros(nClasses)
        meanf1class = np.zeros(nClasses)
        seclass = np.zeros(nClasses)
        seprecisionclass = np.zeros(nClasses)
        sef1class = np.zeros(nClasses)
        hclass = np.zeros(nClasses)
        hprecisionclass = np.zeros(nClasses)
        hf1class = np.zeros(nClasses)
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(
                    train, test, doPCA, n_components, missing,
                    missing_strategy)
                y_pred = np.append(
                    y_pred,
                    ugtm_predictions.predictNNSimple(processed.train,
                                                     processed.test,
                                                     labels[train_index], c,
                                                     "classification"))
                y_true = np.append(y_true, labels[test_index])
            recall = recall_score(y_true, y_pred, average='weighted')
            precision = precision_score(y_true, y_pred, average='weighted')
            f1 = f1_score(y_true, y_pred, average='weighted')
            recallvec = np.append(recallvec, recall)
            precisionvec = np.append(precisionvec, precision)
            f1vec = np.append(f1vec, f1)
            recallclass = recall_score(y_true, y_pred, average=None)
            precisionclass = precision_score(y_true, y_pred, average=None)
            f1class = f1_score(y_true, y_pred, average=None)
            if (j == 0):
                recallclassvec = recallclass
                precisionclassvec = precisionclass
                f1classvec = f1class
            else:
                recallclassvec = np.vstack([recallclassvec, recallclass])
                precisionclassvec = np.vstack(
                    [precisionclassvec, precisionclass])
                f1classvec = np.vstack([f1classvec, f1class])
        mean, se = np.mean(recallvec), st.sem(recallvec)
        meanprecision, seprecision = np.mean(precisionvec), st.sem(
            precisionvec)
        meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
        h = se * t._ppf((1 + 0.95) / 2., len(recallvec) - 1)
        hprecision = seprecision * t._ppf((1 + 0.95) / 2.,
                                          len(precisionvec) - 1)
        hf1 = sef1 * t._ppf((1 + 0.95) / 2., len(f1vec) - 1)
        if (meanf1 > savemean):
            savemean = meanf1
            savemodel = "Model " + str(nummodel)
        for i in range(0, nClasses):
            meanclass[i] = np.mean(recallclassvec[:, i])
            seclass[i] = st.sem(recallclassvec[:, i])
            meanf1class[i] = np.mean(f1classvec[:, i])
            sef1class[i] = st.sem(f1classvec[:, i])
            meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
            seprecisionclass[i] = st.sem(precisionclassvec[:, i])
            hclass[i] = seclass[i] * \
                t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
            hprecisionclass[i] = seprecisionclass[i] \
                * t._ppf((1+0.95)/2.,
                         len(precisionclassvec[:, i])-1)
            hf1class[i] = sef1class[i] * \
                t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)

        print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" %
              (nummodel, modelstring, mean, h, meanprecision, hprecision,
               meanf1, hf1))
        for i in range(nClasses):
            print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" %
                  (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                   meanprecisionclass[i], hprecisionclass[i], meanf1class[i],
                   hf1class[i]))
        print('')
    print('')
    print("########best nearest neighbors model##########")
    print(savemodel)
    print("")
Example #14
def crossvalidateGTC(data,
                     labels,
                     n_neighbors=1,
                     representation="modes",
                     niter=200,
                     k=0,
                     m=0,
                     doPCA=False,
                     n_components=-1,
                     missing=False,
                     missing_strategy='most_frequent',
                     random_state=1234,
                     predict_mode="bayes",
                     prior="equiprobable",
                     l=-1.0,
                     s=-1.0,
                     n_folds=5,
                     n_repetitions=10):
    print("")
    print("k = sqrt(grid size), m = sqrt(radial basis function grid size), "
          "l = regularization, s = RBF width factor")
    print("")
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: %s" % (nClasses))
    print("")
    print("model\tparameters=k:m:s:l\t"
          "recall with CI\tprecision with CI\tF1-score with CI")
    print("")
    if k == 0:
        k = int(math.sqrt(5 * math.sqrt(data.shape[0]))) + 2
    if m == 0:
        m = int(math.sqrt(k))
    if n_components == -1 and doPCA:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(pca.explained_variance_ratio_.cumsum(),
                                       0.8) + 1
        print("Used number of components explaining 80%% of "
              "the variance in whole data set = %s\n" % n_components)
    if l < 0.0:
        lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    else:
        lvec = [l]
    if s < 0.0:
        svec = [0.25, 0.5, 1.0, 1.50, 2.0]
    else:
        svec = [s]
    savemean = -np.inf  # best (highest) F1 so far
    nummodel = 0
    savemodel = ""
    for s in svec:
        for l in lvec:
            modelstring = str(k) + ':' + str(m) + ":" + str(s) + ":" + str(l)
            nummodel += 1
            recallvec = []
            precisionvec = []
            f1vec = []
            recallclassvec = np.array([])
            precisionclassvec = np.array([])
            f1classvec = np.array([])
            meanclass = np.zeros(nClasses)
            meanprecisionclass = np.zeros(nClasses)
            meanf1class = np.zeros(nClasses)
            seclass = np.zeros(nClasses)
            seprecisionclass = np.zeros(nClasses)
            sef1class = np.zeros(nClasses)
            hclass = np.zeros(nClasses)
            hprecisionclass = np.zeros(nClasses)
            hf1class = np.zeros(nClasses)
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    prediction = ugtm_predictions.GTC(
                        train=train,
                        labels=labels[train_index],
                        test=test,
                        k=k,
                        m=m,
                        s=s,
                        l=l,
                        n_neighbors=n_neighbors,
                        niter=niter,
                        representation=representation,
                        doPCA=doPCA,
                        n_components=n_components,
                        random_state=random_state,
                        missing=missing,
                        missing_strategy=missing_strategy,
                        predict_mode=predict_mode,
                        prior=prior)
                    y_true = np.append(y_true, labels[test_index])
                    y_pred = np.append(y_pred, prediction)
                recall = recall_score(y_true, y_pred, average='weighted')
                precision = precision_score(y_true, y_pred, average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                recallvec = np.append(recallvec, recall)
                precisionvec = np.append(precisionvec, precision)
                f1vec = np.append(f1vec, f1)
                recallclass = recall_score(y_true, y_pred, average=None)
                precisionclass = precision_score(y_true, y_pred, average=None)
                f1class = f1_score(y_true, y_pred, average=None)
                if (j == 0):
                    recallclassvec = recallclass
                    precisionclassvec = precisionclass
                    f1classvec = f1class
                else:
                    recallclassvec = np.vstack([recallclassvec, recallclass])
                    precisionclassvec = np.vstack(
                        [precisionclassvec, precisionclass])
                    f1classvec = np.vstack([f1classvec, f1class])
            mean, se = np.mean(recallvec), st.sem(recallvec)
            meanprecision, seprecision = np.mean(precisionvec), st.sem(
                precisionvec)
            meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
            h = se * t._ppf((1 + 0.95) / 2., len(recallvec) - 1)
            hprecision = seprecision * \
                t._ppf((1+0.95)/2., len(precisionvec)-1)
            hf1 = sef1 * t._ppf((1 + 0.95) / 2., len(f1vec) - 1)
            if (meanf1 > savemean):
                savemean = meanf1
                savemodel = "Model " + str(nummodel)
            for i in range(0, nClasses):
                meanclass[i] = np.mean(recallclassvec[:, i])
                seclass[i] = st.sem(recallclassvec[:, i])
                meanf1class[i] = np.mean(f1classvec[:, i])
                sef1class[i] = st.sem(f1classvec[:, i])
                meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
                seprecisionclass[i] = st.sem(precisionclassvec[:, i])
                hclass[i] = seclass[i] * \
                    t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
                hprecisionclass[i] = seprecisionclass[i] \
                    * t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1)
                hf1class[i] = sef1class[i] * \
                    t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)
            print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" %
                  (nummodel, modelstring, mean, h, meanprecision, hprecision,
                   meanf1, hf1))
            for i in range(nClasses):
                print(
                    "Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                    % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                       meanprecisionclass[i], hprecisionclass[i],
                       meanf1class[i], hf1class[i]))
            print('')

    print('')
    print("########best GTC model##########")
    print(savemodel)
    print("")
Example #15
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), sp.stats.sem(a)
    h = se * t._ppf((1 + confidence) / 2., n - 1)
    return m, h
Example #16
def crossvalidateSVCrbf(data,
                        labels,
                        doPCA=False,
                        n_components=-1,
                        missing=False,
                        missing_strategy='most_frequent',
                        random_state=1234,
                        C=1,
                        gamma=1,
                        n_folds=5,
                        n_repetitions=10):

    if C < 0.0:
        Cvec = np.power(2, np.arange(start=-5, stop=15, step=1,
                                     dtype=float))
    else:
        Cvec = [C]
    if gamma < 0.0:
        gvec = np.power(2.0,
                        np.arange(start=-15, stop=3, step=1, dtype=float))
    else:
        gvec = [gamma]
    modelvec = ""
    savemean = -np.inf  # best (highest) F1 so far
    saveh = 0.0
    nummodel = 0
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(pca.explained_variance_ratio_.cumsum(),
                                       0.8) + 1
        print("Used number of components explaining 80%% "
              "of the variance = %s\n" % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=C:gamma\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    for C in Cvec:
        for g in gvec:
            modelstring = str(C) + "-" + str(g)
            nummodel += 1
            recallvec = []
            precisionvec = []
            f1vec = []
            recallclassvec = np.array([])
            precisionclassvec = np.array([])
            f1classvec = np.array([])
            meanclass = np.zeros(nClasses)
            meanprecisionclass = np.zeros(nClasses)
            meanf1class = np.zeros(nClasses)
            seclass = np.zeros(nClasses)
            seprecisionclass = np.zeros(nClasses)
            sef1class = np.zeros(nClasses)
            hclass = np.zeros(nClasses)
            hprecisionclass = np.zeros(nClasses)
            hf1class = np.zeros(nClasses)
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    processed = ugtm_preprocess.processTrainTest(
                        train, test, doPCA, n_components, missing,
                        missing_strategy)
                    clf = SVC(kernel='rbf', C=C, gamma=g)
                    clf.fit(processed.train, labels[train_index])
                    y_pred = np.append(y_pred, clf.predict(processed.test))
                    y_true = np.append(y_true, labels[test_index])
                recall = recall_score(y_true, y_pred, average='weighted')
                precision = precision_score(y_true, y_pred, average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                recallvec = np.append(recallvec, recall)
                precisionvec = np.append(precisionvec, precision)
                f1vec = np.append(f1vec, f1)
                recallclass = recall_score(y_true, y_pred, average=None)
                precisionclass = precision_score(y_true, y_pred, average=None)
                f1class = f1_score(y_true, y_pred, average=None)
                if (j == 0):
                    recallclassvec = recallclass
                    precisionclassvec = precisionclass
                    f1classvec = f1class
                else:
                    recallclassvec = np.vstack([recallclassvec, recallclass])
                    precisionclassvec = np.vstack(
                        [precisionclassvec, precisionclass])
                    f1classvec = np.vstack([f1classvec, f1class])
            mean, se = np.mean(recallvec), st.sem(recallvec)
            meanprecision, seprecision = np.mean(precisionvec), st.sem(
                precisionvec)
            meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
            h = se * t._ppf((1 + 0.95) / 2., len(recallvec) - 1)
            hprecision = seprecision * \
                t._ppf((1+0.95)/2., len(precisionvec)-1)
            hf1 = sef1 * t._ppf((1 + 0.95) / 2., len(f1vec) - 1)
            if (meanf1 > savemean):
                savemean = meanf1
                saveh = hf1
                modelvec = modelstring
                savemodel = "Model " + str(nummodel)
            for i in range(0, nClasses):
                meanclass[i], seclass[i] = np.mean(recallclassvec[:, i]), \
                    st.sem(recallclassvec[:, i])
                meanf1class[i], sef1class[i] = np.mean(f1classvec[:, i]), \
                    st.sem(f1classvec[:, i])
                meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
                seprecisionclass[i] = st.sem(precisionclassvec[:, i])
                hclass[i] = seclass[i] * \
                    t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
                hprecisionclass[i] = seprecisionclass[i] * \
                    t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1)
                hf1class[i] = sef1class[i] * \
                    t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)
            print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" %
                  (nummodel, modelstring, mean, h, meanprecision, hprecision,
                   meanf1, hf1))
            for i in range(nClasses):
                print(
                    "Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                    % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                       meanprecisionclass[i], hprecisionclass[i],
                       meanf1class[i], hf1class[i]))
            print("")
    print("")

    print("########best RBF SVM model##########")
    print(savemodel)
    print("")
Example #17
def crossvalidateSVR(data,
                     labels,
                     doPCA=False,
                     n_components=-1,
                     missing=False,
                     missing_strategy='most_frequent',
                     random_state=1234,
                     C=-1,
                     epsilon=-1,
                     n_folds=5,
                     n_repetitions=10):
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    if C < 0.0:
        Cvec = np.power(2, np.arange(start=-5, stop=15, step=1,
                                     dtype=float))
    else:
        Cvec = [C]
    if epsilon < 0.0:
        EpsVec = [0, 0.01, 0.1, 0.5, 1, 2, 4]
    else:
        EpsVec = [epsilon]
    modelvec = ""
    savemean = np.inf  # best (lowest) RMSE so far
    saveh = 0.0
    savemeanr2 = 0.0
    savehr2 = 0.0
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(pca.explained_variance_ratio_.cumsum(),
                                       0.8) + 1
        print("Used number of components explaining 80%%"
              "of the variance = %s\n" % n_components)
    print("C:epsilon\tRMSE with CI\tR2 with CI\t")
    for C in Cvec:
        for eps in EpsVec:
            modelstring = str(C) + ":" + str(eps)
            rmsevec = []
            r2vec = []
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    processed = ugtm_preprocess.processTrainTest(
                        train, test, doPCA, n_components, missing,
                        missing_strategy)
                    clf = SVR(kernel='linear', C=C, epsilon=eps)
                    clf.fit(processed.train, labels[train_index])
                    y_pred = np.append(y_pred, clf.predict(processed.test))
                    y_true = np.append(y_true, labels[test_index])
                rmse = math.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)
                rmsevec = np.append(rmsevec, rmse)
                r2vec = np.append(r2vec, r2)
            mean, se = np.mean(rmsevec), st.sem(rmsevec)
            h = se * t._ppf((1 + 0.95) / 2., len(rmsevec) - 1)
            meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
            hr2 = ser2 * t._ppf((1 + 0.95) / 2., len(r2vec) - 1)
            if (mean < savemean):
                savemean = mean
                saveh = h
                modelvec = modelstring
                savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
                savehr2 = saveser2 * t._ppf((1 + 0.95) / 2., len(r2vec) - 1)
            print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" %
                  (modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best linear SVM model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" %
          (modelvec, savemean, saveh, savemeanr2, savehr2))
    print("")
Example #18
def crossvalidatePCAR(data,
                      labels,
                      doPCA=False,
                      n_components=-1,
                      missing=False,
                      missing_strategy='most_frequent',
                      random_state=1234,
                      n_neighbors=1,
                      n_folds=5,
                      n_repetitions=10,
                      maxneighbours=11):
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(pca.explained_variance_ratio_.cumsum(),
                                       0.8) + 1
        print(
            "Used number of components explaining 80%% of the variance = %s\n"
            % n_components)
    print("")
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    if n_neighbors <= 0:
        Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32)
    else:
        Kvec = [n_neighbors]

    modelvec = ""
    savemean = np.inf  # best (lowest) RMSE so far
    saveh = 0.0
    savemeanr2 = 0.0
    savehr2 = 0.0
    nummodel = 0
    print("k = number of nearest neighbours\tRMSE with CI\tR2 with CI\t")
    for c in Kvec:
        nummodel += 1
        modelstring = str(c)
        rmsevec = []
        r2vec = []
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(
                    train, test, doPCA, n_components, missing,
                    missing_strategy)
                y_pred = np.append(
                    y_pred,
                    ugtm_predictions.predictNNSimple(processed.train,
                                                     processed.test,
                                                     labels[train_index], c,
                                                     "regression"))
                y_true = np.append(y_true, labels[test_index])
            rmse = math.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)
            rmsevec = np.append(rmsevec, rmse)
            r2vec = np.append(r2vec, r2)
        mean, se = np.mean(rmsevec), st.sem(rmsevec)
        h = se * t._ppf((1 + 0.95) / 2., len(rmsevec) - 1)
        meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
        hr2 = ser2 * t._ppf((1 + 0.95) / 2., len(r2vec) - 1)
        if (mean < savemean):
            savemean = mean
            saveh = h
            modelvec = modelstring
            savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
            savehr2 = saveser2 * t._ppf((1 + 0.95) / 2., len(r2vec) - 1)
        print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" %
              (modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best nearest neighbors model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" %
          (modelvec, savemean, saveh, savemeanr2, savehr2))
    print("")