def ppf(self, pits, parameters=None):
    self._check_constraints(parameters)
    scalar = np.isscalar(pits)
    if scalar:
        pits = np.array([pits])
    eta, lam = parameters
    a = self.__const_a(parameters)
    b = self.__const_b(parameters)

    cond = pits < (1 - lam) / 2
    # private t._ppf kept here: slight speed up for really large problems
    icdf1 = t._ppf(pits[cond] / (1 - lam), eta)
    icdf2 = t._ppf(.5 + (pits[~cond] - (1 - lam) / 2) / (1 + lam), eta)
    icdf = -999.99 * np.ones_like(pits)
    icdf[cond] = icdf1
    icdf[~cond] = icdf2
    icdf = (icdf * (1 + np.sign(pits - (1 - lam) / 2) * lam)
            * (1 - 2 / eta) ** .5 - a)
    icdf = icdf / b
    if scalar:
        icdf = icdf[0]
    return icdf
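# A minimal standalone sketch (NOT the class method above): in the
# symmetric case lam = 0 this Hansen-style skewed Student-t quantile
# should reduce to a variance-standardized t quantile. This assumes the
# class constants satisfy a = 0 and b = 1 when lam = 0, which holds for
# Hansen's parameterization; both branches of ppf() then collapse.
import numpy as np
from scipy.stats import t


def symmetric_skewt_ppf(pits, eta):
    # equivalent to ppf() above with lam = 0:
    # sqrt(1 - 2/eta) standardizes the variance to one
    return t.ppf(pits, eta) * np.sqrt(1 - 2 / eta)


print(symmetric_skewt_ppf(np.array([0.05, 0.5, 0.95]), eta=5.0))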
def mean_confidence_interval(data, confidence=0.95):
    # Scale the data to percentages before computing the interval
    a = 100.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * t.ppf((1 + confidence) / 2., n - 1)
    m = np.round(m, 3)
    h = np.round(h, 3)
    return m, h
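# All the CI helpers in this section build the half-width as
# sem(a) * t.ppf((1 + confidence) / 2, n - 1). Note that ppf() at the top
# keeps SciPy's private t._ppf for speed; for the standard (loc=0,
# scale=1) distribution the private and public quantile functions agree,
# as a quick check confirms:
import numpy as np
from scipy.stats import t

q, df = 0.975, 29
print(np.isclose(t.ppf(q, df), t._ppf(q, df)))  # True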
def confidenceinterval(x: List[float], conf_level=0.95):
    """Compute the confidence interval for the mean of x."""
    a = 1.0 * np.array(x)
    n = len(a)
    m, se = np.mean(a), sem(a)
    h = se * t.ppf((1 + conf_level) / 2., n - 1)
    return m - h, m + h
def mean_confidence_interval(data, confidence=0.95):
    """
    Given a list or vector of data, return the mean and the lower and
    upper bounds of the confidence interval at the specified confidence
    level (default = 95% confidence interval).
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), sem(a)
    h = se * t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h
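# Hypothetical usage of the helper above on synthetic data (the sample
# values are made up for illustration; assumes numpy and
# `from scipy.stats import sem, t` in scope, as the function body does):
import numpy as np

np.random.seed(0)
sample = np.random.normal(loc=10.0, scale=2.0, size=50)
m, lower, upper = mean_confidence_interval(sample)
print("mean = %.3f, 95%% CI = [%.3f, %.3f]" % (m, lower, upper))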
def mean_error(config, id_algo, id_inst, seeds):
    # Collect the valid (non-negative) times across all seeds
    x = []
    for kk in range(seeds):
        if shared_times[id_algo, id_inst + kk * len(config.instances)] >= 0.0:
            x.append(shared_times[id_algo, id_inst + kk * len(config.instances)])
    n = len(x)
    if n >= 2:
        se = stats.sem(x)
        h = se * t.ppf((1 + 0.95) / 2., n - 1)
        return (np.mean(x), h, n)
    else:
        # Not enough samples for a confidence interval
        return (np.mean(x), -1, n)
def mean_error_boxes(config, id_algo, id_inst, seeds):
    # Collect the valid (non-negative) box counts across all seeds
    x = []
    for kk in range(seeds):
        if shared_boxes[id_algo, id_inst + kk * len(config.instances)] >= 0.0:
            x.append(shared_boxes[id_algo, id_inst + kk * len(config.instances)])
    n = len(x)
    if n >= 2:
        se = stats.sem(x)
        h = se * t.ppf((1 + 0.95) / 2., n - 1)
        return (np.mean(x), h, n)
    else:
        # Not enough samples for a confidence interval
        return (np.mean(x), -1, n)
def jackknife_bias_correct(pairs, confidence=None, return_all=False,
                           nan_remove=True, return_raw=False):
    '''
    Return a jackknife-bias-corrected estimate from (estimate, nsamples)
    pairs.

    Pairs can be either a list of tuples or a 2 x nestimates array.
    If 'confidence' is between 0 and 1, return the mean with lower and
    upper bounds at -/+ the confidence interval. If 'confidence' is None,
    return the mean and standard error. If 'return_all' is True, return
    the mean, standard error, number of points, and confidence interval
    size.
    '''
    data = asarray(pairs)
    if nan_remove:
        data = data[isfinite(data)[:, 0], :]
    y = data[:, 0]
    x = 1. / data[:, 1]
    n = len(x)
    # Compute linear regression and standard error of the intercept
    # (ss = sum of squares, e.g. scipy.stats.ss in older SciPy)
    (slope, intercept, r, p, slope_se) = linregress(x, y)
    intercept_se = slope_se * sqrt(ss(x) / n)
    # Return mean and SE if no confidence level is specified
    if confidence is None:
        if return_all:
            if return_raw:
                # renamed from 'np' to avoid shadowing numpy
                nsamples = data[:, 1]
                max_n = max(nsamples)
                raw_mean = mean(data[nsamples == max_n, 0])
                return intercept, intercept_se, n, raw_mean
            else:
                return intercept, intercept_se, n
        else:
            return intercept, intercept_se
    # Otherwise return the intercept with its confidence interval
    else:
        t_int = t.ppf((1 + confidence) / 2, n - 2)
        intercept_int = t_int * intercept_se
        if return_all:
            if return_raw:
                nsamples = data[:, 1]
                max_n = max(nsamples)
                raw_mean = mean(data[nsamples == max_n, 0])
                return intercept, intercept_se, n, intercept_int, raw_mean
            else:
                return intercept, intercept_se, n, intercept_int
        else:
            return intercept, intercept - intercept_int, intercept + intercept_int
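# The function regresses each estimate against 1/nsamples and
# extrapolates to the intercept at 1/n -> 0. A hypothetical demo with
# estimates biased like mu + c/n (assumes the function's module-level
# dependencies asarray, isfinite, linregress, ss, sqrt, mean, and t are
# in scope):
import numpy as np

np.random.seed(1)
# True value 1.0, bias ~ 5/n, plus a little noise
pairs = [(1.0 + 5.0 / n + 0.01 * np.random.randn(), n)
         for n in (10, 20, 40, 80, 160)]
est, lower, upper = jackknife_bias_correct(pairs, confidence=0.95)
print(est, lower, upper)  # est should land near 1.0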
def lowess(x, y, f=1. / 3., iter=3, confidence=0.95):
    """
    Performs Lowess smoothing.

    Code adapted from: https://gist.github.com/agramfort/850437

    lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: robust locally weighted regression. The lowess
    function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function
    returns the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result
    in a smoother curve. The number of robustifying iterations is given
    by iter. The function will run faster with a smaller number of
    iterations.

    .. todo:: double check that the confidence bounds are correct
    """
    n = len(x)
    r = int(np.ceil(f * n))
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iter):
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    # Confidence bounds: pool squared residuals per distinct x value
    # (reuse h for the CI half-widths)
    h = np.zeros(n)
    for x_idx, x_val in enumerate(x):
        r2 = np.array([v * v for i, v in enumerate(residuals)
                       if x[i] == x_val])
        n_local = len(r2)  # renamed from 'n' to avoid shadowing the sample size
        se = np.sqrt(np.mean(r2)) / np.sqrt(n_local)
        h[x_idx] = se * t.ppf((1 + confidence) / 2., n_local - 1)
    return yest, yest - h, yest + h
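# A hypothetical demo on noisy sine data. The confidence-bound loop pools
# squared residuals per distinct x value, so x needs repeated values; with
# all-distinct x each group has one residual and t.ppf(..., 0) yields NaN
# (possibly what the docstring's todo is about). Assumes numpy as np,
# linalg (e.g. `from scipy import linalg`), and scipy.stats.t in scope:
import numpy as np

np.random.seed(2)
x = np.repeat(np.linspace(0.0, 2.0 * np.pi, 25), 4)  # 4 replicates per x
y = np.sin(x) + 0.3 * np.random.randn(len(x))
yest, lower, upper = lowess(x, y, f=0.25, iter=3)
print(yest[:3], lower[:3], upper[:3])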
def mean_confidence_interval(a, confidence=0.95):
    """
    Helper function for calculating a mean confidence interval.

    Parameters
    ----------
    a : array-like
        Sample of observations.
    confidence : float
        Confidence level for the interval (default 0.95).
    """
    from numpy import mean
    from scipy.stats import sem, t

    n = len(a)
    m, se = mean(a), sem(a)
    h = se * t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h
def crossvalidateGTR(data, labels, n_neighbors=1, representation="modes",
                     niter=200, k=0, m=0, doPCA=False, n_components=-1,
                     missing=False, missing_strategy='most_frequent',
                     random_state=1234, l=-1, s=-1, n_folds=5,
                     n_repetitions=10):
    print("")
    print("k = sqrt(grid size), m = sqrt(radial basis function grid size), "
          "l = regularization, s = RBF width factor")
    print("")
    if k == 0:
        k = int(math.sqrt(5 * math.sqrt(data.shape[0]))) + 2
    if m == 0:
        m = int(math.sqrt(k))
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8) + 1
        print("Used number of components explaining 80%% of the variance"
              " = %s\n" % n_components)
    if l < 0.0:
        lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    else:
        lvec = [l]
    if s < 0.0:
        svec = [0.25, 0.5, 1.0, 1.50, 2.0]
    else:
        svec = [s]
    savemean = 999999999
    saveh = 0.0
    modelvec = ""
    savemeanr2 = 0.0
    savehr2 = 0.0
    print("k:m:s:l\tRMSE with CI\tR2 with CI\t")
    for s in svec:
        for l in lvec:
            modelstring = str(s) + ":" + str(l)
            rmsevec = []
            r2vec = []
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    prediction = ugtm_predictions.GTR(
                        train=train, labels=labels[train_index],
                        test=test, k=k, m=m, s=s, l=l,
                        n_neighbors=n_neighbors, niter=niter,
                        representation=representation,
                        doPCA=doPCA, n_components=n_components,
                        random_state=random_state, missing=missing,
                        missing_strategy=missing_strategy)
                    y_pred = np.append(y_pred, prediction)
                    y_true = np.append(y_true, labels[test_index])
                rmse = math.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)
                rmsevec = np.append(rmsevec, rmse)
                r2vec = np.append(r2vec, r2)
            mean, se = np.mean(rmsevec), st.sem(rmsevec)
            h = se * t.ppf((1.0 + 0.95) / 2., len(rmsevec) - 1)
            meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
            hr2 = ser2 * t.ppf((1.0 + 0.95) / 2., len(r2vec) - 1)
            if mean < savemean:
                savemean = mean
                saveh = h
                modelvec = modelstring
                savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
                savehr2 = saveser2 * t.ppf((1 + 0.95) / 2., len(r2vec) - 1)
            print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (str(k) + ':' + str(m) + ':' + modelstring,
                     mean, h, meanr2, hr2))
    print('')
    print("########best GTR model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
          % (str(k) + ':' + str(m) + ':' + modelvec,
             savemean, saveh, savemeanr2, savehr2))
    print("")
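# A hypothetical invocation on a small synthetic regression set (assumes
# the ugtm helpers imported by this module; the synthetic data and the
# fixed l, s values are illustrative, chosen to keep the sweep short --
# leaving l and s negative sweeps the built-in grids):
import numpy as np

np.random.seed(3)
data = np.random.randn(60, 10)
labels = data[:, 0] + 0.1 * np.random.randn(60)
crossvalidateGTR(data, labels, l=0.01, s=1.0, n_repetitions=2)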
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * numpy.array(data)
    mean, se = numpy.mean(a), stats.sem(a)
    h = se * t.ppf((1 + confidence) / 2., len(a) - 1)
    return mean, mean - h, mean + h
def mean_confidence_interval(x, confidence=0.95):
    a = np.array(x) * 1.0
    mu, se = np.mean(a), scipy.stats.sem(a)
    me = se * t.ppf((1 + confidence) / 2., len(a) - 1)
    return mu, mu - me, mu + me
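# The same interval can be cross-checked against SciPy's built-in
# t.interval, which wraps the identical quantile computation (the sample
# values are made up for illustration):
import numpy as np
import scipy.stats
from scipy.stats import t

data = [2.1, 2.5, 1.9, 2.3, 2.8, 2.2]
mu, lower, upper = mean_confidence_interval(data)
lo2, hi2 = t.interval(0.95, len(data) - 1,
                      loc=np.mean(data), scale=scipy.stats.sem(data))
print(np.isclose(lower, lo2), np.isclose(upper, hi2))  # True True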
def crossvalidatePCAC(data, labels, doPCA=False, n_components=-1,
                      missing=False, missing_strategy='most_frequent',
                      random_state=1234, n_neighbors=1, n_folds=5,
                      n_repetitions=10, maxneighbours=11):
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8) + 1
        print("Used number of components "
              "explaining 80%% of the variance = %s\n" % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=k_for_kNN\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    if n_neighbors <= 0:
        Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32)
    else:
        Kvec = [n_neighbors]
    savemean = -9999
    nummodel = 0
    savemodel = ""
    for c in Kvec:
        nummodel += 1
        modelstring = str(c)
        recallvec = []
        precisionvec = []
        f1vec = []
        recallclassvec = np.array([])
        precisionclassvec = np.array([])
        f1classvec = np.array([])
        meanclass = np.zeros(nClasses)
        meanprecisionclass = np.zeros(nClasses)
        meanf1class = np.zeros(nClasses)
        seclass = np.zeros(nClasses)
        seprecisionclass = np.zeros(nClasses)
        sef1class = np.zeros(nClasses)
        hclass = np.zeros(nClasses)
        hprecisionclass = np.zeros(nClasses)
        hf1class = np.zeros(nClasses)
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(
                    train, test, doPCA, n_components,
                    missing, missing_strategy)
                y_pred = np.append(
                    y_pred,
                    ugtm_predictions.predictNNSimple(
                        processed.train, processed.test,
                        labels[train_index], c, "classification"))
                y_true = np.append(y_true, labels[test_index])
            recall = recall_score(y_true, y_pred, average='weighted')
            precision = precision_score(y_true, y_pred, average='weighted')
            f1 = f1_score(y_true, y_pred, average='weighted')
            recallvec = np.append(recallvec, recall)
            precisionvec = np.append(precisionvec, precision)
            f1vec = np.append(f1vec, f1)
            recallclass = recall_score(y_true, y_pred, average=None)
            precisionclass = precision_score(y_true, y_pred, average=None)
            f1class = f1_score(y_true, y_pred, average=None)
            if j == 0:
                recallclassvec = recallclass
                precisionclassvec = precisionclass
                f1classvec = f1class
            else:
                recallclassvec = np.vstack([recallclassvec, recallclass])
                precisionclassvec = np.vstack(
                    [precisionclassvec, precisionclass])
                f1classvec = np.vstack([f1classvec, f1class])
        mean, se = np.mean(recallvec), st.sem(recallvec)
        meanprecision, seprecision = np.mean(precisionvec), st.sem(precisionvec)
        meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
        h = se * t.ppf((1 + 0.95) / 2., len(recallvec) - 1)
        hprecision = seprecision * t.ppf((1 + 0.95) / 2.,
                                         len(precisionvec) - 1)
        hf1 = sef1 * t.ppf((1 + 0.95) / 2., len(f1vec) - 1)
        if meanf1 > savemean:
            savemean = meanf1
            savemodel = "Model " + str(nummodel)
        for i in range(0, nClasses):
            meanclass[i] = np.mean(recallclassvec[:, i])
            seclass[i] = st.sem(recallclassvec[:, i])
            meanf1class[i] = np.mean(f1classvec[:, i])
            sef1class[i] = st.sem(f1classvec[:, i])
            meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
            seprecisionclass[i] = st.sem(precisionclassvec[:, i])
            hclass[i] = seclass[i] * \
                t.ppf((1 + 0.95) / 2., len(recallclassvec[:, i]) - 1)
            hprecisionclass[i] = seprecisionclass[i] * \
                t.ppf((1 + 0.95) / 2., len(precisionclassvec[:, i]) - 1)
            hf1class[i] = sef1class[i] * \
                t.ppf((1 + 0.95) / 2., len(f1classvec[:, i]) - 1)
        print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
              % (nummodel, modelstring, mean, h,
                 meanprecision, hprecision, meanf1, hf1))
        for i in range(nClasses):
            print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  "\t%.4f +/- %.4f"
                  % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                     meanprecisionclass[i], hprecisionclass[i],
                     meanf1class[i], hf1class[i]))
        print('')
    print('')
    print("########best nearest neighbors model##########")
    print(savemodel)
    print("")
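# A hypothetical invocation on a toy three-class problem (assumes the
# ugtm preprocessing and prediction helpers imported by this module;
# n_neighbors=0 triggers the sweep over k from 1 to maxneighbours - 1):
import numpy as np

np.random.seed(4)
data = np.random.randn(90, 8)
labels = np.random.choice(["a", "b", "c"], size=90)
crossvalidatePCAC(data, labels, n_neighbors=0, maxneighbours=5,
                  n_repetitions=2)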
def crossvalidateGTC(data, labels, n_neighbors=1, representation="modes",
                     niter=200, k=0, m=0, doPCA=False, n_components=-1,
                     missing=False, missing_strategy='most_frequent',
                     random_state=1234, predict_mode="bayes",
                     prior="equiprobable", l=-1.0, s=-1.0,
                     n_folds=5, n_repetitions=10):
    print("")
    print("k = sqrt(grid size), m = sqrt(radial basis function grid size), "
          "l = regularization, s = RBF width factor")
    print("")
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: %s" % (nClasses))
    print("")
    print("model\tparameters=k:m:s:l\t"
          "recall with CI\tprecision with CI\tF1-score with CI")
    print("")
    if k == 0:
        k = int(math.sqrt(5 * math.sqrt(data.shape[0]))) + 2
    if m == 0:
        m = int(math.sqrt(k))
    if n_components == -1 and doPCA:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8) + 1
        print("Used number of components explaining 80%% of "
              "the variance in whole data set = %s\n" % n_components)
    if l < 0.0:
        lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    else:
        lvec = [l]
    if s < 0.0:
        svec = [0.25, 0.5, 1.0, 1.50, 2.0]
    else:
        svec = [s]
    savemean = -9999
    nummodel = 0
    savemodel = ""
    for s in svec:
        for l in lvec:
            modelstring = str(k) + ':' + str(m) + ":" + str(s) + ":" + str(l)
            nummodel += 1
            recallvec = []
            precisionvec = []
            f1vec = []
            recallclassvec = np.array([])
            precisionclassvec = np.array([])
            f1classvec = np.array([])
            meanclass = np.zeros(nClasses)
            meanprecisionclass = np.zeros(nClasses)
            meanf1class = np.zeros(nClasses)
            seclass = np.zeros(nClasses)
            seprecisionclass = np.zeros(nClasses)
            sef1class = np.zeros(nClasses)
            hclass = np.zeros(nClasses)
            hprecisionclass = np.zeros(nClasses)
            hf1class = np.zeros(nClasses)
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    prediction = ugtm_predictions.GTC(
                        train=train, labels=labels[train_index],
                        test=test, k=k, m=m, s=s, l=l,
                        n_neighbors=n_neighbors, niter=niter,
                        representation=representation,
                        doPCA=doPCA, n_components=n_components,
                        random_state=random_state, missing=missing,
                        missing_strategy=missing_strategy,
                        predict_mode=predict_mode, prior=prior)
                    y_true = np.append(y_true, labels[test_index])
                    y_pred = np.append(y_pred, prediction)
                recall = recall_score(y_true, y_pred, average='weighted')
                precision = precision_score(y_true, y_pred,
                                            average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                recallvec = np.append(recallvec, recall)
                precisionvec = np.append(precisionvec, precision)
                f1vec = np.append(f1vec, f1)
                recallclass = recall_score(y_true, y_pred, average=None)
                precisionclass = precision_score(y_true, y_pred, average=None)
                f1class = f1_score(y_true, y_pred, average=None)
                if j == 0:
                    recallclassvec = recallclass
                    precisionclassvec = precisionclass
                    f1classvec = f1class
                else:
                    recallclassvec = np.vstack([recallclassvec, recallclass])
                    precisionclassvec = np.vstack(
                        [precisionclassvec, precisionclass])
                    f1classvec = np.vstack([f1classvec, f1class])
            mean, se = np.mean(recallvec), st.sem(recallvec)
            meanprecision, seprecision = np.mean(precisionvec), st.sem(
                precisionvec)
            meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
            h = se * t.ppf((1 + 0.95) / 2., len(recallvec) - 1)
            hprecision = seprecision * \
                t.ppf((1 + 0.95) / 2., len(precisionvec) - 1)
            hf1 = sef1 * t.ppf((1 + 0.95) / 2., len(f1vec) - 1)
            if meanf1 > savemean:
                savemean = meanf1
                savemodel = "Model " + str(nummodel)
            for i in range(0, nClasses):
                meanclass[i] = np.mean(recallclassvec[:, i])
                seclass[i] = st.sem(recallclassvec[:, i])
                meanf1class[i] = np.mean(f1classvec[:, i])
                sef1class[i] = st.sem(f1classvec[:, i])
                meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
                seprecisionclass[i] = st.sem(precisionclassvec[:, i])
                hclass[i] = seclass[i] * \
                    t.ppf((1 + 0.95) / 2., len(recallclassvec[:, i]) - 1)
                hprecisionclass[i] = seprecisionclass[i] * \
                    t.ppf((1 + 0.95) / 2., len(precisionclassvec[:, i]) - 1)
                hf1class[i] = sef1class[i] * \
                    t.ppf((1 + 0.95) / 2., len(f1classvec[:, i]) - 1)
            print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (nummodel, modelstring, mean, h,
                     meanprecision, hprecision, meanf1, hf1))
            for i in range(nClasses):
                print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                      "\t%.4f +/- %.4f"
                      % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                         meanprecisionclass[i], hprecisionclass[i],
                         meanf1class[i], hf1class[i]))
            print('')
    print('')
    print("########best GTC model##########")
    print(savemodel)
    print("")
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), sp.stats.sem(a)
    h = se * t.ppf((1 + confidence) / 2., n - 1)
    return m, h
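# Unlike the earlier variants, this one returns the half-width h rather
# than the interval bounds, which suits "mean +/- h" reporting.
# Hypothetical usage (assumes numpy as np, scipy as sp, and scipy.stats.t
# in scope, as the function body does):
m, h = mean_confidence_interval([3.2, 3.5, 3.1, 3.8])
print("%.3f +/- %.3f" % (m, h))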
def crossvalidateSVCrbf(data, labels, doPCA=False, n_components=-1,
                        missing=False, missing_strategy='most_frequent',
                        random_state=1234, C=1, gamma=1,
                        n_folds=5, n_repetitions=10):
    # dtype=float replaces the removed np.float alias
    if C < 0.0:
        Cvec = np.power(2, np.arange(start=-5, stop=15, step=1, dtype=float))
    else:
        Cvec = [C]
    if gamma < 0.0:
        gvec = np.power(2.0, np.arange(start=-15, stop=3, step=1,
                                       dtype=float))
    else:
        gvec = [gamma]
    modelvec = ""
    savemean = -9999.0
    saveh = 0.0
    nummodel = 0
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8) + 1
        print("Used number of components explaining 80%% "
              "of the variance = %s\n" % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=C:gamma\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    for C in Cvec:
        for g in gvec:
            modelstring = str(C) + "-" + str(g)
            nummodel += 1
            recallvec = []
            precisionvec = []
            f1vec = []
            recallclassvec = np.array([])
            precisionclassvec = np.array([])
            f1classvec = np.array([])
            meanclass = np.zeros(nClasses)
            meanprecisionclass = np.zeros(nClasses)
            meanf1class = np.zeros(nClasses)
            seclass = np.zeros(nClasses)
            seprecisionclass = np.zeros(nClasses)
            sef1class = np.zeros(nClasses)
            hclass = np.zeros(nClasses)
            hprecisionclass = np.zeros(nClasses)
            hf1class = np.zeros(nClasses)
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    processed = ugtm_preprocess.processTrainTest(
                        train, test, doPCA, n_components,
                        missing, missing_strategy)
                    clf = SVC(kernel='rbf', C=C, gamma=g)
                    clf.fit(processed.train, labels[train_index])
                    y_pred = np.append(y_pred, clf.predict(processed.test))
                    y_true = np.append(y_true, labels[test_index])
                recall = recall_score(y_true, y_pred, average='weighted')
                precision = precision_score(y_true, y_pred,
                                            average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                recallvec = np.append(recallvec, recall)
                precisionvec = np.append(precisionvec, precision)
                f1vec = np.append(f1vec, f1)
                recallclass = recall_score(y_true, y_pred, average=None)
                precisionclass = precision_score(y_true, y_pred, average=None)
                f1class = f1_score(y_true, y_pred, average=None)
                if j == 0:
                    recallclassvec = recallclass
                    precisionclassvec = precisionclass
                    f1classvec = f1class
                else:
                    recallclassvec = np.vstack([recallclassvec, recallclass])
                    precisionclassvec = np.vstack(
                        [precisionclassvec, precisionclass])
                    f1classvec = np.vstack([f1classvec, f1class])
            mean, se = np.mean(recallvec), st.sem(recallvec)
            meanprecision, seprecision = np.mean(precisionvec), st.sem(
                precisionvec)
            meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
            h = se * t.ppf((1 + 0.95) / 2., len(recallvec) - 1)
            hprecision = seprecision * \
                t.ppf((1 + 0.95) / 2., len(precisionvec) - 1)
            hf1 = sef1 * t.ppf((1 + 0.95) / 2., len(f1vec) - 1)
            if meanf1 > savemean:
                savemean = meanf1
                saveh = hf1
                modelvec = modelstring
                savemodel = "Model " + str(nummodel)
            for i in range(0, nClasses):
                meanclass[i], seclass[i] = np.mean(recallclassvec[:, i]), \
                    st.sem(recallclassvec[:, i])
                meanf1class[i], sef1class[i] = np.mean(f1classvec[:, i]), \
                    st.sem(f1classvec[:, i])
                meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
                seprecisionclass[i] = st.sem(precisionclassvec[:, i])
                hclass[i] = seclass[i] * \
                    t.ppf((1 + 0.95) / 2., len(recallclassvec[:, i]) - 1)
                hprecisionclass[i] = seprecisionclass[i] * \
                    t.ppf((1 + 0.95) / 2., len(precisionclassvec[:, i]) - 1)
                hf1class[i] = sef1class[i] * \
                    t.ppf((1 + 0.95) / 2., len(f1classvec[:, i]) - 1)
            print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (nummodel, modelstring, mean, h,
                     meanprecision, hprecision, meanf1, hf1))
            for i in range(nClasses):
                print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                      "\t%.4f +/- %.4f"
                      % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                         meanprecisionclass[i], hprecisionclass[i],
                         meanf1class[i], hf1class[i]))
            print("")
    print("")
    print("########best RBF SVM model##########")
    print(savemodel)
    print("")
def crossvalidateSVR(data, labels, doPCA=False, n_components=-1,
                     missing=False, missing_strategy='most_frequent',
                     random_state=1234, C=-1, epsilon=-1,
                     n_folds=5, n_repetitions=10):
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    # dtype=float replaces the removed np.float alias
    if C < 0.0:
        Cvec = np.power(2, np.arange(start=-5, stop=15, step=1, dtype=float))
    else:
        Cvec = [C]
    if epsilon < 0.0:
        EpsVec = [0, 0.01, 0.1, 0.5, 1, 2, 4]
    else:
        EpsVec = [epsilon]
    modelvec = ""
    savemean = 99999
    saveh = 0.0
    savemeanr2 = 0.0
    savehr2 = 0.0
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8) + 1
        print("Used number of components explaining 80%% "
              "of the variance = %s\n" % n_components)
    print("C:epsilon\tRMSE with CI\tR2 with CI\t")
    for C in Cvec:
        for eps in EpsVec:
            modelstring = str(C) + ":" + str(eps)
            rmsevec = []
            r2vec = []
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    processed = ugtm_preprocess.processTrainTest(
                        train, test, doPCA, n_components,
                        missing, missing_strategy)
                    clf = SVR(kernel='linear', C=C, epsilon=eps)
                    clf.fit(processed.train, labels[train_index])
                    y_pred = np.append(y_pred, clf.predict(processed.test))
                    y_true = np.append(y_true, labels[test_index])
                rmse = math.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)
                rmsevec = np.append(rmsevec, rmse)
                r2vec = np.append(r2vec, r2)
            mean, se = np.mean(rmsevec), st.sem(rmsevec)
            h = se * t.ppf((1 + 0.95) / 2., len(rmsevec) - 1)
            meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
            hr2 = ser2 * t.ppf((1 + 0.95) / 2., len(r2vec) - 1)
            if mean < savemean:
                savemean = mean
                saveh = h
                modelvec = modelstring
                savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
                savehr2 = saveser2 * t.ppf((1 + 0.95) / 2., len(r2vec) - 1)
            print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best linear SVM model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
          % (modelvec, savemean, saveh, savemeanr2, savehr2))
    print("")
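# A hypothetical invocation (assumes the ugtm helpers imported by this
# module). Note the function recodes labels through
# np.unique(..., return_inverse=True), so continuous targets are replaced
# by their rank codes before fitting; the synthetic targets below use a
# few discrete levels to keep that recoding sensible:
import numpy as np

np.random.seed(5)
data = np.random.randn(60, 6)
labels = np.random.choice([0.0, 0.5, 1.0], size=60)
crossvalidateSVR(data, labels, C=1.0, epsilon=0.1, n_repetitions=2)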
def crossvalidatePCAR(data, labels, doPCA=False, n_components=-1,
                      missing=False, missing_strategy='most_frequent',
                      random_state=1234, n_neighbors=1,
                      n_folds=5, n_repetitions=10, maxneighbours=11):
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8) + 1
        print("Used number of components explaining 80%% of the variance"
              " = %s\n" % n_components)
        print("")
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    if n_neighbors <= 0:
        Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32)
    else:
        Kvec = [n_neighbors]
    modelvec = ""
    savemean = 99999
    saveh = 0.0
    savemeanr2 = 0.0
    savehr2 = 0.0
    nummodel = 0
    print("k = number of nearest neighbours\tRMSE with CI\tR2 with CI\t")
    for c in Kvec:
        nummodel += 1
        modelstring = str(c)
        rmsevec = []
        r2vec = []
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(
                    train, test, doPCA, n_components,
                    missing, missing_strategy)
                y_pred = np.append(
                    y_pred,
                    ugtm_predictions.predictNNSimple(
                        processed.train, processed.test,
                        labels[train_index], c, "regression"))
                y_true = np.append(y_true, labels[test_index])
            rmse = math.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)
            rmsevec = np.append(rmsevec, rmse)
            r2vec = np.append(r2vec, r2)
        mean, se = np.mean(rmsevec), st.sem(rmsevec)
        h = se * t.ppf((1 + 0.95) / 2., len(rmsevec) - 1)
        meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
        hr2 = ser2 * t.ppf((1 + 0.95) / 2., len(r2vec) - 1)
        if mean < savemean:
            savemean = mean
            saveh = h
            modelvec = modelstring
            savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
            savehr2 = saveser2 * t.ppf((1 + 0.95) / 2., len(r2vec) - 1)
        print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
              % (modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best nearest neighbors model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
          % (modelvec, savemean, saveh, savemeanr2, savehr2))
    print("")