def stats(modelgrid, inventory, dx=100., Nsamp=None, method='nearest',
          extent='inventory', bins=None, runtests=True, showplots=True,
          saveplots=False, filepath=None):
    """
    Run through suite of tests for models that output probability or index
    that varies between 0 and 1.

    :param modelgrid: Grid2D object of model results
    :param inventory: full file path to shapefile of inventory, must be in
      geographic coordinates, WGS84
    :type inventory: string
    :param dx: Approximate sample spacing in meters, overwritten if Nsamp
      is defined
    :type dx: float
    :param Nsamp: Total number of samples desired - will choose optimal dx
      to get slightly more than this number of samples, delete samples
      outside bounds and randomly delete others until sample number is
      exactly Nsamp
    :type Nsamp: integer
    :param method: method used for interp2d when transforming sampled model
      values back from projected coordinates to geographic coordinates -
      'nearest', 'linear', or 'cubic'
    :type method: string
    :param extent: extent to include in sampling - 'inventory' or 'model'
      or custom bounds as tuple (xmin, ymin, xmax, ymax) - in lats and lons
    :param bins: bin edges to use for various binning and threshold
      statistical calculations. If None, bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]
    :param runtests: if True, will run various statistical tests, if False
      will just output sampled values
    :param showplots: if True, will display the plots
    :param saveplots: if True, will save the plots
    :param filepath: Filepath for saved plots; if None, will save in current
      directory. Files are named with test name and time stamp
    :returns yespoints: Nx2 array of geographic coordinates of positive
      sample locations
    :returns nopoints: Nx2 array of geographic coordinates of negative
      sample locations
    :returns modelvalyes: N model output values corresponding to yespoints
    :returns modelvalno: N model output values corresponding to nopoints
    :returns results: dictionary of results of statistical tests. Will be
      empty if runtests=False
      {'Occ_nonocc': dict, 'SRC': dict, 'ROC': dict, 'AUC_ROC': float,
      'Log_loss': float, 'GFC': dict, 'Pred_vs_Obs': dict, 'Brier': float,
      'Brier_no': float, 'Brier_yes': float}
    :rtype results: dictionary
    """
    plt.close('all')

    # Read inventory shapes and work out the sampling extent.
    f = fiona.collection(inventory, 'r')
    shapes = list(f)
    bxmin, bymin, bxmax, bymax = f.bounds
    gdict = modelgrid.getGeoDict()
    if extent == 'model':
        extent = gdict.xmin, gdict.ymin, gdict.xmax, gdict.ymax
    elif extent == 'inventory':
        extent = bxmin, bymin, bxmax, bymax
    yespoints, nopoints, xvar, yvar, pshapes, proj = pointsFromShapes(
        shapes, extent, dx=dx, Nsamp=Nsamp)
    yesptx = [pt[0] for pt in yespoints]
    yespty = [pt[1] for pt in yespoints]
    noptx = [pt[0] for pt in nopoints]
    nopty = [pt[1] for pt in nopoints]

    # Get values of model at those points. Grid rows run from ymax down to
    # ymin, hence the descending lats vector.
    lons = np.linspace(gdict.xmin, gdict.xmax, gdict.nx)
    lats = np.linspace(gdict.ymax, gdict.ymin, gdict.ny)
    if method.lower() == 'nearest':
        modelvalyes = []
        modelvalno = []
        for XX, YY in zip(yesptx, yespty):
            row = (np.abs(lats - YY)).argmin()
            col = (np.abs(lons - XX)).argmin()
            modelvalyes.append(modelgrid.getData()[row, col])
        for XX, YY in zip(noptx, nopty):
            row = (np.abs(lats - YY)).argmin()
            col = (np.abs(lons - XX)).argmin()
            modelvalno.append(modelgrid.getData()[row, col])
    else:
        # NOTE(review): interp2d is deprecated/removed in recent SciPy;
        # consider RegularGridInterpolator when upgrading dependencies.
        func = interpolate.interp2d(lons, lats, modelgrid.getData(),
                                    kind=method.lower())
        modelvalyes = np.array([float(func(XX, YY))
                                for XX, YY in zip(yesptx, yespty)])
        modelvalno = np.array([float(func(XX, YY))
                               for XX, YY in zip(noptx, nopty)])

    modelvalyes = np.nan_to_num(np.array(modelvalyes))  # replace nan with zeros
    modelvalno = np.nan_to_num(np.array(modelvalno))  # replace nan with zeros

    # Now run the desired tests and make the desired plots
    results = {}
    if runtests is True:
        # Brier score: mean squared difference between predicted probability
        # and the observed outcome (1 for occurrence, 0 for nonoccurrence).
        N = len(yespoints) + len(nopoints)
        yessum = np.sum([(val-1)**2 for val in modelvalyes])
        nosum = np.sum([(val)**2 for val in modelvalno])
        results['Brier_yes'] = yessum/len(modelvalyes)
        results['Brier_no'] = nosum/len(modelvalno)
        results['Brier'] = (yessum + nosum)/N
        print(('Brier scores: overall %0.3f\nBrier_yes score: %0.3f\nBrier_no score %0.3f'
               % (results['Brier'], results['Brier_yes'], results['Brier_no'])))

        # Logarithmic score - clip exact zeros so log() stays finite.
        tempno = np.array(modelvalno).copy()
        tempyes = np.array(modelvalyes).copy()
        tempno[tempno == 0] = 1.e-15
        tempyes[tempyes == 0] = 1.e-15
        results['Log_loss'] = -(np.sum(np.log(tempyes)) +
                                np.sum(np.log(1.-tempno)))/N
        print(('Log loss score: %0.3f' % (results['Log_loss'],)))

        if bins is None:
            bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]
        binvec = []
        observed = []
        percyes = []
        percno = []
        overall_tot = len(modelvalyes) + len(modelvalno)
        for i in range(len(bins[:-1])):
            binvec.append(bins[i]+(bins[i+1]-bins[i])/2)
            yestot = np.sum([(modelvalyes > bins[i]) &
                             (modelvalyes < bins[i+1])])
            notot = np.sum([(modelvalno > bins[i]) & (modelvalno < bins[i+1])])
            if notot+yestot != 0:
                observed.append(float(yestot)/(yestot+notot))
            else:
                # BUG FIX: previously appended the *string* 'nan', which
                # breaks plotting and any numeric use of the returned
                # results; use a real NaN instead.
                observed.append(float('nan'))
            percyes.append((yestot/float(overall_tot))*100.)
            percno.append((notot/float(overall_tot))*100.)

        plt.ioff()

        # Predicted vs. Observed ratios
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(binvec, observed, '-o')
        ax.plot([0]+binvec, [0]+binvec, '--', color='gray')  # 1:1 reference
        ax.set_xlabel('Expected ratio')
        ax.set_ylabel('Observed ratio')
        ax.set_xlim([bins[0], bins[-1]])
        ax.set_title('Predicted vs. Observed')
        results['Pred_vs_Obs'] = {'binvec': binvec, 'observed': observed}

        # Ground failure occurrence/nonoccurrence
        fig1 = plt.figure()
        ax1 = fig1.add_subplot(111)
        wid = (bins[1]-bins[0])/2.5
        rects1 = ax1.bar(np.array(bins[:-1]), percyes, width=wid)
        rects2 = ax1.bar(np.array(bins[:-1])+wid, percno, width=wid,
                         color='r')
        ax1.set_xlabel('Predicted susceptibility range')
        ax1.set_ylabel('% of samples')
        ax1.legend((rects1[0], rects2[0]), ('Occurrence', 'Nonoccurrence'))
        ax1.set_title('Occurrence vs. Nonoccurrence')
        results['Occ_nonocc'] = {'bins': bins, 'percyes': percyes,
                                 'percno': percno}

        # Ground failure capture (fraction of yes points above threshold)
        gfc = []
        for val in bins:
            gfc.append(np.sum([modelvalyes > val])/float(len(yespoints)))
        fig2 = plt.figure()
        ax2 = fig2.add_subplot(111)
        ax2.plot(bins, gfc, 'o-')
        ax2.set_xlabel('Threshold')
        ax2.set_ylabel(r'%GFC')
        ax2.set_title('Ground Failure Capture')
        results['GFC'] = {'thresholds': bins, 'gfc': gfc}

        # ROC curves
        fpr, tpr, thresholds = roc_curve(
            np.concatenate((np.ones(len(yespoints)),
                            np.zeros(len(nopoints)))),
            np.concatenate((modelvalyes, modelvalno)))
        fig3 = plt.figure()
        ax3 = fig3.add_subplot(111)
        ax3.plot(fpr, tpr)
        ax3.set_xlabel('False positive rate')
        ax3.set_ylabel('True positive rate')
        ax3.set_xlim([0, 1.])
        ax3.set_ylim([0, 1.])
        ax3.plot(fpr, fpr, '--', color='gray')  # chance (diagonal) reference
        ax3.set_title('ROC curve')
        # BUG FIX: previously stored a copy of the GFC dict under 'ROC';
        # store the actual ROC curve arrays as the docstring promises.
        results['ROC'] = {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}
        results['AUC_ROC'] = roc_auc_score(
            np.concatenate((np.ones(len(yespoints)),
                            np.zeros(len(nopoints)))),
            np.concatenate((modelvalyes, modelvalno)))
        print(('AUC_ROC: %0.3f' % (results['AUC_ROC'],)))
        ax3.text(0.8, 0.2, 'AUC: %0.3f' % results['AUC_ROC'])

        # Success rate curves
        sucbin = np.linspace(0, 1., 100)
        prop = []
        realvals = np.concatenate((np.ones(len(yespoints)),
                                   np.zeros(len(nopoints))))
        predvals = np.concatenate((modelvalyes, modelvalno))
        indx = np.argsort(predvals)
        predvals = predvals[indx]
        realvals = realvals[indx]
        for val in sucbin:
            prop.append(np.sum(realvals[predvals < val])/len(yespoints))
        fig4 = plt.figure()
        ax4 = fig4.add_subplot(111)
        ax4.plot(sucbin, prop)
        # NOTE(review): the xlabel/title strings below look swapped, but
        # they are kept as-is to preserve existing plot output - confirm
        # intended labeling with the original author.
        ax4.set_xlabel('Success Rate Curve')
        ax4.set_ylabel('Proportion of actual occurrences')
        ax4.set_title('Proportion of Study Area')
        AUC = auc(sucbin, prop)
        print(('AUC_SRC: %0.3f' % AUC))
        ax4.text(0.8, 0.2, 'AUC: %0.3f' % AUC)
        ax4.set_xlim([0, 1.])
        ax4.set_ylim([0, 1.])
        results['SRC'] = {'xvals': sucbin, 'proportion': prop, 'auc': AUC}

    if showplots is True:
        plt.show()
    if saveplots is True:
        # NOTE(review): figures only exist when runtests is True;
        # saveplots=True with runtests=False raises NameError (pre-existing
        # behavior, unchanged here).
        if filepath is None:
            filepath = os.getcwd()
        import datetime
        time1 = datetime.datetime.utcnow().strftime('%d%b%Y_%H%M')
        fig.savefig(os.path.join(filepath, 'Pred_vs_obs_%s.pdf' % (time1,)))
        fig1.savefig(os.path.join(filepath, 'Occ_nonocc_%s.pdf' % (time1,)))
        fig2.savefig(os.path.join(filepath, 'GFC_%s.pdf' % (time1,)))
        fig3.savefig(os.path.join(filepath, 'ROC_%s.pdf' % (time1,)))
        fig4.savefig(os.path.join(filepath, 'SRC_%s.pdf' % (time1,)))

    return yespoints, nopoints, modelvalyes, modelvalno, results
def computeCoverage_accurate(gdict, inventory, numdiv=10.):
    """
    VERY SLOW!! Slow but more accurate method to produce grid of area
    actually affected by landsliding in each cell defined by geodict.

    :param gdict: geodict, likely taken from model to compare inventory
      against
    :param inventory: full file path to shapefile of inventory, must be in
      geographic coordinates, WGS84
    :type inventory: string
    :param numdiv: Approximate amount to subdivide each cell of geodict by
      to compute areas (higher number slower but more accurate)
    :return inventorygrid: Grid2D object reporting areal coverage of
      landsliding inside each cell defined by geodict
    """
    import time

    f = fiona.collection(inventory, 'r')
    shapes = list(f)

    # Cell-center coordinate vectors of the target grid (rows run from
    # ymax down to ymin).
    lons = np.linspace(gdict.xmin, gdict.xmax, gdict.nx)
    lats = np.linspace(gdict.ymax, gdict.ymin, gdict.ny)
    llons, llats = np.meshgrid(lons, lats)

    # Sample spacing in meters so that each grid cell gets roughly
    # numdiv x numdiv sample points (111.12 km per degree of latitude).
    spacing = np.round(np.abs(((lats[1]-lats[0])*111.12*1000.)/numdiv))
    yespoints, nopoints, xvar, yvar, pshapes, proj = pointsFromShapes(
        shapes, bounds=(gdict.xmin, gdict.ymin, gdict.xmax, gdict.ymax),
        dx=spacing)

    # Bounding box of all sampled points; only grid cells inside it need
    # to be visited.
    # BUG FIX: the two *min bounds previously used np.max over the minima
    # (copy-paste from the max lines), which wrongly shrank the sub-region.
    ptlonmax = np.max((yespoints[:, 0].max(), nopoints[:, 0].max()))
    ptlonmin = np.min((yespoints[:, 0].min(), nopoints[:, 0].min()))
    ptlatmax = np.max((yespoints[:, 1].max(), nopoints[:, 1].max()))
    ptlatmin = np.min((yespoints[:, 1].min(), nopoints[:, 1].min()))
    inbounds = ((llons >= ptlonmin) & (llons <= ptlonmax) &
                (llats >= ptlatmin) & (llats <= ptlatmax))
    subllons = llons[inbounds]
    subllats = llats[inbounds]

    # Contains points method
    # BUG FIX: time.clock() was removed in Python 3.8; use wall-clock
    # time.time() for the elapsed-time report.
    t1 = time.time()
    dx = gdict.dx
    area = np.zeros(np.shape(llons))
    numpts = area.copy()
    numyes = area.copy()
    for lat1, lon1 in zip(subllats, subllons):
        # Ratio of yes points to total points inside this cell's box
        bbPath = mplPath.Path(np.array([[lon1-0.5*dx, lat1-0.5*dx],
                                        [lon1-0.5*dx, lat1+0.5*dx],
                                        [lon1+0.5*dx, lat1+0.5*dx],
                                        [lon1+0.5*dx, lat1-0.5*dx]]))
        yesin = sum(bbPath.contains_points(yespoints))
        noin = sum(bbPath.contains_points(nopoints))
        total = yesin + noin
        if total == 0.:
            continue
        # Map this cell center back to its grid indices
        row = np.where(lats == lat1)
        col = np.where(lons == lon1)
        # Store total number of points and number of yes points
        numpts[row, col] = total
        numyes[row, col] = yesin
    t2 = time.time()
    print(('Time elapsed %0.2f seconds' % (t2-t1)))

    # Correct for incompletely sampled squares (all unsampled points would
    # be no points). Will change zeros to nonzeros, but yeses will be 0 in
    # those cells so it doesn't matter.
    numpts[numpts < (numpts[numpts != 0].mean() -
                     numpts[numpts != 0].std())] = \
        np.median(numpts[numpts != 0])
    area = numyes/numpts
    inventorygrid = GDALGrid(area, gdict)
    return inventorygrid