Exemple #1
0
def stats(modelgrid,
          inventory,
          dx=100.,
          Nsamp=None,
          method='nearest',
          extent='inventory',
          bins=None,
          runtests=True,
          showplots=True,
          saveplots=False,
          filepath=None):
    """Run a suite of tests for models whose output is a probability or index between 0 and 1.

    The inventory shapefile is sampled into positive ("yes") and negative
    ("no") points with ``pointsFromShapes``, the model grid is evaluated at
    those points, and (optionally) a set of scores and diagnostic plots is
    produced. All open matplotlib figures are closed on entry.

    :param modelgrid: Grid2D object of model results
    :param inventory: full file path to shapefile of inventory, must be in geographic coordinates, WGS84
    :type inventory: string
    :param dx: Approximate sample spacing in meters, overwritten if Nsamp is defined
    :type dx: float
    :param Nsamp: Total number of samples desired - will choose optimal dx to get slightly more than this number of samples, delete samples outside bounds and randomly delete others until sample number is exactly Nsamp
    :type Nsamp: integer
    :param method: how model values are looked up at the sample points - 'nearest' picks the closest grid cell; any other value ('linear', 'cubic') is passed as ``kind`` to interp2d
    :type method: string
    :param extent: extent to include in sampling - 'inventory' or 'model' or custom bounds as tuple (xmin, ymin, xmax, ymax) - in lats and lons
    :param bins: bin edges to use for various binning and threshold statistical calculations. if None bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]
    :param runtests: if truthy, will run various statistical tests, otherwise will just output sampled values
    :param showplots: if truthy, will display the plots
    :param saveplots: if truthy, will save the plots as PDF files
    :param filepath: Filepath for saved plots, if None, will save in current directory. Files are named with test name and time stamp
    :returns:
        * yespoints: coordinates of positive sample locations as returned by pointsFromShapes
        * nopoints: coordinates of negative sample locations as returned by pointsFromShapes
        * modelvalyes: array of model output values corresponding to yespoints (NaN replaced by 0)
        * modelvalno: array of model output values corresponding to nopoints (NaN replaced by 0)
        * results: dictionary of results of statistical tests. Will be empty if runtests is falsy
        {'Occ_nonocc': dict with keys 'bins', 'percyes', 'percno',
        'SRC': dict with keys 'xvals', 'proportion', 'auc',
        'ROC': dict with keys 'fpr', 'tpr', 'thresholds',
        'AUC_ROC': float,
        'Log_loss': float,
        'GFC': dict with keys 'thresholds', 'gfc',
        'Pred_vs_Obs': dict with keys 'binvec', 'observed',
        'Brier': float,
        'Brier_no': float,
        'Brier_yes': float}
    """
    plt.close('all')

    # Read everything needed from the shapefile inside a context manager so
    # the underlying file handle is released even if sampling fails later.
    with fiona.collection(inventory, 'r') as f:
        shapes = list(f)
        inventory_bounds = tuple(f.bounds)
    gdict = modelgrid.getGeoDict()

    if extent == 'model':
        extent = gdict.xmin, gdict.ymin, gdict.xmax, gdict.ymax
    elif extent == 'inventory':
        extent = inventory_bounds

    # Only the sample points are used here; the remaining outputs of
    # pointsFromShapes are intentionally discarded.
    yespoints, nopoints, _xvar, _yvar, _pshapes, _proj = pointsFromShapes(
        shapes, extent, dx=dx, Nsamp=Nsamp)

    # Cell coordinates of the model grid; lats run from ymax down to ymin so
    # that index 0 corresponds to the first row of the data array.
    lons = np.linspace(gdict.xmin, gdict.xmax, gdict.nx)
    lats = np.linspace(gdict.ymax, gdict.ymin, gdict.ny)
    data = modelgrid.getData()  # fetched once instead of once per point

    if method.lower() == 'nearest':
        modelvalyes = _nearest_grid_values(data, lons, lats, yespoints)
        modelvalno = _nearest_grid_values(data, lons, lats, nopoints)
    else:
        # NOTE(review): interp2d is deprecated in recent SciPy releases;
        # confirm the SciPy version in use still provides it.
        func = interpolate.interp2d(lons, lats, data, kind=method.lower())
        # .item() extracts the single interpolated value without relying on
        # float() of a 1-element array.
        modelvalyes = [
            float(np.asarray(func(pt[0], pt[1])).item()) for pt in yespoints
        ]
        modelvalno = [
            float(np.asarray(func(pt[0], pt[1])).item()) for pt in nopoints
        ]

    # Replace NaN with zeros so every sample contributes to the scores.
    modelvalyes = np.nan_to_num(np.array(modelvalyes))
    modelvalno = np.nan_to_num(np.array(modelvalno))

    results = {}
    if not runtests:
        return yespoints, nopoints, modelvalyes, modelvalno, results

    n_yes = len(modelvalyes)
    n_no = len(modelvalno)
    n_total = n_yes + n_no

    # Brier score: mean squared distance from the true outcome (1 for yes
    # points, 0 for no points).
    yessum = np.sum((modelvalyes - 1) ** 2)
    nosum = np.sum(modelvalno ** 2)
    results['Brier_yes'] = yessum / n_yes
    results['Brier_no'] = nosum / n_no
    results['Brier'] = (yessum + nosum) / n_total
    print(
        'Brier scores: overall %0.3f\nBrier_yes score: %0.3f\nBrier_no score %0.3f'
        % (results['Brier'], results['Brier_yes'], results['Brier_no']))

    # Logarithmic score. Clip into (0, 1) so neither log(p) nor log(1 - p)
    # is evaluated at 0 (or at a negative value after interpolation).
    eps = 1.e-15
    clipped_yes = np.clip(modelvalyes, eps, 1. - eps)
    clipped_no = np.clip(modelvalno, eps, 1. - eps)
    results['Log_loss'] = -(np.sum(np.log(clipped_yes)) +
                            np.sum(np.log(1. - clipped_no))) / n_total
    print('Log loss score: %0.3f' % (results['Log_loss'], ))

    if bins is None:
        bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]

    # Per-bin counts of yes/no samples.
    # NOTE(review): both comparisons are strict, so values exactly on a bin
    # edge (including NaNs that were mapped to 0) fall in no bin - confirm
    # this is intended.
    binvec = []
    observed = []
    percyes = []
    percno = []
    for lower, upper in zip(bins[:-1], bins[1:]):
        binvec.append(lower + (upper - lower) / 2)
        yestot = np.sum((modelvalyes > lower) & (modelvalyes < upper))
        notot = np.sum((modelvalno > lower) & (modelvalno < upper))
        if notot + yestot != 0:
            observed.append(float(yestot) / (yestot + notot))
        else:
            # Numeric NaN (not the string 'nan') so the list stays plottable.
            observed.append(float('nan'))
        percyes.append((yestot / float(n_total)) * 100.)
        percno.append((notot / float(n_total)) * 100.)

    plt.ioff()

    # Predicted vs. Observed ratios
    fig_pvo = plt.figure()
    ax_pvo = fig_pvo.add_subplot(111)
    ax_pvo.plot(binvec, observed, '-o')
    ax_pvo.plot([0] + binvec, [0] + binvec, '--', color='gray')
    ax_pvo.set_xlabel('Expected ratio')
    ax_pvo.set_ylabel('Observed ratio')
    ax_pvo.set_xlim([bins[0], bins[-1]])
    ax_pvo.set_title('Predicted vs. Observed')
    results['Pred_vs_Obs'] = {'binvec': binvec, 'observed': observed}

    # Ground failure occurrence/nonoccurrence
    fig_occ = plt.figure()
    ax_occ = fig_occ.add_subplot(111)
    wid = (bins[1] - bins[0]) / 2.5
    left_edges = np.array(bins[:-1])
    rects_yes = ax_occ.bar(left_edges, percyes, width=wid)
    rects_no = ax_occ.bar(left_edges + wid, percno, width=wid, color='r')
    ax_occ.set_xlabel('Predicted susceptibility range')
    ax_occ.set_ylabel('% of samples')
    ax_occ.legend((rects_yes[0], rects_no[0]),
                  ('Occurrence', 'Nonoccurrence'))
    ax_occ.set_title('Occurrence vs. Nonoccurrence')
    results['Occ_nonocc'] = {
        'bins': bins,
        'percyes': percyes,
        'percno': percno
    }

    # Ground failure capture: fraction of yes points above each threshold.
    gfc = [np.sum(modelvalyes > val) / float(n_yes) for val in bins]
    fig_gfc = plt.figure()
    ax_gfc = fig_gfc.add_subplot(111)
    ax_gfc.plot(bins, gfc, 'o-')
    ax_gfc.set_xlabel('Threshold')
    ax_gfc.set_ylabel(r'%GFC')
    ax_gfc.set_title('Ground Failure Capture')
    results['GFC'] = {'thresholds': bins, 'gfc': gfc}

    # True labels (1 = yes, 0 = no) and scores, in matching order; shared by
    # the ROC and success-rate calculations below.
    realvals = np.concatenate((np.ones(n_yes), np.zeros(n_no)))
    predvals = np.concatenate((modelvalyes, modelvalno))

    # ROC curves
    fpr, tpr, thresholds = roc_curve(realvals, predvals)
    fig_roc = plt.figure()
    ax_roc = fig_roc.add_subplot(111)
    ax_roc.plot(fpr, tpr)
    ax_roc.set_xlabel('False positive rate')
    ax_roc.set_ylabel('True positive rate')
    ax_roc.set_xlim([0, 1.])
    ax_roc.set_ylim([0, 1.])
    ax_roc.plot(fpr, fpr, '--', color='gray')
    ax_roc.set_title('ROC curve')
    # Store the ROC curve itself rather than a copy of the GFC results.
    results['ROC'] = {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}

    results['AUC_ROC'] = roc_auc_score(realvals, predvals)
    print('AUC_ROC: %0.3f' % (results['AUC_ROC'], ))
    ax_roc.text(0.8, 0.2, 'AUC: %0.3f' % results['AUC_ROC'])

    # Success rate curves: share of actual occurrences whose predicted value
    # lies below each threshold.
    sucbin = np.linspace(0, 1., 100)
    prop = [np.sum(realvals[predvals < val]) / n_yes for val in sucbin]
    fig_src = plt.figure()
    ax_src = fig_src.add_subplot(111)
    ax_src.plot(sucbin, prop)
    # NOTE(review): the x-label and title strings look swapped; left as-is
    # pending confirmation of the intended labels.
    ax_src.set_xlabel('Success Rate Curve')
    ax_src.set_ylabel('Proportion of actual occurrences')
    ax_src.set_title('Proportion of Study Area')
    AUC = auc(sucbin, prop)
    print('AUC_SRC: %0.3f' % AUC)
    ax_src.text(0.8, 0.2, 'AUC: %0.3f' % AUC)
    ax_src.set_xlim([0, 1.])
    ax_src.set_ylim([0, 1.])
    results['SRC'] = {'xvals': sucbin, 'proportion': prop, 'auc': AUC}

    # Save before showing so the files do not depend on the state of the
    # figures after the interactive windows have been closed.
    if saveplots:
        if filepath is None:
            filepath = os.getcwd()
        import datetime
        time1 = datetime.datetime.utcnow().strftime('%d%b%Y_%H%M')
        named_figures = (
            ('Pred_vs_obs', fig_pvo),
            ('Occ_nonocc', fig_occ),
            ('GFC', fig_gfc),
            ('ROC', fig_roc),
            ('SRC', fig_src),
        )
        for label, figure in named_figures:
            figure.savefig(
                os.path.join(filepath, '%s_%s.pdf' % (label, time1)))
    if showplots:
        plt.show()

    return yespoints, nopoints, modelvalyes, modelvalno, results


def _nearest_grid_values(data, lons, lats, points):
    """Return the grid value of the cell closest to each (x, y) point.

    :param data: 2D array indexed as [row, col]
    :param lons: 1D array of column coordinates
    :param lats: 1D array of row coordinates
    :param points: iterable of points whose first two items are x and y
    :returns: list with one value from ``data`` per point
    """
    values = []
    for pt in points:
        row = np.abs(lats - pt[1]).argmin()
        col = np.abs(lons - pt[0]).argmin()
        values.append(data[row, col])
    return values