Example #1
    def run_diagnostic(self):

        # Read the observed and predicted files into numpy recarrays
        obs = utilities.csv2rec(self.observed_file)
        prd = utilities.csv2rec(self.predicted_file)

        # Subset the observed data just to the IDs that are in the
        # predicted file
        obs_keep = np.in1d(getattr(obs, self.id_field),
                           getattr(prd, self.id_field))
        obs = obs[obs_keep]

        # Calculate VEGCLASS for both the observed and predicted data
        vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

        # Print out the vegclass file
        vc_fh = open(self.vegclass_file, 'w')
        vc_fh.write(','.join((self.id_field, 'OBSERVED', 'PREDICTED')) + '\n')

        # Print out the observed and predicted vegetation classes
        for id_val in sorted(vc_dict.keys()):
            obs_vc = vc_dict[id_val]['obs_vc']
            prd_vc = vc_dict[id_val]['prd_vc']
            out_list = ['%d' % x for x in (id_val, obs_vc, prd_vc)]
            vc_fh.write(','.join(out_list) + '\n')
        vc_fh.close()

        # Create the vegetation class kappa and error matrix files
        vc_xml = 'L:/resources/code/xml/vegclass.xml'
        ca.classification_accuracy(
            self.vegclass_file,
            vc_xml,
            kappa_file=self.vegclass_kappa_file,
            err_matrix_file=self.vegclass_errmatrix_file)
    def run_diagnostic(self):

        # Read the observed and predicted files into numpy recarrays
        obs = utilities.csv2rec(self.observed_file)
        prd = utilities.csv2rec(self.predicted_file)

        # Subset the observed data just to the IDs that are in the
        # predicted file
        obs_keep = np.in1d(
            getattr(obs, self.id_field), getattr(prd, self.id_field))
        obs = obs[obs_keep]

        # Calculate VEGCLASS for both the observed and predicted data
        vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

        # Print out the vegclass file
        vc_fh = open(self.vegclass_file, 'w')
        vc_fh.write(','.join((self.id_field, 'OBSERVED', 'PREDICTED')) + '\n')

        # Print out the observed and predicted vegetation classes
        for id_val in sorted(vc_dict.keys()):
            obs_vc = vc_dict[id_val]['obs_vc']
            prd_vc = vc_dict[id_val]['prd_vc']
            out_list = ['%d' % x for x in (id_val, obs_vc, prd_vc)]
            vc_fh.write(','.join(out_list) + '\n')
        vc_fh.close()

        # Create the vegetation class kappa and error matrix files
        vc_xml = 'L:/resources/code/xml/vegclass.xml'
        ca.classification_accuracy(self.vegclass_file, vc_xml,
            kappa_file=self.vegclass_kappa_file,
            err_matrix_file=self.vegclass_errmatrix_file)
    def run_diagnostic(self):

        # Open the outlier file and write the header line
        vc_outlier_fh = open(self.vegclass_outlier_file, "w")
        header_fields = (self.id_field, "PREDICTION_TYPE", "OBSERVED_VEGCLASS", "PREDICTED_VEGCLASS", "OUTLIER_TYPE")
        vc_outlier_fh.write(",".join(header_fields) + "\n")

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:

            # Read the observed and predicted files into numpy recarrays
            obs = utilities.csv2rec(self.observed_file)
            prd = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file
            obs_keep = np.in1d(getattr(obs, self.id_field), getattr(prd, self.id_field))
            obs = obs[obs_keep]

            # Calculate VEGCLASS for both the observed and predicted data
            vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

            # Find the outliers
            outliers = self.find_vegclass_outliers(vc_dict)

            # Print out the outliers
            for outlier in outliers:
                (id, obs_vc, prd_vc, outlier_type) = outlier
                out_fields = ("%d" % id, "%s" % prd_type.upper(), "%d" % obs_vc, "%d" % prd_vc, "%s" % outlier_type)
                vc_outlier_fh.write(",".join(out_fields) + "\n")

        vc_outlier_fh.close()
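
The loop above walks over (prediction type, file path) pairs. A hedged illustration of what self.predicted_files might hold, with made-up file names:

    # Hypothetical value of self.predicted_files: one dependent and one
    # independent prediction file, matching the comment in the loop above
    predicted_files = [
        ('dependent', 'dependent_predicted.csv'),
        ('independent', 'independent_predicted.csv'),
    ]
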
    def run_diagnostic(self):
        # Open the output file and write the header
        out_fh = open(self.vd_output_file, 'w')
        out_fh.write(
            '%s,PREDICTION_TYPE,VARIABLE,OBSERVED_VALUE,PREDICTED_VALUE,'
            'DIFFERENCE\n' % self.id_field)

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:

            # Read the observed and predicted files into numpy recarrays
            obs_data = utilities.csv2rec(self.observed_file)
            prd_data = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file
            obs_keep = np.in1d(getattr(obs_data, self.id_field),
                               getattr(prd_data, self.id_field))
            obs_data = obs_data[obs_keep]

            # Iterate over the list of deviation variables, capturing the plots
            # that exceed the minimum threshold specified
            outliers = {}
            for (variable, min_deviation) in self.deviation_variables:
                obs_vals = getattr(obs_data, variable)
                prd_vals = getattr(prd_data, variable)
                abs_diff_vals = np.abs(obs_vals - prd_vals)
                indexes = np.argwhere(abs_diff_vals >= min_deviation)
                outliers[variable] = indexes

            # Create the file of outliers
            for (variable, min_deviation) in self.deviation_variables:
                outlier_list = outliers[variable]
                for index in outlier_list:
                    obs_row = obs_data[index]
                    prd_row = prd_data[index]
                    id = getattr(obs_row, self.id_field)
                    obs_val = getattr(obs_row, variable)
                    prd_val = getattr(prd_row, variable)
                    diff_val = obs_val - prd_val
                    out_data = [
                        '%d' % id,
                        '%s' % prd_type.upper(),
                        '%s' % variable,
                        '%.4f' % obs_val,
                        '%.4f' % prd_val,
                        '%.4f' % diff_val,
                    ]
                    out_fh.write(','.join(out_data) + '\n')

        # Clean up
        out_fh.close()
    def run_diagnostic(self):
        # Open the output file and write the header
        out_fh = open(self.vd_output_file, 'w')
        out_fh.write(
            '%s,PREDICTION_TYPE,VARIABLE,OBSERVED_VALUE,PREDICTED_VALUE,'
            'DIFFERENCE\n' % self.id_field)

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:

            # Read the observed and predicted files into numpy recarrays
            obs_data = utilities.csv2rec(self.observed_file)
            prd_data = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file
            obs_keep = np.in1d(getattr(obs_data, self.id_field),
                getattr(prd_data, self.id_field))
            obs_data = obs_data[obs_keep]

            # Iterate over the list of deviation variables, capturing the plots
            # that exceed the minimum threshold specified
            outliers = {}
            for (variable, min_deviation) in self.deviation_variables:
                obs_vals = getattr(obs_data, variable)
                prd_vals = getattr(prd_data, variable)
                abs_diff_vals = np.abs(obs_vals - prd_vals)
                indexes = np.argwhere(abs_diff_vals >= min_deviation)
                outliers[variable] = indexes

            # Create the file of outliers
            for (variable, min_deviation) in self.deviation_variables:
                outlier_list = outliers[variable]
                for index in outlier_list:
                    obs_row = obs_data[index]
                    prd_row = prd_data[index]
                    id = getattr(obs_row, self.id_field)
                    obs_val = getattr(obs_row, variable)
                    prd_val = getattr(prd_row, variable)
                    diff_val = obs_val - prd_val
                    out_data = [
                        '%d' % id,
                        '%s' % prd_type.upper(),
                        '%s' % variable,
                        '%.4f' % obs_val,
                        '%.4f' % prd_val,
                        '%.4f' % diff_val,
                    ]
                    out_fh.write(','.join(out_data) + '\n')

        # Clean up
        out_fh.close()
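
A minimal sketch of the deviation screen used above, run on toy arrays; the attribute name and threshold are illustrative rather than taken from the project:

    import numpy as np

    # (attribute, minimum absolute deviation) pairs, as in self.deviation_variables
    deviation_variables = [('BASAL_AREA', 10.0)]

    obs_vals = np.array([12.0, 30.0, 5.0, 44.0])
    prd_vals = np.array([11.0, 55.0, 5.5, 20.0])

    # Absolute observed-predicted differences: [1.0, 25.0, 0.5, 24.0]
    abs_diff_vals = np.abs(obs_vals - prd_vals)

    # Rows 1 and 3 exceed the 10.0 threshold and would be written as outliers
    indexes = np.argwhere(abs_diff_vals >= 10.0)
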
    def _create_scatterplots(self):

        # Open files into recarrays
        obs_data = utilities.csv2rec(self.observed_file)
        prd_data = utilities.csv2rec(self.predicted_file)

        # Subset the obs_data to just those IDs in the predicted data
        ids1 = getattr(obs_data, self.id_field)
        ids2 = getattr(prd_data, self.id_field)
        common_ids = np.in1d(ids1, ids2)
        obs_data = obs_data[common_ids]

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Subset the attributes to those that are continuous, are accuracy
        # attributes, are identified to go into the report, and are not
        # species variables
        attrs = []
        for attr in mp.attributes:
            if (
                attr.field_type == "CONTINUOUS"
                and attr.project_attr == 1
                and attr.accuracy_attr == 1
                and attr.species_attr == 0
            ):
                attrs.append(attr.field_name)

        # Iterate over the attributes and create a scatterplot file of each
        scatter_files = []
        for attr in attrs:

            # Metadata for this attribute
            metadata = mp.get_attribute(attr)

            # Observed and predicted data matrices for this attribute
            obs_vals = getattr(obs_data, attr)
            prd_vals = getattr(prd_data, attr)

            # Create the output file name
            output_file = attr.lower() + "_scatter.png"

            # Create the scatterplot
            mplf.draw_scatterplot(obs_vals, prd_vals, metadata, output_type=mplf.FILE, output_file=output_file)

            # Add this to the list of scatterplot files
            scatter_files.append(output_file)

        # Return the list of scatterplots just created
        return scatter_files
    def _create_scatterplots(self):

        # Open files into recarrays
        obs_data = utilities.csv2rec(self.observed_file)
        prd_data = utilities.csv2rec(self.predicted_file)

        # Subset the obs_data to just those IDs in the predicted data
        ids1 = getattr(obs_data, self.id_field)
        ids2 = getattr(prd_data, self.id_field)
        common_ids = np.in1d(ids1, ids2)
        obs_data = obs_data[common_ids]

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Subset the attributes to those that are continuous, are accuracy
        # attributes, are identified to go into the report, and are not
        # species variables
        attrs = []
        for attr in mp.attributes:
            if attr.field_type == 'CONTINUOUS' and attr.project_attr == 1 and \
                    attr.accuracy_attr == 1 and attr.species_attr == 0:
                attrs.append(attr.field_name)

        # Iterate over the attributes and create a scatterplot file of each
        scatter_files = []
        for attr in attrs:

            # Metadata for this attribute
            metadata = mp.get_attribute(attr)

            # Observed and predicted data matrices for this attribute
            obs_vals = getattr(obs_data, attr)
            prd_vals = getattr(prd_data, attr)

            # Create the output file name
            output_file = attr.lower() + '_scatter.png'

            # Create the scatterplot
            mplf.draw_scatterplot(obs_vals,
                                  prd_vals,
                                  metadata,
                                  output_type=mplf.FILE,
                                  output_file=output_file)

            # Add this to the list of scatterplot files
            scatter_files.append(output_file)

        # Return the list of scatterplots just created
        return scatter_files
Example #8
    def _get_attribute_data(self):
        """
        Retrieve the attribute data for which predictions will be made.  This
        should be called one time, after which it is stored in an instance-
        level variable along with the attribute names (self.attrs)

        Parameters
        ----------
        None

        Returns
        -------
        stand_attr_data : numpy recarray
            Recarray with all stand attributes

        attrs : list of strs
            List of all continuous variables in stand_attr_data
        """

        # Get the stand attribute table and read into a recarray
        p = self.parameter_parser
        stand_attr_file = p.stand_attribute_file
        stand_attr_data = utilities.csv2rec(stand_attr_file)

        # Get the stand attribute metadata and retrieve only the
        # continuous accuracy attributes
        stand_metadata_file = p.stand_metadata_file
        mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
        attrs = [x.field_name for x in mp.attributes
            if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1]

        return (stand_attr_data, attrs)
 def setUp(self):
     self.data_fn = 'data/vegclass.csv'
     self.classifier_fn = 'data/vegclass.xml'
     self.e_ref = 'data/vegclass_errmat.csv'
     self.k_ref = 'data/vegclass_kappa.csv'
     self.data = utilities.csv2rec(self.data_fn)
     self.classifier = ca.Classifier.from_xml(self.classifier_fn)
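
The fixture above pairs a vegclass data file with reference kappa and error-matrix outputs. One way those references might be exercised is through the classification_accuracy wrapper shown elsewhere on this page; a hedged sketch in which the temporary paths and file comparison are illustrative, not taken from the source:

    import filecmp
    import os
    import tempfile

    def test_classification_accuracy(self):
        # Write kappa and error-matrix results to temporary files
        tmp_dir = tempfile.mkdtemp()
        k_out = os.path.join(tmp_dir, 'vegclass_kappa.csv')
        e_out = os.path.join(tmp_dir, 'vegclass_errmat.csv')
        ca.classification_accuracy(
            self.data_fn, self.classifier_fn,
            kappa_file=k_out, err_matrix_file=e_out)

        # Compare the generated files against the stored reference files
        self.assertTrue(filecmp.cmp(k_out, self.k_ref, shallow=False))
        self.assertTrue(filecmp.cmp(e_out, self.e_ref, shallow=False))
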
Example #10
    def _get_attribute_data(self):
        """
        Retrieve the attribute data for which predictions will be made.  This
        should be called one time, after which it is stored in an instance-
        level variable along with the attribute names (self.attrs)

        Parameters
        ----------
        None

        Returns
        -------
        stand_attr_data : numpy recarray
            Recarray with all stand attributes

        attrs : list of strs
            List of all continuous variables in stand_attr_data
        """

        # Get the stand attribute table and read into a recarray
        p = self.parameter_parser
        stand_attr_file = p.stand_attribute_file
        stand_attr_data = utilities.csv2rec(stand_attr_file)

        # Get the stand attribute metadata and retrieve only the
        # continuous accuracy attributes
        stand_metadata_file = p.stand_metadata_file
        mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
        attrs = [
            x.field_name for x in mp.attributes
            if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1
        ]

        return (stand_attr_data, attrs)
def classification_accuracy(input_file, classifier_file, kappa_file=None,
        err_matrix_file=None, observed_column='OBSERVED',
        predicted_column='PREDICTED'):
    """
    Wrapper function to read in a plot-by-classification file
    of observed and predicted values and a classifier XML file
    and return output kappa statistics and error matrix for a
    given variable

    Parameters
    ----------
    input_file : file
        The input file (comma-separated-value format) with the
        observed and predicted classified values for all plots.
        The file must have a header line with column names.
        Specify the names for the observed and predicted
        columns using the 'observed_column' and 'predicted_column'
        keyword parameters.

    classifier_file : file
        An XML file that describes the variable classification
        including information on fuzzy sets.  This file must
        validate against 'classifier.xsd'.

    kappa_file : file
        Output file to hold kappa and fuzzy kappa statistics.
        Defaults to None (i.e. not output).

    err_matrix_file : file
        Output file to hold error matrix statistics.
        Defaults to None (i.e. not output).

    observed_column : string
        The name of the observed column in the input_file.
        Defaults to 'OBSERVED'

    predicted_column : string
        The name of the predicted column in the input_file.
        Defaults to 'PREDICTED'

    Returns
    -------
    None
    """

    # Read in the raw input file
    csv = utilities.csv2rec(input_file)
    obs_data = csv[observed_column]
    prd_data = csv[predicted_column]

    # Read in the classification
    c = Classifier.from_xml(classifier_file)

    # Print classification kappas
    if kappa_file is not None:
        print_kappa_file(obs_data, prd_data, c, kappa_file)

    # Print classification error matrix
    if err_matrix_file is not None:
        print_error_matrix_file(obs_data, prd_data, c, err_matrix_file)
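
A minimal usage sketch, assuming an input CSV like the vegclass file written in the first example (an ID column plus OBSERVED and PREDICTED columns) and a matching classifier XML; the paths and the FCID column name are illustrative:

    # vegclass.csv is expected to look roughly like:
    #   FCID,OBSERVED,PREDICTED
    #   1001,3,4
    #   1002,7,7
    classification_accuracy(
        'vegclass.csv',
        'vegclass.xml',
        kappa_file='vegclass_kappa.csv',
        err_matrix_file='vegclass_errmatrix.csv')
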
    def run_diagnostic(self):

        # Shortcut to the parameter parser
        p = self.parameter_parser

        # Read in the validation plots file
        validation_plots = utilities.csv2rec(self.observed_file)

        # Create a dictionary of plot ID to image year for these plots
        id_x_year = \
            dict((x[self.id_field], x.IMAGE_YEAR) for x in validation_plots)

        # Create a PredictionRun instance
        pr = prediction_run.PredictionRun(p)

        # Get the neighbors and distances for these IDs
        pr.calculate_neighbors_at_ids(id_x_year, id_field=self.id_field)

        # Retrieve the predicted data for these plots.  In essence, we can
        # retrieve the dependent neighbors because these plot IDs are
        # guaranteed not to be in the model
        prediction_generator = pr.calculate_predictions_at_k(k=p.k,
            id_field=self.id_field, independent=False)

        # Open the predicted file and write out the field names
        out_fh = open(self.predicted_file, 'w')
        out_fh.write(self.id_field + ',' + ','.join(pr.attrs) + '\n')

        # Write out the predictions
        for plot_prediction in prediction_generator:

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, out_fh)

        # Close this file
        out_fh.close()

        # Run the LocalAccuracyDiagnostic on these files
        d = lad.LocalAccuracyDiagnostic(
            observed_file=self.observed_file,
            independent_predicted_file=self.predicted_file,
            stand_metadata_file=self.stand_metadata_file,
            local_accuracy_file=self.local_accuracy_file,
            id_field=self.id_field
        )
        d.run_diagnostic()

        # Run the VegetationClassDiagnostic on these files
        d = vcd.VegetationClassDiagnostic(
            observed_file=self.observed_file,
            independent_predicted_file=self.predicted_file,
            vegclass_file=self.vegclass_file,
            vegclass_kappa_file=self.vegclass_kappa_file,
            vegclass_errmatrix_file=self.vegclass_errmatrix_file,
            id_field=self.id_field,
        )
        d.run_diagnostic()
    def run_diagnostic(self):

        # Shortcut to the parameter parser
        p = self.parameter_parser

        # Read in the validation plots file
        validation_plots = utilities.csv2rec(self.observed_file)

        # Create a dictionary of plot ID to image year for these plots
        id_x_year = \
            dict((x[self.id_field], x.IMAGE_YEAR) for x in validation_plots)

        # Create a PredictionRun instance
        pr = prediction_run.PredictionRun(p)

        # Get the neighbors and distances for these IDs
        pr.calculate_neighbors_at_ids(id_x_year, id_field=self.id_field)

        # Retrieve the predicted data for these plots.  In essence, we can
        # retrieve the dependent neighbors because these plot IDs are
        # guaranteed not to be in the model
        prediction_generator = pr.calculate_predictions_at_k(
            k=p.k, id_field=self.id_field, independent=False)

        # Open the predicted file and write out the field names
        out_fh = open(self.predicted_file, 'w')
        out_fh.write(self.id_field + ',' + ','.join(pr.attrs) + '\n')

        # Write out the predictions
        for plot_prediction in prediction_generator:

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, out_fh)

        # Close this file
        out_fh.close()

        # Run the LocalAccuracyDiagnostic on these files
        d = lad.LocalAccuracyDiagnostic(
            observed_file=self.observed_file,
            independent_predicted_file=self.predicted_file,
            stand_metadata_file=self.stand_metadata_file,
            local_accuracy_file=self.local_accuracy_file,
            id_field=self.id_field)
        d.run_diagnostic()

        # Run the VegetationClassDiagnostic on these files
        d = vcd.VegetationClassDiagnostic(
            observed_file=self.observed_file,
            independent_predicted_file=self.predicted_file,
            vegclass_file=self.vegclass_file,
            vegclass_kappa_file=self.vegclass_kappa_file,
            vegclass_errmatrix_file=self.vegclass_errmatrix_file,
            id_field=self.id_field,
        )
        d.run_diagnostic()
    def run_diagnostic(self):

        # Read in the dependent nn_index_file
        in_data = utilities.csv2rec(self.nn_index_file)

        # Subset the observed data to just those values above the
        # index threshold
        in_data = in_data[in_data.AVERAGE_POSITION >= self.index_threshold]

        # Write out the resulting recarray
        utilities.rec2csv(in_data, self.nn_index_outlier_file)
Example #15
 def list_points(self):
     points = []
     if self.domain == 'list':
         child_elem = (self.domain_element.getchildren())[0]
         if child_elem.tag == 'points':
             points = \
                 [(point.x, point.y) for point in child_elem.getchildren()]
         else:
             recs = utilities.csv2rec(str(child_elem))
             points = [(point.X, point.Y) for point in recs]
     return points
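
A hedged illustration of the two list-domain inputs this method appears to accept, inferred from the attribute access above; the XML schema described in the comment is an assumption:

    import numpy as np

    # Inline form (assumed): the first child of the domain element is <points>,
    # holding <point> children whose x and y sub-elements are read as
    # point.x / point.y.
    #
    # File form: the first child's text is a CSV path whose records expose X
    # and Y columns, mimicked here with a small recarray:
    recs = np.rec.fromrecords(
        [(512345.0, 4890123.0), (513345.0, 4891123.0)], names=('X', 'Y'))
    points = [(point.X, point.Y) for point in recs]
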
Example #16
    def load_outliers(self):
        p = self.parameter_parser

        #read in outlier files and push results to DB
        for d in p.outlier_diagnostics:
            outlier_diag = (self.diagnostic_type[d])(p)
            outlier_file = outlier_diag.get_outlier_filename()
            outlier_formatter = (self.outlier_formatter[d])(p)
            out_rec = utilities.csv2rec(outlier_file)
            if out_rec is not None:
                outlier_formatter.load_outliers(out_rec)
    def run_diagnostic(self):

        # Read in the dependent nn_index_file
        in_data = utilities.csv2rec(self.nn_index_file)

        # Subset the observed data to just those values above the
        # index threshold
        in_data = in_data[in_data.AVERAGE_POSITION >= self.index_threshold]

        # Write out the resulting recarray
        utilities.rec2csv(in_data, self.nn_index_outlier_file)
Example #18
    def create_predictions(self, no_self_assign_field='LOC_ID'):
        """
        Creates model predictions and zonal pixel files from independent
        predictions, i.e. plots are not able to use themselves (or other
        'dependent' plots) as neighbors

        Parameters
        ----------
        no_self_assign_field : str
            ID field at which no self assignment is allowed.
            Defaults to LOC_ID

        Returns
        -------
        None
        """

        # Aliases
        p = self.parameter_parser
        pr = self.prediction_run

        # Create a dictionary between id_field and no_self_assign_field
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)
        nsaf = no_self_assign_field
        nsa_id_dict = dict(
            (getattr(x, self.id_field), getattr(x, nsaf)) for x in env_data)

        # Open the prediction files
        zonal_pixel_file = p.independent_zonal_pixel_file
        predicted_file = p.independent_predicted_file
        zonal_pixel_fh, predicted_fh = \
            self.open_prediction_files(zonal_pixel_file, predicted_file)

        # Create a generator for each ID in pr.neighbor_data
        prediction_generator = pr.calculate_predictions_at_k(
            k=p.k,
            id_field=self.id_field,
            independent=True,
            nsa_id_dict=nsa_id_dict)

        # Iterate over each prediction writing them out to the zonal pixel
        # and predicted attribute files
        for plot_prediction in prediction_generator:

            # Write this record to the zonal pixel file
            pr.write_zonal_pixel_record(plot_prediction, zonal_pixel_fh)

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, predicted_fh)

        # Close files
        zonal_pixel_fh.close()
        predicted_fh.close()
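
A hedged illustration of the no-self-assignment lookup built above: it maps the prediction ID field to LOC_ID so that a plot, and any plot sharing its location, is excluded from its own neighbor set. The IDs are made up:

    # id_field value (e.g. a plot ID) -> LOC_ID; plots sharing a LOC_ID are
    # treated as dependent and skipped when making independent predictions
    nsa_id_dict = {
        1001: 50001,
        1002: 50001,   # co-located with 1001, so neither may use the other
        1003: 50002,
    }
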
 def list_points(self):
     points = []
     if self.domain == 'list':
         child_elem = (self.domain_element.getchildren())[0]
         if child_elem.tag == 'points':
             points = \
                 [(point.x, point.y) for point in child_elem.getchildren()]
         else:
             recs = utilities.csv2rec(str(child_elem))
             points = [(point.X, point.Y) for point in recs]
     return points
    def run_diagnostic(self):

        # Open the outlier file and write the header line
        vc_outlier_fh = open(self.vegclass_outlier_file, 'w')
        header_fields = (self.id_field, 'PREDICTION_TYPE', 'OBSERVED_VEGCLASS',
                         'PREDICTED_VEGCLASS', 'OUTLIER_TYPE')
        vc_outlier_fh.write(','.join(header_fields) + '\n')

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:

            # Read the observed and predicted files into numpy recarrays
            obs = utilities.csv2rec(self.observed_file)
            prd = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file
            obs_keep = np.in1d(getattr(obs, self.id_field),
                               getattr(prd, self.id_field))
            obs = obs[obs_keep]

            # Calculate VEGCLASS for both the observed and predicted data
            vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

            # Find the outliers
            outliers = self.find_vegclass_outliers(vc_dict)

            # Print out the outliers
            for outlier in outliers:
                (id, obs_vc, prd_vc, outlier_type) = outlier
                out_fields = (
                    '%d' % id,
                    '%s' % prd_type.upper(),
                    '%d' % obs_vc,
                    '%d' % prd_vc,
                    '%s' % outlier_type,
                )
                vc_outlier_fh.write(','.join(out_fields) + '\n')

        vc_outlier_fh.close()
    def _create_histograms(self):

        # Open the area estimate file into a recarray
        ae_data = utilities.csv2rec(self.regional_accuracy_file)

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Subset the attributes to those that are accuracy attributes,
        # are identified to go into the report, and are not species variables
        attrs = []
        for attr in mp.attributes:
            if attr.accuracy_attr == 1 and attr.project_attr == 1 and \
                    attr.species_attr == 0:
                attrs.append(attr.field_name)

        # Iterate over the attributes and create a histogram file of each
        histogram_files = []
        for attr in attrs:

            # Metadata for this attribute
            metadata = mp.get_attribute(attr)

            # Get the observed and predicted data for this attribute
            obs_vals = self._get_histogram_data(ae_data, attr, 'OBSERVED')
            prd_vals = self._get_histogram_data(ae_data, attr, 'PREDICTED')

            # Set the areas for the observed and predicted data
            obs_area = obs_vals.AREA
            prd_area = prd_vals.AREA

            # Set the bin names (same for both observed and predicted series)
            bin_names = obs_vals.BIN_NAME
            if np.any(bin_names != prd_vals.BIN_NAME):
                err_msg = 'Bin names are not the same for ' + attr
                raise ValueError(err_msg)

            # Create the output file name
            output_file = attr.lower() + '_histogram.png'

            # Create the histogram
            mplf.draw_histogram([obs_area, prd_area],
                                bin_names,
                                metadata,
                                output_type=mplf.FILE,
                                output_file=output_file)

            # Add this to the list of histogram files
            histogram_files.append(output_file)

        # Return the list of histograms just created
        return histogram_files
Example #22
    def create_predictions(self, no_self_assign_field='LOC_ID'):
        """
        Creates model predictions and zonal pixel files from independent
        predictions, i.e. plots are not able to use themselves (or other
        'dependent' plots) as neighbors

        Parameters
        ----------
        no_self_assign_field : str
            ID field at which no self assignment is allowed.
            Defaults to LOC_ID

        Returns
        -------
        None
        """

        # Aliases
        p = self.parameter_parser
        pr = self.prediction_run

        # Create a dictionary between id_field and no_self_assign_field
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)
        nsaf = no_self_assign_field
        nsa_id_dict = dict((getattr(x, self.id_field), getattr(x, nsaf))
            for x in env_data)

        # Open the prediction files
        zonal_pixel_file = p.independent_zonal_pixel_file
        predicted_file = p.independent_predicted_file
        zonal_pixel_fh, predicted_fh = \
            self.open_prediction_files(zonal_pixel_file, predicted_file)

        # Create a generator for each ID in pr.neighbor_data
        prediction_generator = pr.calculate_predictions_at_k(
            k=p.k, id_field=self.id_field, independent=True,
            nsa_id_dict=nsa_id_dict)

        # Iterate over each prediction writing them out to the zonal pixel
        # and predicted attribute files
        for plot_prediction in prediction_generator:

            # Write this record to the zonal pixel file
            pr.write_zonal_pixel_record(plot_prediction, zonal_pixel_fh)

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, predicted_fh)

        # Close files
        zonal_pixel_fh.close()
        predicted_fh.close()
    def run_diagnostic(self):

        # Open the stand attribute file and subset to just positive IDs
        attr_data = utilities.csv2rec(self.stand_attr_file)
        cond = np.where(getattr(attr_data, self.id_field) > 0)
        attr_data = attr_data[cond]

        # Create a simple dictionary of ID to vegetation class from the
        # attr_data
        vc_dict = dict((getattr(x, self.id_field), getattr(x, 'VEGCLASS'))
            for x in attr_data)

        # Open the output file and write the header
        out_fh = open(self.output_file, 'w')
        out_fh.write('%s,PREDICTION_TYPE\n' % self.id_field)

        # Run this for both independent and dependent predictions
        for (prd_type, zp_file) in self.zonal_pixel_files:

            # Open the zonal pixel file
            zonal_data = utilities.csv2rec(zp_file)

            # For each ID in zonal_data, retrieve the vegetation class of its
            # neighbors
            ids = getattr(attr_data, self.id_field)
            for id in ids:
                cond = np.where(getattr(zonal_data, self.id_field) == id)
                zonal_records = zonal_data[cond]
                vc_records = [vc_dict[x] for x in zonal_records.NEIGHBOR_ID]

                # Apply the logic for the variety
                outlier = self.calculate_vc_variety(vc_records)

                if outlier:
                    out_fh.write('%d,%s\n' % (id, prd_type.upper()))

        # Clean up
        out_fh.close()
Example #24
    def run_diagnostic(self):

        # Open the stand attribute file and subset to just positive IDs
        attr_data = utilities.csv2rec(self.stand_attr_file)
        cond = np.where(getattr(attr_data, self.id_field) > 0)
        attr_data = attr_data[cond]

        # Create a simple dictionary of ID to vegetation class from the
        # attr_data
        vc_dict = dict((getattr(x, self.id_field), getattr(x, 'VEGCLASS'))
                       for x in attr_data)

        # Open the output file and write the header
        out_fh = open(self.output_file, 'w')
        out_fh.write('%s,PREDICTION_TYPE\n' % self.id_field)

        # Run this for both independent and dependent predictions
        for (prd_type, zp_file) in self.zonal_pixel_files:

            # Open the zonal pixel file
            zonal_data = utilities.csv2rec(zp_file)

            # For each ID in zonal_data, retrieve the vegetation class of its
            # neighbors
            ids = getattr(attr_data, self.id_field)
            for id in ids:
                cond = np.where(getattr(zonal_data, self.id_field) == id)
                zonal_records = zonal_data[cond]
                vc_records = [vc_dict[x] for x in zonal_records.NEIGHBOR_ID]

                # Apply the logic for the variety
                outlier = self.calculate_vc_variety(vc_records)

                if outlier:
                    out_fh.write('%d,%s\n' % (id, prd_type.upper()))

        # Clean up
        out_fh.close()
    def _create_histograms(self):

        # Open the area estimate file into a recarray
        ae_data = utilities.csv2rec(self.regional_accuracy_file)

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Subset the attributes to those that are accuracy attributes,
        # are identified to go into the report, and are not species variables
        attrs = []
        for attr in mp.attributes:
            if attr.accuracy_attr == 1 and attr.project_attr == 1 and \
                    attr.species_attr == 0:
                attrs.append(attr.field_name)

        # Iterate over the attributes and create a histogram file of each
        histogram_files = []
        for attr in attrs:

            # Metadata for this attribute
            metadata = mp.get_attribute(attr)

            # Get the observed and predicted data for this attribute
            obs_vals = self._get_histogram_data(ae_data, attr, 'OBSERVED')
            prd_vals = self._get_histogram_data(ae_data, attr, 'PREDICTED')

            # Set the areas for the observed and predicted data
            obs_area = obs_vals.AREA
            prd_area = prd_vals.AREA

            # Set the bin names (same for both observed and predicted series)
            bin_names = obs_vals.BIN_NAME
            if np.any(bin_names != prd_vals.BIN_NAME):
                err_msg = 'Bin names are not the same for ' + attr
                raise ValueError(err_msg)

            # Create the output file name
            output_file = attr.lower() + '_histogram.png'

            # Create the histogram
            mplf.draw_histogram([obs_area, prd_area], bin_names, metadata,
                output_type=mplf.FILE, output_file=output_file)

            # Add this to the list of histogram files
            histogram_files.append(output_file)

        # Return the list of histograms just created
        return histogram_files
Example #26
    def calculate_neighbors_cross_validation(self):
        """
        Wrapper around get_predicted_neighbors_at_ids optimized for cross-
        validation (i.e. using plots that went into model development).

        Parameters
        ----------
        None

        Returns
        -------
        None
        """

        # Alias for self.parameter_parser
        p = self.parameter_parser

        # ID field
        id_field = p.summary_level + 'ID'

        # Get the environmental matrix file and read the plot IDs
        # and image years into a dictionary
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)

        # Associate each plot with a model year; this is either the year the
        # model is associated with (for models that use imagery), or the
        # model_year (for models that don't use imagery)
        if p.model_type in p.imagery_model_types:
            id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in env_data)
        else:
            id_x_year = dict((x[id_field], p.model_year) for x in env_data)

        # Subset the plot ID list down to just those plots that went into
        # imputation.  This may be a subset of the plots that are in the
        # environmental matrix file based on running GNN in a unique way.
        # This requires parsing the model and extracting just the plot IDs
        ord_file = p.get_ordination_file()
        lop = lemma_ordination_parser.LemmaOrdinationParser()
        ord_model = lop.parse(ord_file, delimiter=',')
        plot_ids = ord_model.plot_ids
        id_x_year = dict(
            (i, id_x_year[i]) for i in id_x_year.keys() if i in plot_ids)

        # Call the main function
        self.calculate_neighbors_at_ids(id_x_year, id_field=id_field)
Example #27
    def calculate_neighbors_cross_validation(self):
        """
        Wrapper around get_predicted_neighbors_at_ids optimized for cross-
        validation (i.e. using plots that went into model development).

        Parameters
        ----------
        None

        Returns
        -------
        None
        """

        # Alias for self.parameter_parser
        p = self.parameter_parser

        # ID field
        id_field = p.summary_level + 'ID'

        # Get the environmental matrix file and read the plot IDs
        # and image years into a dictionary
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)

        # Associate each plot with a model year; this is either the year the
        # model is associated with (for models that use imagery), or the
        # model_year (for models that don't use imagery)
        if p.model_type in p.imagery_model_types:
            id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in env_data)
        else:
            id_x_year = dict((x[id_field], p.model_year) for x in env_data)

        # Subset the plot ID list down to just those plots that went into
        # imputation.  This may be a subset of the plots that are in the
        # environmental matrix file based on running GNN in a unique way.
        # This requires parsing the model and extracting just the plot IDs
        ord_file = p.get_ordination_file()
        lop = lemma_ordination_parser.LemmaOrdinationParser()
        ord_model = lop.parse(ord_file, delimiter=',')
        plot_ids = ord_model.plot_ids
        id_x_year = dict(
            (i, id_x_year[i]) for i in id_x_year.keys() if i in plot_ids)

        # Call the main function
        self.calculate_neighbors_at_ids(id_x_year, id_field=id_field)
    def get_observed_estimates(self):

        # Read the area estimate file into a recarray
        obs_data = utilities.csv2rec(self.area_estimate_file)

        # Get the nonforest hectares (coded as -10001)
        nf_row = obs_data[getattr(obs_data, self.id_field) == -10001][0]
        nf_hectares = nf_row.HECTARES

        # Get the nonsampled hectares (coded as -10002)
        ns_row = obs_data[getattr(obs_data, self.id_field) == -10002][0]
        ns_hectares = ns_row.HECTARES

        # Remove these rows from the recarray
        obs_data = obs_data[getattr(obs_data, self.id_field) > 0]

        # Return this information
        return obs_data, nf_hectares, ns_hectares
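
A hedged sketch of the table shape get_observed_estimates appears to assume: an ID column, a HECTARES column, sentinel rows for nonforest (-10001) and nonsampled (-10002) area, and positive IDs for forested records. The FCID column name and the values are illustrative:

    import numpy as np

    rows = [(-10001, 1250.5),   # nonforest hectares
            (-10002,   87.3),   # nonsampled hectares
            (1001,     12.4),
            (1002,      9.8)]
    obs_data = np.rec.fromrecords(rows, names=('FCID', 'HECTARES'))

    nf_hectares = obs_data[obs_data.FCID == -10001][0].HECTARES
    ns_hectares = obs_data[obs_data.FCID == -10002][0].HECTARES
    obs_data = obs_data[obs_data.FCID > 0]   # keep only the forested IDs
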
Example #29
    def get_observed_estimates(self):

        # Read the area estimate file into a recarray
        obs_data = utilities.csv2rec(self.area_estimate_file)

        # Get the nonforest hectares (coded as -10001)
        nf_row = obs_data[getattr(obs_data, self.id_field) == -10001][0]
        nf_hectares = nf_row.HECTARES

        # Get the nonsampled hectares (coded as -10002)
        ns_row = obs_data[getattr(obs_data, self.id_field) == -10002][0]
        ns_hectares = ns_row.HECTARES

        # Remove these rows from the recarray
        obs_data = obs_data[getattr(obs_data, self.id_field) > 0]

        # Return this information
        return obs_data, nf_hectares, ns_hectares
    def get_predicted_estimates(self):

        # Read in the predicted raster
        ds = gdal.Open(self.predicted_raster, gdalconst.GA_ReadOnly)
        rat = ds.GetRasterBand(1).GetDefaultRAT()

        # Get the cell area for converting from pixel counts to hectares
        gt = ds.GetGeoTransform()
        cell_area = gt[1] * gt[1]

        # Get the IDs and counts (converted to hectares)
        id_recs = []
        nf_hectares = 0
        for i in range(rat.GetRowCount()):
            id = rat.GetValueAsInt(i, 0)
            hectares = rat.GetValueAsInt(i, 1) * cell_area / 10000.0
            if id <= 0:
                nf_hectares += hectares
            else:
                id_recs.append((id, hectares))

        # Release the dataset
        ds = None

        # Convert this to a recarray
        names = (self.id_field, 'HECTARES')
        ids = np.rec.fromrecords(id_recs, names=names)

        # Read in the attribute file
        sad = utilities.csv2rec(self.stand_attribute_file)

        # Ensure that all IDs in the id_count_dict are in the attribute data
        ids_1 = getattr(ids, self.id_field)
        ids_2 = getattr(sad, self.id_field)
        if not np.all(np.in1d(ids_1, ids_2)):
            err_msg = 'Not all values in the raster are present in the '
            err_msg += 'attribute data'
            raise ValueError(err_msg)

        # Join the two recarrays together
        predicted_data = mlab.rec_join(self.id_field, ids, sad)
        return (predicted_data, nf_hectares)
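
A short worked check of the pixel-count-to-hectares conversion used above; gt[1] * gt[1] as the cell area assumes square pixels, illustrated here with 30 m pixels:

    pixel_size = 30.0                              # gt[1], in metres
    cell_area = pixel_size * pixel_size            # 900 m^2 per pixel
    pixel_count = 1234                             # count from the raster attribute table
    hectares = pixel_count * cell_area / 10000.0   # 111.06 ha
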
Example #31
    def get_predicted_estimates(self):

        # Read in the predicted raster
        ds = gdal.Open(self.predicted_raster, gdalconst.GA_ReadOnly)
        rat = ds.GetRasterBand(1).GetDefaultRAT()

        # Get the cell area for converting from pixel counts to hectares
        gt = ds.GetGeoTransform()
        cell_area = gt[1] * gt[1]

        # Get the IDs and counts (converted to hectares)
        id_recs = []
        nf_hectares = 0
        for i in range(rat.GetRowCount()):
            id = rat.GetValueAsInt(i, 0)
            hectares = rat.GetValueAsInt(i, 1) * cell_area / 10000.0
            if id <= 0:
                nf_hectares += hectares
            else:
                id_recs.append((id, hectares))

        # Release the dataset
        ds = None

        # Convert this to a recarray
        names = (self.id_field, 'HECTARES')
        ids = np.rec.fromrecords(id_recs, names=names)

        # Read in the attribute file
        sad = utilities.csv2rec(self.stand_attribute_file)

        # Ensure that all IDs in the id_count_dict are in the attribute data
        ids_1 = getattr(ids, self.id_field)
        ids_2 = getattr(sad, self.id_field)
        if not np.all(np.in1d(ids_1, ids_2)):
            err_msg = 'Not all values in the raster are present in the '
            err_msg += 'attribute data'
            raise ValueError(err_msg)

        # Join the two recarrays together
        predicted_data = mlab.rec_join(self.id_field, ids, sad)
        return (predicted_data, nf_hectares)
    def _create_story(self):

        # Set up an empty list to hold the story
        story = []

        # Import the report styles
        styles = report_styles.get_report_styles()

        # Create a page break
        story = self._make_page_break(story, self.PORTRAIT)

        # Section title
        title_str = '<strong>Local-Scale Accuracy Assessment:<br/>'
        title_str += 'Species Accuracy at Plot Locations'
        title_str += '</strong>'

        para = p.Paragraph(title_str, styles['section_style'])
        t = p.Table([[para]], colWidths=[7.5 * u.inch])
        t.setStyle(
            p.TableStyle([
                ('TOPPADDING', (0, 0), (-1, -1), 6),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
                ('BACKGROUND', (0, 0), (-1, -1), '#957348'),
                ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
            ]))
        story.append(t)
        story.append(p.Spacer(0, 0.2 * u.inch))

        # Kappa explanation
        kappa_str = '''
            Cohen's kappa coefficient (Cohen, 1960) is a statistical measure
            of reliability, accounting for agreement occurring by chance.  
            The equation for kappa is: 
        '''
        para = p.Paragraph(kappa_str, styles['body_style'])
        story.append(para)
        story.append(p.Spacer(0, 0.05 * u.inch))

        kappa_str = '''
           kappa = (Pr(a) - Pr(e)) / (1.0 - Pr(e))
        '''
        para = p.Paragraph(kappa_str, styles['indented'])
        story.append(para)
        story.append(p.Spacer(0, 0.05 * u.inch))

        kappa_str = '''
            where Pr(a) is the relative observed agreement among
            raters, and Pr(e) is the probability that agreement is
            due to chance.<br/><br/>

            <strong>Abbreviations Used:</strong><br/>
            OP/PP = Observed Present / Predicted Present<br/>
            OA/PP = Observed Absent / Predicted Present
            (errors of commission)<br/>
            OP/PA = Observed Present / Predicted Absent
            (errors of omission)<br/>
            OA/PA = Observed Absent / Predicted Absent
        '''
        para = p.Paragraph(kappa_str, styles['body_style'])
        story.append(para)
        story.append(p.Spacer(0, 0.2 * u.inch))

        # Create a list of lists to hold the species accuracy information
        species_table = []

        # Header row
        header_row = []

        spp_str = '<strong>Species PLANTS Code<br/>'
        spp_str += 'Scientific Name / Common Name</strong>'
        para = p.Paragraph(spp_str, styles['body_style_10'])
        header_row.append(para)

        spp_str = '<strong>Species prevalence</strong>'
        para = p.Paragraph(spp_str, styles['body_style_10'])
        header_row.append(para)

        p1 = p.Paragraph('<strong>OP/PP</strong>',
            styles['body_style_10_right'])
        p2 = p.Paragraph('<strong>OP/PA</strong>',
            styles['body_style_10_right'])
        p3 = p.Paragraph('<strong>OA/PP</strong>',
            styles['body_style_10_right'])
        p4 = p.Paragraph('<strong>OA/PA</strong>',
            styles['body_style_10_right'])
        header_cells = [[p1, p2], [p3, p4]]
        t = p.Table(header_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
        t.setStyle(
            p.TableStyle([
                ('GRID', (0, 0), (-1, -1), 1, colors.white),
                ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ]))
        header_row.append(t)

        kappa_str = '<strong>Kappa coefficient</strong>'
        para = p.Paragraph(kappa_str, styles['body_style_10'])
        header_row.append(para)
        species_table.append(header_row)

        # Open the species accuracy file into a recarray
        spp_data = utilities.csv2rec(self.species_accuracy_file)

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Read in the report metadata if it exists
        if self.report_metadata_file:
            rmp = xrmp.XMLReportMetadataParser(self.report_metadata_file)
        else:
            rmp = None

        # Subset the attributes to just species
        attrs = []
        for attr in mp.attributes:
            if attr.species_attr == 1 and 'NOTALY' not in attr.field_name:
                attrs.append(attr.field_name)

        # Iterate over the species and print out the statistics
        for spp in attrs:

            # Empty row to hold the formatted output
            species_row = []

            # Get the scientific and common names from the report metadata
            # if it exists; otherwise, just use the species symbol
            if rmp is not None:

                # Strip off any suffix if it exists
                try:
                    spp_plain = spp.split('_')[0]
                    spp_info = rmp.get_species(spp_plain)
                    spp_str = spp_info.spp_symbol + '<br/>'
                    spp_str += spp_info.scientific_name + ' / '
                    spp_str += spp_info.common_name
                except IndexError:
                    spp_str = spp
            else:
                spp_str = spp
            para = p.Paragraph(spp_str, styles['body_style_10'])
            species_row.append(para)

            # Get the statistical information
            data = spp_data[spp_data.SPECIES == spp][0]
            counts = [data.OP_PP, data.OP_PA, data.OA_PP, data.OA_PA]
            prevalence = data.PREVALENCE
            kappa = data.KAPPA

            # Species prevalence
            prevalence_str = '%.4f' % prevalence
            para = p.Paragraph(prevalence_str, styles['body_style_10_right'])
            species_row.append(para)

            # Capture the plot counts in an inner table
            count_cells = []
            count_row = []
            for i in range(0, 4):
                para = p.Paragraph(
                    '%d' % counts[i], styles['body_style_10_right'])
                count_row.append(para)
                if i % 2 == 1:
                    count_cells.append(count_row)
                    count_row = []
            t = p.Table(count_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
            t.setStyle(
                p.TableStyle([
                    ('GRID', (0, 0), (-1, -1), 1, colors.white),
                    ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                    ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                    ('TOPPADDING', (0, 0), (-1, -1), 2),
                    ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
                ]))
            species_row.append(t)

            # Print out the kappa statistic
            kappa_str = '%.4f' % kappa
            para = p.Paragraph(kappa_str, styles['body_style_10_right'])
            species_row.append(para)

            # Push this row to the master species table
            species_table.append(species_row)

        # Style this into a reportlab table and add to the story
        col_widths = [(x * u.inch) for x in [4.0, 0.75, 1.5, 0.75]]
        t = p.Table(species_table, colWidths=col_widths)
        t.setStyle(
            p.TableStyle([
                ('BACKGROUND', (0, 0), (-1, -1), '#f1efe4'),
                ('GRID', (0, 0), (-1, -1), 2, colors.white),

                ('TOPPADDING', (0, 0), (0, -1), 2),
                ('BOTTOMPADDING', (0, 0), (0, -1), 2),
                ('LEFTPADDING', (0, 0), (0, -1), 6),
                ('RIGHTPADDING', (0, 0), (0, -1), 6),
                ('ALIGNMENT', (0, 0), (0, -1), 'LEFT'),
                ('VALIGN', (0, 0), (0, -1), 'TOP'),

                ('TOPPADDING', (1, 0), (1, -1), 2),
                ('BOTTOMPADDING', (1, 0), (1, -1), 2),
                ('LEFTPADDING', (1, 0), (1, -1), 6),
                ('RIGHTPADDING', (1, 0), (1, -1), 6),
                ('ALIGNMENT', (1, 0), (1, -1), 'RIGHT'),
                ('VALIGN', (1, 0), (1, 0), 'TOP'),
                ('VALIGN', (1, 1), (1, -1), 'MIDDLE'),

                ('TOPPADDING', (2, 0), (2, -1), 0),
                ('BOTTOMPADDING', (2, 0), (2, -1), 0),
                ('LEFTPADDING', (2, 0), (2, -1), 0),
                ('RIGHTPADDING', (2, 0), (2, -1), 0),
                ('ALIGNMENT', (2, 0), (2, -1), 'LEFT'),
                ('VALIGN', (2, 0), (2, -1), 'TOP'),

                ('TOPPADDING', (3, 0), (3, -1), 2),
                ('BOTTOMPADDING', (3, 0), (3, -1), 2),
                ('LEFTPADDING', (3, 0), (3, -1), 6),
                ('RIGHTPADDING', (3, 0), (3, -1), 6),
                ('ALIGNMENT', (3, 0), (3, -1), 'RIGHT'),
                ('VALIGN', (3, 0), (3, 0), 'TOP'),
                ('VALIGN', (3, 1), (3, -1), 'MIDDLE'),
            ]))
        story.append(t)
        story.append(p.Spacer(0, 0.1 * u.inch))

        rare_species_str = """
            Note that some very rare species do not appear in this accuracy
            report, because these species were not included when building
            the initial ordination model.  The full set of species is
            available upon request.
        """
        para = p.Paragraph(rare_species_str, styles['body_style'])
        story.append(para)

        # Return this story
        return story
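
For context, a hedged sketch of how a flowables list like the one returned by _create_story is typically rendered with ReportLab; the output file name and page size are illustrative, and the surrounding report builder is not shown on this page:

    from reportlab.lib.pagesizes import letter
    from reportlab.platypus import SimpleDocTemplate

    doc = SimpleDocTemplate('accuracy_report.pdf', pagesize=letter)
    doc.build(story)   # 'story' is the list of flowables built above
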
def classification_accuracy(input_file,
                            classifier_file,
                            kappa_file=None,
                            err_matrix_file=None,
                            observed_column='OBSERVED',
                            predicted_column='PREDICTED'):
    """
    Wrapper function to read in a plot-by-classification file
    of observed and predicted values and a classifier XML file
    and return output kappa statistics and error matrix for a
    given variable

    Parameters
    ----------
    input_file : file
        The input file (comma-separated-value format) with the
        observed and predicted classified values for all plots.
        The file must have a header line with column names.
        Specify the names for the observed and predicted
        columns using the 'observed_column' and 'predicted_column'
        keyword parameters.

    classifier_file : file
        An XML file that describes the variable classification
        including information on fuzzy sets.  This file must
        validate against 'classifier.xsd'.

    kappa_file : file
        Output file to hold kappa and fuzzy kappa statistics.
        Defaults to None (i.e. not output).

    err_matrix_file : file
        Output file to hold error matrix statistics.
        Defaults to None (i.e. not output).

    observed_column : string
        The name of the observed column in the input_file.
        Defaults to 'OBSERVED'

    predicted_column : string
        The name of the predicted column in the input_file.
        Defaults to 'PREDICTED'

    Returns
    -------
    None
    """

    # Read in the raw input file
    csv = utilities.csv2rec(input_file)
    obs_data = csv[observed_column]
    prd_data = csv[predicted_column]

    # Read in the classification
    c = Classifier.from_xml(classifier_file)

    # Print classification kappas
    if kappa_file is not None:
        print_kappa_file(obs_data, prd_data, c, kappa_file)

    # Print classification error matrix
    if err_matrix_file is not None:
        print_error_matrix_file(obs_data, prd_data, c, err_matrix_file)
Example #34
    def run_diagnostic(self):

        # Shortcut to the parameter parser
        p = self.parameter_parser

        # ID field
        id_field = p.summary_level + 'ID'

        # Root directory for Riemann files
        root_dir = p.riemann_output_folder

        # Read in hex input file
        obs_data = utilities.csv2rec(self.hex_attribute_file)

        # Get the hexagon levels and ensure that the fields exist in the
        # hex_attribute file
        hex_resolutions = p.riemann_hex_resolutions
        hex_fields = [x[0] for x in hex_resolutions]
        for field in hex_fields:
            if field not in obs_data.dtype.names:
                err_msg = 'Field ' + field + ' does not exist in the '
                err_msg += 'hex_attribute file'
                raise ValueError(err_msg)

        # Create the directory structure based on the hex levels
        hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
        all_levels = ['plot_pixel'] + hex_levels
        for level in all_levels:
            sub_dir = os.path.join(root_dir, level)
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir)

        # Get the values of k
        k_values = p.riemann_k_values

        # Create a dictionary of plot ID to image year (or model_year for
        # non-imagery models) for these plots
        if p.model_type in p.imagery_model_types:
            id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
        else:
            id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

        # Create a PredictionRun instance
        pr = prediction_run.PredictionRun(p)

        # Get the neighbors and distances for these IDs
        pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

        # Create the lookup of id_field to LOC_ID for the hex plots
        nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

        # Create a dictionary between id_field and no_self_assign_field
        # for the model plots
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)
        model_nsa_id_dict = dict(
            (getattr(x, id_field), x.LOC_ID) for x in env_data)

        # Stitch the two dictionaries together
        for id in sorted(model_nsa_id_dict.keys()):
            if id not in nsa_id_dict:
                nsa_id_dict[id] = model_nsa_id_dict[id]

        # Get the stand attribute metadata and retrieve only the
        # continuous accuracy attributes
        stand_metadata_file = p.stand_metadata_file
        mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
        attrs = [
            x.field_name for x in mp.attributes
            if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1
        ]

        # Subset the attributes for fields that are in the
        # hex_attribute file
        attrs = [x for x in attrs if x in obs_data.dtype.names]
        plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

        # Write out the plot_pixel observed file
        file_name = 'plot_pixel_observed.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        utilities.rec2csv(plot_pixel_obs, output_file)

        # Iterate over values of k
        for k in k_values:

            # Construct the output file name
            file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
            file_name += '.csv'
            output_file = os.path.join(root_dir, 'plot_pixel', file_name)
            out_fh = open(output_file, 'w')

            # For the plot/pixel scale, retrieve the independent predicted
            # data for this value of k.  Even though attributes are being
            # returned from this function, we want to use the attribute list
            # that we've already found above.
            prediction_generator = pr.calculate_predictions_at_k(
                k=k,
                id_field=id_field,
                independent=True,
                nsa_id_dict=nsa_id_dict)

            # Write out the field names
            out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

            # Write out the predictions for this k
            for plot_prediction in prediction_generator:

                # Write this record to the predicted attribute file
                pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

            # Close this file
            out_fh.close()

        # Create the fields for which to extract statistics at the hexagon
        # levels
        mean_fields = [(id_field, len, 'PLOT_COUNT')]
        mean_fields.extend([(x, np.mean, x) for x in attrs])
        mean_fields = tuple(mean_fields)

        sd_fields = [(id_field, len, 'PLOT_COUNT')]
        sd_fields.extend([(x, np.std, x) for x in attrs])
        sd_fields = tuple(sd_fields)

        stat_sets = {
            'mean': mean_fields,
            'std': sd_fields,
        }

        # For each hexagon level, associate the plots with their hexagon ID
        # and find observed and predicted statistics for each hexagon
        for hex_resolution in hex_resolutions:

            (hex_id_field, hex_distance) = hex_resolution[0:2]
            min_plots_per_hex = hex_resolution[3]
            prefix = 'hex_' + str(hex_distance)

            # Create a crosswalk between the id_field and the hex_id_field
            id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                obs_out_file = \
                    '_'.join((prefix, 'observed', stat_name)) + '.csv'
                obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

                # Write out the observed file
                self.write_hex_stats(obs_data, hex_id_field, stat_fields,
                                     min_plots_per_hex, obs_out_file)

            # Iterate over values of k for the predicted values
            for k in k_values:

                # Open the plot_pixel predicted file for this value of k
                # and join the hex_id_field to the recarray
                prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
                prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
                prd_data = utilities.csv2rec(prd_file)
                prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

                # Iterate over all sets of statistics and write a unique file
                # for each set
                for (stat_name, stat_fields) in stat_sets.iteritems():

                    # Get the output file name
                    prd_out_file = '_'.join((prefix, 'predicted', 'k' + str(k),
                                             stat_name)) + '.csv'
                    prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                    # Write out the predicted file
                    self.write_hex_stats(prd_data, hex_id_field, stat_fields,
                                         min_plots_per_hex, prd_out_file)

        # Calculate the ECDF and AC statistics
        # For ECDF and AC, it is a paired comparison between the observed
        # and predicted data.  We do this at each value of k and for each
        # hex resolution level.

        # Open the stats file
        stats_file = p.hex_statistics_file
        stats_fh = open(stats_file, 'w')
        header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
        stats_fh.write(','.join(header_fields) + '\n')

        # Create a list of RiemannComparison instances which store the
        # information needed to do comparisons between observed and predicted
        # files for any level or value of k
        compare_list = []
        for hex_resolution in hex_resolutions:
            (hex_id_field, hex_distance) = hex_resolution[0:2]
            prefix = 'hex_' + str(hex_distance)
            obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
            obs_file = os.path.join(root_dir, prefix, obs_file)
            for k in k_values:
                prd_file = '_'.join(
                    (prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
                prd_file = os.path.join(root_dir, prefix, prd_file)
                r = RiemannComparison(prefix, obs_file, prd_file, hex_id_field,
                                      k)
                compare_list.append(r)

        # Add the plot_pixel comparisons to this list
        prefix = 'plot_pixel'
        obs_file = 'plot_pixel_observed.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
            compare_list.append(r)

        # Do all the comparisons
        for c in compare_list:

            # Open the observed file
            obs_data = utilities.csv2rec(c.obs_file)

            # Open the predicted file
            prd_data = utilities.csv2rec(c.prd_file)

            # Ensure that the IDs between the observed and predicted
            # data line up
            ids1 = getattr(obs_data, c.id_field)
            ids2 = getattr(prd_data, c.id_field)
            if not np.all(ids1 == ids2):
                err_msg = 'IDs do not match between observed and '
                err_msg += 'predicted data'
                raise ValueError(err_msg)

            for attr in attrs:
                arr1 = getattr(obs_data, attr)
                arr2 = getattr(prd_data, attr)
                rv = RiemannVariable(arr1, arr2)

                gmfr_stats = rv.gmfr_statistics()
                for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                                                        attr, stat.upper(),
                                                        gmfr_stats[stat])
                    stats_fh.write(stat_line)

                ks_stats = rv.ks_statistics()
                for stat in ('ks_max', 'ks_mean'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                                                        attr, stat.upper(),
                                                        ks_stats[stat])
                    stats_fh.write(stat_line)
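
The loop above reports GMFR coefficients and agreement coefficient (AC)
statistics for each attribute via RiemannVariable.  As background only, the
sketch below gives the conventional definitions of the GMFR line and the
agreement coefficient (following Ji and Gallo, 2006); the project's
RiemannVariable class is the authoritative implementation and may differ in
detail.

import numpy as np

def gmfr_agreement_sketch(obs, prd):
    # Geometric mean functional relationship (GMFR) intercept/slope and the
    # Ji & Gallo (2006) agreement coefficient; conventional definitions only.
    x = np.asarray(obs, dtype=float)
    y = np.asarray(prd, dtype=float)
    x_mean, y_mean = x.mean(), y.mean()
    # GMFR slope carries the sign of the correlation between x and y
    b = np.sign(np.corrcoef(x, y)[0, 1]) * np.sqrt(
        ((y - y_mean) ** 2).sum() / ((x - x_mean) ** 2).sum())
    a = y_mean - b * x_mean
    # AC = 1 - SSD / SPOD (sum of squared differences over the sum of
    # potential differences)
    ssd = ((x - y) ** 2).sum()
    spod = ((np.abs(x_mean - y_mean) + np.abs(x - x_mean)) *
            (np.abs(x_mean - y_mean) + np.abs(y - y_mean))).sum()
    return {'gmfr_a': a, 'gmfr_b': b, 'ac': 1.0 - ssd / spod}
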
Example #35
    def calculate_neighbors_at_ids(self, id_x_year, id_field='FCID'):
        """
        Run ordination model over the list of IDs sent in and return neighbors
        and distances for each plot

        Parameters
        ----------
        id_x_year : dict
            Dictionary of plot IDs to associated imagery year to know what
            year to run the model

        id_field : str
            Name of the ID field - should be either 'FCID' or 'PLTID'.
            Defaults to 'FCID'

        Returns
        -------
        None (neighbor data stored as self attribute)
        """

        # Alias for self.parameter_parser
        p = self.parameter_parser

        # Ensure the parameter parser is not a PROTOTYPE
        if p.parameter_set not in ('FULL', 'MINIMUM'):
            err_msg = 'Parameter set must be "FULL" or "MINIMUM"'
            raise ValueError(err_msg)

        # Get footprint file
        fp_file = p.footprint_file

        # Check ID field
        if id_field not in ('FCID', 'PLTID'):
            err_msg = id_field + ' accuracy assessment is not currently '
            err_msg += 'supported'
            raise NotImplementedError(err_msg)

        # Get a list of the unique IDs
        ids = np.unique(id_x_year.keys())

        # Get a list of the years over which we need to run models
        years = np.unique(id_x_year.values())

        # Create a dictionary of all plots associated with each model year
        year_ids = {}
        for (k, v) in id_x_year.iteritems():
            try:
                year_ids[v].append(k)
            except KeyError:
                year_ids[v] = [k]

        # This section extracts the ordination variable information from the
        # model XML files and creates a dict of year/variable combinations.
        # Once this dict is created, we only need to extract the spatial data
        # from the unique set of values in this dict and use this crosswalk
        # to get to those values.  This should be efficient from GDAL's
        # perspective to avoid cache thrashing.
        #
        # However, because we don't need all ordination variables' values for
        # all plots (i.e. temporally varying ordination variables), at this
        # point we only want to extract footprints for those variables that are
        # common across all years.  We track the count of times a variable
        # appears across all lists (raster_counts) and if equal to
        # len(years), we extract footprints at this point.
        #
        # For all other variables, we wait until we have a subset of the coords
        # to extract the spatial data

        ord_year_var_dict = {}
        raster_counts = {}
        raster_dict = {}

        for year in years:
            ord_year_var_dict[year] = {}

            # Get the ordination variables specialized for this year
            ord_vars = p.get_ordination_variables(year)

            for (var, path) in ord_vars:
                # For this year, variable combination, store the path to the
                # variable
                ord_year_var_dict[year][var] = path

                # Record this variable in the counts and push to the raster
                # list if it's a new variable
                try:
                    raster_counts[path] += 1
                except KeyError:
                    ds = gdal.Open(path, gdalconst.GA_ReadOnly)
                    raster_dict[path] = [ds, False]
                    raster_counts[path] = 1

        # Retrieve all coordinates records as a recarray
        coords = utilities.csv2rec(p.coordinate_file)

        # Subset this list to just those plots in the model
        id_arr = getattr(coords, id_field)
        coord_list = coords[np.in1d(id_arr, ids)]

        # Retrieve the footprint configurations.  Footprint offsets store the
        # row and column tuples of each pixel within a given footprint.
        # Footprint windows store the upper left coordinate and window size for
        # extraction from GDAL datasets
        fp_parser = footprint.FootprintParser()
        fp_dict = fp_parser.parse(fp_file)
        fp_offsets = {}
        fp_windows = {}
        for (id, data_source, x, y) in coord_list:
            fp_offsets[id] = fp_dict[data_source].offsets
            fp_windows[id] = fp_dict[data_source].window((x, y))

        # Extract footprint information for every ordination variable that is
        # common to all years and store in a dict keyed by ID and raster
        # file name
        fp_value_dict = {}
        for (fn, count) in raster_counts.iteritems():
            if count == len(years):
                print fn
                ds, processed = raster_dict[fn]

                # Get the footprint window values for this dataset
                fp_values = self.get_footprint_values(ds, fp_windows)

                # Change the flag for this dataset to 'processed'
                raster_dict[fn][1] = True

                # Store these footprint values in a dictionary keyed by
                # id and variable file name
                for (id, fp) in fp_values.iteritems():
                    try:
                        fp_value_dict[id][fn] = fp
                    except KeyError:
                        fp_value_dict[id] = {}
                        fp_value_dict[id][fn] = fp

                # Close this dataset - no longer needed
                raster_dict[fn][0] = None

        # Get the ordination model and read it in
        ord_file = p.get_ordination_file()
        lop = lemma_ordination_parser.LemmaOrdinationParser()
        ord_model = lop.parse(ord_file, delimiter=',')

        # Create the imputation model based on the ordination model and the
        # imputation parameters
        imp_model = im.ImputationModel(ord_model, n_axes=p.number_axes,
            use_weightings=p.use_axis_weighting, max_neighbors=p.max_neighbors)

        # Main loop to iterate over all years
        for year in years:
            print year

            # Get the subset of footprint offsets and windows for this year
            offsets = dict((x, fp_offsets[x]) for x in year_ids[year])
            windows = dict((x, fp_windows[x]) for x in year_ids[year])

            # Extract footprints for any variables that are not common to all
            # years, but specialized for this year
            for (var, fn) in ord_year_var_dict[year].iteritems():
                ds, processed = raster_dict[fn]
                if not processed:
                    print fn

                    # Extract footprint values for this dataset
                    fp_values = self.get_footprint_values(ds, windows)

                    # Set the processed flag to True
                    raster_dict[fn][1] = True

                    # Store these values
                    for (id, fp) in fp_values.iteritems():
                        try:
                            fp_value_dict[id][fn] = fp
                        except KeyError:
                            fp_value_dict[id] = {}
                            fp_value_dict[id][fn] = fp

                    # Close the dataset - no longer needed
                    raster_dict[fn][0] = None

            # At this point, we have all the footprint information needed for
            # this year stored in fp_value_dict.  Now, iterate over each plot
            # in this year and run the imputation for each pixel.  Output is
            # captured at the pixel scale (within zonal_pixel_dict) and
            # for each attribute at the plot scale (within predicted_dict).
            for id in sorted(windows.keys()):

                # Get the footprint values for this plot
                fp_values = []
                for var in ord_model.var_names:
                    fn = ord_year_var_dict[year][var]
                    fp_values.append(fp_value_dict[id][fn])

                # Set up an output instance to capture each pixel's neighbors
                # and distances
                obj = NNFootprint(id)

                # Run the imputation for each pixel in the footprint
                for o in offsets[id]:

                    # Get the ordination variable values for this offset
                    # Store in (1xv) array
                    v = np.array(self.get_values_from_offset(fp_values, o))
                    v = v[np.newaxis, :]

                    # Run the imputation
                    nn_ids, nn_dists = imp_model.get_neighbors(v, id=id)

                    # Append this pixel to the NNFootprint object
                    obj.append(NNPixel(nn_ids, nn_dists))

                # Store the neighbor information
                self.neighbor_data[id] = copy.deepcopy(obj)
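
A minimal calling sketch for this method, assuming a PredictionRun built from
the project's parameter parser as in the run_diagnostic example above; the
plot IDs and years below are illustrative only.

# id_x_year maps each plot ID to the imagery (or model) year used to select
# the ordination rasters for that plot.
id_x_year = {10001: 2000, 10002: 2000, 10003: 2002}
pr = prediction_run.PredictionRun(parameter_parser)   # parser assumed available
pr.calculate_neighbors_at_ids(id_x_year, id_field='FCID')
footprint_10001 = pr.neighbor_data[10001]   # one NNFootprint per plot ID
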
Example #36
    def run(self):
        # Convert the species and environment matrices to numpy rec arrays
        spp_ra = utilities.csv2rec(self.spp_file)
        env_ra = utilities.csv2rec(self.env_file)

        # Extract the plot IDs from both the species and environment matrices
        # and ensure that they are equal
        spp_plot_ids = getattr(spp_ra, self.id_field)
        env_plot_ids = getattr(env_ra, self.id_field)
        if not np.all(spp_plot_ids == env_plot_ids):
            err_msg = 'Species and environment plot IDs do not match'
            raise ValueError(err_msg)

        # Drop the ID column from both arrays
        spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
        env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

        # For the environment matrix, only keep the variables specified
        env_ra = mlab.rec_keep_fields(env_ra, self.variables)

        # Convert these matrices to pure floating point arrays
        spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
        env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

        # Apply transformation if desired
        if self.species_transform == 'SQRT':
            spp = np.sqrt(spp)
        elif self.species_transform == 'LOG':
            spp = np.log(spp)

        # Create the RDA object
        cca = numpy_ordination.NumpyRDA(spp, env)

        # Open the output file
        numpy_fh = open(self.ord_file, 'w')

        # Eigenvalues
        numpy_fh.write('### Eigenvalues ###\n')
        for (i, e) in enumerate(cca.eigenvalues):
            numpy_fh.write('RDA' + str(i + 1) + ',' + '%.10f' % e + '\n')
        numpy_fh.write('\n')

        # Print out variable means
        numpy_fh.write('### Variable Means ###\n')
        for (i, m) in enumerate(cca.env_means):
            numpy_fh.write('%s,%.10f\n' % (self.variables[i], m))
        numpy_fh.write('\n')

        # Print out environmental coefficients loadings
        numpy_fh.write('### Coefficient Loadings ###\n')
        header_str = ','.join(['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('VARIABLE,' + header_str + '\n')
        for (i, c) in enumerate(cca.coefficients()):
            coeff = ','.join(['%.10f' % x for x in c])
            numpy_fh.write('%s,%s\n' % (self.variables[i], coeff))
        numpy_fh.write('\n')

        # Print out biplot scores
        numpy_fh.write('### Biplot Scores ###\n')
        header_str = ','.join(['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('VARIABLE,' + header_str + '\n')
        for (i, b) in enumerate(cca.biplot_scores()):
            scores = ','.join(['%.10f' % x for x in b])
            numpy_fh.write('%s,%s\n' % (self.variables[i], scores))
        numpy_fh.write('\n')

        # Print out species centroids
        numpy_fh.write('### Species Centroids ###\n')
        header_str = ','.join(['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('SPECIES,' + header_str + '\n')
        for (i, c) in enumerate(cca.species_centroids()):
            scores = ','.join(['%.10f' % x for x in c])
            numpy_fh.write('%s,%s\n' % (spp_ra.dtype.names[i], scores))
        numpy_fh.write('\n')

        # Print out species tolerances
        numpy_fh.write('### Species Tolerances ###\n')
        header_str = \
            ','.join(['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('SPECIES,' + header_str + '\n')
        for (i, t) in enumerate(cca.species_tolerances()):
            scores = ','.join(['%.21f' % x for x in t])
            numpy_fh.write('%s,%s\n' % (spp_ra.dtype.names[i], scores))
        numpy_fh.write('\n')

        # Print out miscellaneous species information
        numpy_fh.write('### Miscellaneous Species Information ###\n')
        numpy_fh.write('SPECIES,WEIGHT,N2\n')
        species_weights, species_n2 = cca.species_information()
        for i in xrange(len(species_weights)):
            numpy_fh.write(
                '%s,%.10f,%.10f\n' %
                (spp_ra.dtype.names[i], species_weights[i], species_n2[i]))
        numpy_fh.write('\n')

        # Print out site LC scores
        numpy_fh.write('### Site LC Scores ###\n')
        header_str = ','.join(['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('ID,' + header_str + '\n')
        for (i, s) in enumerate(cca.site_lc_scores()):
            scores = ','.join(['%.10f' % x for x in s])
            numpy_fh.write('%d,%s\n' % (spp_plot_ids[i], scores))
        numpy_fh.write('\n')

        # Print out site WA scores
        numpy_fh.write('### Site WA Scores ###\n')
        header_str = ','.join(['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('ID,' + header_str + '\n')
        for (i, s) in enumerate(cca.site_wa_scores()):
            scores = ','.join(['%.10f' % x for x in s])
            numpy_fh.write('%d,%s\n' % (spp_plot_ids[i], scores))
        numpy_fh.write('\n')

        # Miscellaneous site information
        numpy_fh.write('### Miscellaneous Site Information ###\n')
        numpy_fh.write('ID,WEIGHT,N2\n')
        site_weights, site_n2 = cca.site_information()
        for i in xrange(len(site_weights)):
            numpy_fh.write('%s,%.10f,%.10f\n' %
                           (spp_plot_ids[i], site_weights[i], site_n2[i]))

        # Close the file
        numpy_fh.close()
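
The writer above delimits each block of the ordination file with a
'### ... ###' header, CSV-style rows, and a blank line.  A small reading
sketch (not part of the original codebase; the project itself parses this
format with LemmaOrdinationParser) that splits such a file back into named
sections:

def read_ordination_sections(ord_file):
    # Return a dict keyed by section title ('Eigenvalues', 'Biplot Scores',
    # ...), with each value a list of comma-split rows.
    sections = {}
    current = None
    with open(ord_file) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            if line.startswith('###'):
                current = line.strip('# ').strip()
                sections[current] = []
            elif current is not None:
                sections[current].append(line.split(','))
    return sections
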
    def run_diagnostic(self):

        # Open the stats file and print out the header line
        stats_fh = open(self.statistics_file, 'w')
        out_list = [
            'VARIABLE',
            'PEARSON_R',
            'SPEARMAN_R',
            'RMSE',
            'NORMALIZED_RMSE',
            'BIAS_PERCENTAGE',
            'R_SQUARE',
        ]
        stats_fh.write(','.join(out_list) + '\n')

        # Read the observed and predicted files into numpy recarrays
        obs = utilities.csv2rec(self.observed_file)
        prd = utilities.csv2rec(self.predicted_file)

        # Subset the observed data just to the IDs that are in the
        # predicted file
        obs_keep = np.in1d(
            getattr(obs, self.id_field), getattr(prd, self.id_field))
        obs = obs[obs_keep]

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # For each variable, calculate the statistics
        for v in obs.dtype.names:

            # Get the metadata for this field
            try:
                fm = mp.get_attribute(v)
            except:
                err_msg = v + ' is missing metadata.'
                print err_msg
                continue

            # Only continue if this is a continuous accuracy variable
            if fm.field_type != 'CONTINUOUS' or fm.accuracy_attr == 0:
                continue

            obs_vals = getattr(obs, v)
            prd_vals = getattr(prd, v)

            if np.all(obs_vals == 0.0):
                pearson_r = 0.0
                spearman_r = 0.0
                rmse = 0.0
                std_rmse = 0.0
                bias = 0.0
                r2 = 0.0
            else:
                if np.all(prd_vals == 0.0):
                    pearson_r = 0.0
                    spearman_r = 0.0
                else:
                    pearson_r = statistics.pearson_r(obs_vals, prd_vals)
                    spearman_r = statistics.spearman_r(obs_vals, prd_vals)
                rmse = statistics.rmse(obs_vals, prd_vals)
                std_rmse = rmse / obs_vals.mean()
                bias = statistics.bias_percentage(obs_vals, prd_vals)
                r2 = statistics.r2(obs_vals, prd_vals)

            # Print this out to the stats file
            out_list = [
                v,
                '%.6f' % pearson_r,
                '%.6f' % spearman_r,
                '%.6f' % rmse,
                '%.6f' % std_rmse,
                '%.6f' % bias,
                '%.6f' % r2,
            ]
            stats_fh.write(','.join(out_list) + '\n')
        stats_fh.close()
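
The statistics module used above is not shown in this example.  For
orientation, the sketch below gives plausible definitions for RMSE and the
bias percentage reported in the output file; these are assumptions about the
implementation (the sign convention of statistics.bias_percentage in
particular), not the project's code.  The normalized RMSE is simply this RMSE
divided by the observed mean, as computed inline above.

import numpy as np

def rmse_sketch(obs, prd):
    # Root mean squared error between observed and predicted values
    obs = np.asarray(obs, dtype=float)
    prd = np.asarray(prd, dtype=float)
    return np.sqrt(((prd - obs) ** 2).mean())

def bias_percentage_sketch(obs, prd):
    # Mean (predicted - observed) difference as a percentage of the observed
    # mean; assumed definition, possibly differing in sign from the project
    obs = np.asarray(obs, dtype=float)
    prd = np.asarray(prd, dtype=float)
    return 100.0 * (prd - obs).mean() / obs.mean()
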
Example #38
    def run_diagnostic(self):

        # Open the stats file and print out the header line
        stats_fh = open(self.statistics_file, 'w')
        out_list = [
            'VARIABLE',
            'PEARSON_R',
            'SPEARMAN_R',
            'RMSE',
            'NORMALIZED_RMSE',
            'BIAS_PERCENTAGE',
            'R_SQUARE',
        ]
        stats_fh.write(','.join(out_list) + '\n')

        # Read the observed and predicted files into numpy recarrays
        obs = utilities.csv2rec(self.observed_file)
        prd = utilities.csv2rec(self.predicted_file)

        # Subset the observed data just to the IDs that are in the
        # predicted file
        obs_keep = np.in1d(getattr(obs, self.id_field),
                           getattr(prd, self.id_field))
        obs = obs[obs_keep]

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # For each variable, calculate the statistics
        for v in obs.dtype.names:

            # Get the metadata for this field
            try:
                fm = mp.get_attribute(v)
            except:
                err_msg = v + ' is missing metadata.'
                print err_msg
                continue

            # Only continue if this is a continuous accuracy variable
            if fm.field_type != 'CONTINUOUS' or fm.accuracy_attr == 0:
                continue

            obs_vals = getattr(obs, v)
            prd_vals = getattr(prd, v)

            if np.all(obs_vals == 0.0):
                pearson_r = 0.0
                spearman_r = 0.0
                rmse = 0.0
                std_rmse = 0.0
                bias = 0.0
                r2 = 0.0
            else:
                if np.all(prd_vals == 0.0):
                    pearson_r = 0.0
                    spearman_r = 0.0
                else:
                    pearson_r = statistics.pearson_r(obs_vals, prd_vals)
                    spearman_r = statistics.spearman_r(obs_vals, prd_vals)
                rmse = statistics.rmse(obs_vals, prd_vals)
                std_rmse = rmse / obs_vals.mean()
                bias = statistics.bias_percentage(obs_vals, prd_vals)
                r2 = statistics.r2(obs_vals, prd_vals)

            # Print this out to the stats file
            out_list = [
                v,
                '%.6f' % pearson_r,
                '%.6f' % spearman_r,
                '%.6f' % rmse,
                '%.6f' % std_rmse,
                '%.6f' % bias,
                '%.6f' % r2,
            ]
            stats_fh.write(','.join(out_list) + '\n')
        stats_fh.close()
    def run_diagnostic(self):

        # Shortcut to the parameter parser
        p = self.parameter_parser

        # ID field
        id_field = p.summary_level + 'ID'

        # Root directory for Riemann files
        root_dir = p.riemann_output_folder

        # Read in hex input file
        obs_data = utilities.csv2rec(self.hex_attribute_file)

        # Get the hexagon levels and ensure that the fields exist in the
        # hex_attribute file
        hex_resolutions = p.riemann_hex_resolutions
        hex_fields = [x[0] for x in hex_resolutions]
        for field in hex_fields:
            if field not in obs_data.dtype.names:
                err_msg = 'Field ' + field + ' does not exist in the '
                err_msg += 'hex_attribute file'
                raise ValueError(err_msg)

        # Create the directory structure based on the hex levels
        hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
        all_levels = ['plot_pixel'] + hex_levels
        for level in all_levels:
            sub_dir = os.path.join(root_dir, level)
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir)

        # Get the values of k
        k_values = p.riemann_k_values

        # Create a dictionary of plot ID to image year (or model_year for
        # non-imagery models) for these plots
        if p.model_type in p.imagery_model_types:
            id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
        else:
            id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

        # Create a PredictionRun instance
        pr = prediction_run.PredictionRun(p)

        # Get the neighbors and distances for these IDs
        pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

        # Create the lookup of id_field to LOC_ID for the hex plots
        nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

        # Create a dictionary between id_field and no_self_assign_field
        # for the model plots
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)
        model_nsa_id_dict = dict((getattr(x, id_field), x.LOC_ID)
            for x in env_data)

        # Stitch the two dictionaries together
        for id in sorted(model_nsa_id_dict.keys()):
            if id not in nsa_id_dict:
                nsa_id_dict[id] = model_nsa_id_dict[id]

        # Get the stand attribute metadata and retrieve only the
        # continuous accuracy attributes
        stand_metadata_file = p.stand_metadata_file
        mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
        attrs = [x.field_name for x in mp.attributes
            if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1]

        # Subset the attributes for fields that are in the
        # hex_attribute file
        attrs = [x for x in attrs if x in obs_data.dtype.names]
        plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

        # Write out the plot_pixel observed file
        file_name = 'plot_pixel_observed.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        utilities.rec2csv(plot_pixel_obs, output_file)

        # Iterate over values of k
        for k in k_values:

            # Construct the output file name
            file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
            file_name += '.csv'
            output_file = os.path.join(root_dir, 'plot_pixel', file_name)
            out_fh = open(output_file, 'w')

            # For the plot/pixel scale, retrieve the independent predicted
            # data for this value of k.  Even though attributes are being
            # returned from this function, we want to use the attribute list
            # that we've already found above.
            prediction_generator = pr.calculate_predictions_at_k(
                k=k, id_field=id_field, independent=True,
                nsa_id_dict=nsa_id_dict)

            # Write out the field names
            out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

            # Write out the predictions for this k
            for plot_prediction in prediction_generator:

                # Write this record to the predicted attribute file
                pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

            # Close this file
            out_fh.close()

        # Create the fields for which to extract statistics at the hexagon
        # levels
        mean_fields = [(id_field, len, 'PLOT_COUNT')]
        mean_fields.extend([(x, np.mean, x) for x in attrs])
        mean_fields = tuple(mean_fields)

        sd_fields = [(id_field, len, 'PLOT_COUNT')]
        sd_fields.extend([(x, np.std, x) for x in attrs])
        sd_fields = tuple(sd_fields)

        stat_sets = {
            'mean': mean_fields,
            'std': sd_fields,
        }

        # For each hexagon level, associate the plots with their hexagon ID
        # and find observed and predicted statistics for each hexagon
        for hex_resolution in hex_resolutions:

            (hex_id_field, hex_distance) = hex_resolution[0:2]
            min_plots_per_hex = hex_resolution[3]
            prefix = 'hex_' + str(hex_distance)

            # Create a crosswalk between the id_field and the hex_id_field
            id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                obs_out_file = \
                    '_'.join((prefix, 'observed', stat_name)) + '.csv'
                obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

                # Write out the observed file
                self.write_hex_stats(obs_data, hex_id_field, stat_fields,
                    min_plots_per_hex, obs_out_file)

            # Iterate over values of k for the predicted values
            for k in k_values:

                # Open the plot_pixel predicted file for this value of k
                # and join the hex_id_field to the recarray
                prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
                prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
                prd_data = utilities.csv2rec(prd_file)
                prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

                # Iterate over all sets of statistics and write a unique file
                # for each set
                for (stat_name, stat_fields) in stat_sets.iteritems():

                    # Get the output file name
                    prd_out_file = '_'.join((
                        prefix, 'predicted', 'k' + str(k), stat_name)) + '.csv'
                    prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                    # Write out the predicted file
                    self.write_hex_stats(prd_data, hex_id_field, stat_fields,
                        min_plots_per_hex, prd_out_file)

        # Calculate the ECDF and AC statistics
        # For ECDF and AC, it is a paired comparison between the observed
        # and predicted data.  We do this at each value of k and for each
        # hex resolution level.

        # Open the stats file
        stats_file = p.hex_statistics_file
        stats_fh = open(stats_file, 'w')
        header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
        stats_fh.write(','.join(header_fields) + '\n')

        # Create a list of RiemannComparison instances which store the
        # information needed to do comparisons between observed and predicted
        # files for any level or value of k
        compare_list = []
        for hex_resolution in hex_resolutions:
            (hex_id_field, hex_distance) = hex_resolution[0:2]
            prefix = 'hex_' + str(hex_distance)
            obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
            obs_file = os.path.join(root_dir, prefix, obs_file)
            for k in k_values:
                prd_file = '_'.join((
                    prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
                prd_file = os.path.join(root_dir, prefix, prd_file)
                r = RiemannComparison(
                    prefix, obs_file, prd_file, hex_id_field, k)
                compare_list.append(r)

        # Add the plot_pixel comparisons to this list
        prefix = 'plot_pixel'
        obs_file = 'plot_pixel_observed.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
            compare_list.append(r)

        # Do all the comparisons
        for c in compare_list:

            # Open the observed file
            obs_data = utilities.csv2rec(c.obs_file)

            # Open the predicted file
            prd_data = utilities.csv2rec(c.prd_file)

            # Ensure that the IDs between the observed and predicted
            # data line up
            ids1 = getattr(obs_data, c.id_field)
            ids2 = getattr(prd_data, c.id_field)
            if not np.all(ids1 == ids2):
                err_msg = 'IDs do not match between observed and '
                err_msg += 'predicted data'
                raise ValueError(err_msg)

            for attr in attrs:
                arr1 = getattr(obs_data, attr)
                arr2 = getattr(prd_data, attr)
                rv = RiemannVariable(arr1, arr2)

                gmfr_stats = rv.gmfr_statistics()
                for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                        attr, stat.upper(), gmfr_stats[stat])
                    stats_fh.write(stat_line)

                ks_stats = rv.ks_statistics()
                for stat in ('ks_max', 'ks_mean'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                        attr, stat.upper(), ks_stats[stat])
                    stats_fh.write(stat_line)
Example #40
    def _create_story(self):

        # Set up an empty list to hold the story
        story = []

        # Import the report styles
        styles = report_styles.get_report_styles()

        # Create a page break
        story = self._make_page_break(story, self.PORTRAIT)

        # Section title
        title_str = '<strong>Local-Scale Accuracy Assessment:<br/>'
        title_str += 'Species Accuracy at Plot Locations'
        title_str += '</strong>'

        para = p.Paragraph(title_str, styles['section_style'])
        t = p.Table([[para]], colWidths=[7.5 * u.inch])
        t.setStyle(
            p.TableStyle([
                ('TOPPADDING', (0, 0), (-1, -1), 6),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
                ('BACKGROUND', (0, 0), (-1, -1), '#957348'),
                ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
            ]))
        story.append(t)
        story.append(p.Spacer(0, 0.2 * u.inch))

        # Kappa explanation
        kappa_str = '''
            Cohen's kappa coefficient (Cohen, 1960) is a statistical measure
            of reliability, accounting for agreement occurring by chance.  
            The equation for kappa is: 
        '''
        para = p.Paragraph(kappa_str, styles['body_style'])
        story.append(para)
        story.append(p.Spacer(0, 0.05 * u.inch))

        kappa_str = '''
           kappa = (Pr(a) - Pr(e)) / (1.0 - Pr(e))
        '''
        para = p.Paragraph(kappa_str, styles['indented'])
        story.append(para)
        story.append(p.Spacer(0, 0.05 * u.inch))

        kappa_str = '''
            where Pr(a) is the relative observed agreement among
            raters, and Pr(e) is the probability that agreement is
            due to chance.<br/><br/>

            <strong>Abbreviations Used:</strong><br/>
            OP/PP = Observed Present / Predicted Present<br/>
            OA/PP = Observed Absent / Predicted Present
            (errors of commission)<br/>
            OP/PA = Observed Present / Predicted Absent
            (errors of omission)<br/>
            OA/PA = Observed Absent / Predicted Absent
        '''
        para = p.Paragraph(kappa_str, styles['body_style'])
        story.append(para)
        story.append(p.Spacer(0, 0.2 * u.inch))

        # Create a list of lists to hold the species accuracy information
        species_table = []

        # Header row
        header_row = []

        spp_str = '<strong>Species PLANTS Code<br/>'
        spp_str += 'Scientific Name / Common Name</strong>'
        para = p.Paragraph(spp_str, styles['body_style_10'])
        header_row.append(para)

        spp_str = '<strong>Species prevalence</strong>'
        para = p.Paragraph(spp_str, styles['body_style_10'])
        header_row.append(para)

        p1 = p.Paragraph('<strong>OP/PP</strong>',
                         styles['body_style_10_right'])
        p2 = p.Paragraph('<strong>OP/PA</strong>',
                         styles['body_style_10_right'])
        p3 = p.Paragraph('<strong>OA/PP</strong>',
                         styles['body_style_10_right'])
        p4 = p.Paragraph('<strong>OA/PA</strong>',
                         styles['body_style_10_right'])
        header_cells = [[p1, p2], [p3, p4]]
        t = p.Table(header_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
        t.setStyle(
            p.TableStyle([
                ('GRID', (0, 0), (-1, -1), 1, colors.white),
                ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ]))
        header_row.append(t)

        kappa_str = '<strong>Kappa coefficient</strong>'
        para = p.Paragraph(kappa_str, styles['body_style_10'])
        header_row.append(para)
        species_table.append(header_row)

        # Open the species accuracy file into a recarray
        spp_data = utilities.csv2rec(self.species_accuracy_file)

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Read in the report metadata if it exists
        if self.report_metadata_file:
            rmp = xrmp.XMLReportMetadataParser(self.report_metadata_file)
        else:
            rmp = None

        # Subset the attributes to just species
        attrs = []
        for attr in mp.attributes:
            if attr.species_attr == 1 and 'NOTALY' not in attr.field_name:
                attrs.append(attr.field_name)

        # Iterate over the species and print out the statistics
        for spp in attrs:

            # Empty row to hold the formatted output
            species_row = []

            # Get the scientific and common names from the report metadata
            # if it exists; otherwise, just use the species symbol
            if rmp is not None:

                # Strip off any suffix if it exists
                try:
                    spp_plain = spp.split('_')[0]
                    spp_info = rmp.get_species(spp_plain)
                    spp_str = spp_info.spp_symbol + '<br/>'
                    spp_str += spp_info.scientific_name + ' / '
                    spp_str += spp_info.common_name
                except IndexError:
                    spp_str = spp
            else:
                spp_str = spp
            para = p.Paragraph(spp_str, styles['body_style_10'])
            species_row.append(para)

            # Get the statistical information
            data = spp_data[spp_data.SPECIES == spp][0]
            counts = [data.OP_PP, data.OP_PA, data.OA_PP, data.OA_PA]
            prevalence = data.PREVALENCE
            kappa = data.KAPPA

            # Species prevalence
            prevalence_str = '%.4f' % prevalence
            para = p.Paragraph(prevalence_str, styles['body_style_10_right'])
            species_row.append(para)

            # Capture the plot counts in an inner table
            count_cells = []
            count_row = []
            for i in range(0, 4):
                para = p.Paragraph('%d' % counts[i],
                                   styles['body_style_10_right'])
                count_row.append(para)
                if i % 2 == 1:
                    count_cells.append(count_row)
                    count_row = []
            t = p.Table(count_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
            t.setStyle(
                p.TableStyle([
                    ('GRID', (0, 0), (-1, -1), 1, colors.white),
                    ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                    ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                    ('TOPPADDING', (0, 0), (-1, -1), 2),
                    ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
                ]))
            species_row.append(t)

            # Print out the kappa statistic
            kappa_str = '%.4f' % kappa
            para = p.Paragraph(kappa_str, styles['body_style_10_right'])
            species_row.append(para)

            # Push this row to the master species table
            species_table.append(species_row)

        # Style this into a reportlab table and add to the story
        col_widths = [(x * u.inch) for x in [4.0, 0.75, 1.5, 0.75]]
        t = p.Table(species_table, colWidths=col_widths)
        t.setStyle(
            p.TableStyle([
                ('BACKGROUND', (0, 0), (-1, -1), '#f1efe4'),
                ('GRID', (0, 0), (-1, -1), 2, colors.white),
                ('TOPPADDING', (0, 0), (0, -1), 2),
                ('BOTTOMPADDING', (0, 0), (0, -1), 2),
                ('LEFTPADDING', (0, 0), (0, -1), 6),
                ('RIGHTPADDING', (0, 0), (0, -1), 6),
                ('ALIGNMENT', (0, 0), (0, -1), 'LEFT'),
                ('VALIGN', (0, 0), (0, -1), 'TOP'),
                ('TOPPADDING', (1, 0), (1, -1), 2),
                ('BOTTOMPADDING', (1, 0), (1, -1), 2),
                ('LEFTPADDING', (1, 0), (1, -1), 6),
                ('RIGHTPADDING', (1, 0), (1, -1), 6),
                ('ALIGNMENT', (1, 0), (1, -1), 'RIGHT'),
                ('VALIGN', (1, 0), (1, 0), 'TOP'),
                ('VALIGN', (1, 1), (1, -1), 'MIDDLE'),
                ('TOPPADDING', (2, 0), (2, -1), 0),
                ('BOTTOMPADDING', (2, 0), (2, -1), 0),
                ('LEFTPADDING', (2, 0), (2, -1), 0),
                ('RIGHTPADDING', (2, 0), (2, -1), 0),
                ('ALIGNMENT', (2, 0), (2, -1), 'LEFT'),
                ('VALIGN', (2, 0), (2, -1), 'TOP'),
                ('TOPPADDING', (3, 0), (3, -1), 2),
                ('BOTTOMPADDING', (3, 0), (3, -1), 2),
                ('LEFTPADDING', (3, 0), (3, -1), 6),
                ('RIGHTPADDING', (3, 0), (3, -1), 6),
                ('ALIGNMENT', (3, 0), (3, -1), 'RIGHT'),
                ('VALIGN', (3, 0), (3, 0), 'TOP'),
                ('VALIGN', (3, 1), (3, -1), 'MIDDLE'),
            ]))
        story.append(t)
        story.append(p.Spacer(0, 0.1 * u.inch))

        rare_species_str = """
            Note that some very rare species do not appear in this accuracy
            report, because these species were not included when building
            the initial ordination model.  The full set of species is
            available upon request.
        """
        para = p.Paragraph(rare_species_str, styles['body_style'])
        story.append(para)

        # Return this story
        return story
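
The kappa explanation embedded in this report can be made concrete with a
small sketch.  Given the four presence/absence counts tabulated above
(OP/PP, OP/PA, OA/PP, OA/PA), Pr(a) is the fraction of plots where observed
and predicted agree, and Pr(e) is the agreement expected by chance from the
marginal totals.  This is a generic Cohen's kappa computation, not the
project's statistics code.

def kappa_sketch(op_pp, op_pa, oa_pp, oa_pa):
    # Cohen's kappa from a 2x2 presence/absence table:
    #   op_pp = observed present / predicted present
    #   op_pa = observed present / predicted absent
    #   oa_pp = observed absent  / predicted present
    #   oa_pa = observed absent  / predicted absent
    n = float(op_pp + op_pa + oa_pp + oa_pa)
    pr_a = (op_pp + oa_pa) / n
    # Chance agreement from the observed and predicted marginal totals
    pr_e = ((op_pp + op_pa) * (op_pp + oa_pp) +
            (oa_pp + oa_pa) * (op_pa + oa_pa)) / (n * n)
    return (pr_a - pr_e) / (1.0 - pr_e)
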
Example #41
    def run(self):
        # Convert the species and environment matrices to numpy rec arrays
        spp_ra = utilities.csv2rec(self.spp_file)
        env_ra = utilities.csv2rec(self.env_file)

        # Extract the plot IDs from both the species and environment matrices
        # and ensure that they are equal
        spp_plot_ids = getattr(spp_ra, self.id_field)
        env_plot_ids = getattr(env_ra, self.id_field)
        if not np.all(spp_plot_ids == env_plot_ids):
            err_msg = "Species and environment plot IDs do not match"
            raise ValueError(err_msg)

        # Drop the ID column from both arrays
        spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
        env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

        # For the environment matrix, only keep the variables specified
        env_ra = mlab.rec_keep_fields(env_ra, self.variables)

        # Convert these matrices to pure floating point arrays
        spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
        env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

        # Apply transformation if desired
        if self.species_transform == "SQRT":
            spp = np.sqrt(spp)
        elif self.species_transform == "LOG":
            spp = np.log(spp)

        # Create the RDA object
        cca = numpy_ordination.NumpyRDA(spp, env)

        # Open the output file
        numpy_fh = open(self.ord_file, "w")

        # Eigenvalues
        numpy_fh.write("### Eigenvalues ###\n")
        for (i, e) in enumerate(cca.eigenvalues):
            numpy_fh.write("RDA" + str(i + 1) + "," + "%.10f" % e + "\n")
        numpy_fh.write("\n")

        # Print out variable means
        numpy_fh.write("### Variable Means ###\n")
        for (i, m) in enumerate(cca.env_means):
            numpy_fh.write("%s,%.10f\n" % (self.variables[i], m))
        numpy_fh.write("\n")

        # Print out environmental coefficients loadings
        numpy_fh.write("### Coefficient Loadings ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write("VARIABLE," + header_str + "\n")
        for (i, c) in enumerate(cca.coefficients()):
            coeff = ",".join(["%.10f" % x for x in c])
            numpy_fh.write("%s,%s\n" % (self.variables[i], coeff))
        numpy_fh.write("\n")

        # Print out biplot scores
        numpy_fh.write("### Biplot Scores ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write("VARIABLE," + header_str + "\n")
        for (i, b) in enumerate(cca.biplot_scores()):
            scores = ",".join(["%.10f" % x for x in b])
            numpy_fh.write("%s,%s\n" % (self.variables[i], scores))
        numpy_fh.write("\n")

        # Print out species centroids
        numpy_fh.write("### Species Centroids ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write("SPECIES," + header_str + "\n")
        for (i, c) in enumerate(cca.species_centroids()):
            scores = ",".join(["%.10f" % x for x in c])
            numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
        numpy_fh.write("\n")

        # Print out species tolerances
        numpy_fh.write("### Species Tolerances ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write("SPECIES," + header_str + "\n")
        for (i, t) in enumerate(cca.species_tolerances()):
            scores = ",".join(["%.21f" % x for x in t])
            numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
        numpy_fh.write("\n")

        # Print out miscellaneous species information
        numpy_fh.write("### Miscellaneous Species Information ###\n")
        numpy_fh.write("SPECIES,WEIGHT,N2\n")
        species_weights, species_n2 = cca.species_information()
        for i in xrange(len(species_weights)):
            numpy_fh.write("%s,%.10f,%.10f\n" % (spp_ra.dtype.names[i], species_weights[i], species_n2[i]))
        numpy_fh.write("\n")

        # Print out site LC scores
        numpy_fh.write("### Site LC Scores ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write("ID," + header_str + "\n")
        for (i, s) in enumerate(cca.site_lc_scores()):
            scores = ",".join(["%.10f" % x for x in s])
            numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
        numpy_fh.write("\n")

        # Print out site WA scores
        numpy_fh.write("### Site WA Scores ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write("ID," + header_str + "\n")
        for (i, s) in enumerate(cca.site_wa_scores()):
            scores = ",".join(["%.10f" % x for x in s])
            numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
        numpy_fh.write("\n")

        # Miscellaneous site information
        numpy_fh.write("### Miscellaneous Site Information ###\n")
        numpy_fh.write("ID,WEIGHT,N2\n")
        site_weights, site_n2 = cca.site_information()
        for i in xrange(len(site_weights)):
            numpy_fh.write("%s,%.10f,%.10f\n" % (spp_plot_ids[i], site_weights[i], site_n2[i]))

        # Close the file
        numpy_fh.close()
Example #42
    def _read_id_list_file(self, id_list_file):
        data = utilities.csv2rec(id_list_file)
        return ','.join([str(x[0]) for x in data])
Example #43
    def _read_id_list_file(self, id_list_file):
        data = utilities.csv2rec(id_list_file)
        return ','.join([str(x[0]) for x in data])
    def run_diagnostic(self):
        # Read the observed and predicted files into numpy recarrays
        obs = utilities.csv2rec(self.observed_file)
        prd = utilities.csv2rec(self.predicted_file)

        # Subset the observed data just to the IDs that are in the
        # predicted file
        obs_keep = np.in1d(getattr(obs, self.id_field),
                           getattr(prd, self.id_field))
        obs = obs[obs_keep]

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Open the stats file and print out the header lines
        stats_fh = open(self.statistics_file, 'w')
        out_list = [
            'SPECIES',
            'OP_PP',
            'OP_PA',
            'OA_PP',
            'OA_PA',
            'PREVALENCE',
            'SENSITIVITY',
            'FALSE_NEGATIVE_RATE',
            'SPECIFICITY',
            'FALSE_POSITIVE_RATE',
            'PERCENT_CORRECT',
            'ODDS_RATIO',
            'KAPPA',
        ]
        stats_fh.write(','.join(out_list) + '\n')

        # For each variable, calculate the statistics
        for v in obs.dtype.names:

            # Get the metadata for this field
            try:
                fm = mp.get_attribute(v)
            except:
                err_msg = v + ' is missing metadata.'
                print err_msg
                continue

            # Only continue if this is a continuous species variable
            if fm.field_type != 'CONTINUOUS' or fm.species_attr == 0:
                continue

            obs_vals = getattr(obs, v)
            prd_vals = getattr(prd, v)

            # Create a binary error matrix from the obs and prd data
            stats = statistics.BinaryErrorMatrix(obs_vals, prd_vals)
            counts = stats.counts()

            # Build the list of items for printing
            out_list = [
                v,
                '%d' % counts[0, 0],
                '%d' % counts[0, 1],
                '%d' % counts[1, 0],
                '%d' % counts[1, 1],
                '%.4f' % stats.prevalence(),
                '%.4f' % stats.sensitivity(),
                '%.4f' % stats.false_negative_rate(),
                '%.4f' % stats.specificity(),
                '%.4f' % stats.false_positive_rate(),
                '%.4f' % stats.percent_correct(),
                '%.4f' % stats.odds_ratio(),
                '%.4f' % stats.kappa(),
            ]
            stats_fh.write(','.join(out_list) + '\n')

        stats_fh.close()
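The OP_PP, OP_PA, OA_PP, and OA_PA columns and the derived statistics come from a 2x2 presence/absence error matrix. The sketch below is not the statistics.BinaryErrorMatrix implementation used above; it is a minimal stand-in whose cell layout (row 0 = observed present, column 0 = predicted present, matching the order the counts are printed in) and presence threshold (values greater than zero) are assumptions.

import numpy as np

# Minimal sketch of a 2x2 presence/absence error matrix; cell layout and the
# presence threshold are assumptions, and this is not the project's
# statistics.BinaryErrorMatrix class.
class SimpleBinaryErrorMatrix(object):
    def __init__(self, obs_vals, prd_vals, threshold=0.0):
        o = np.asarray(obs_vals) > threshold
        p = np.asarray(prd_vals) > threshold
        self._c = np.array([
            [np.sum(o & p), np.sum(o & ~p)],      # OP_PP, OP_PA
            [np.sum(~o & p), np.sum(~o & ~p)],    # OA_PP, OA_PA
        ], dtype=float)

    def counts(self):
        return self._c

    def prevalence(self):
        return self._c[0].sum() / self._c.sum()

    def sensitivity(self):
        return self._c[0, 0] / self._c[0].sum()

    def specificity(self):
        return self._c[1, 1] / self._c[1].sum()

    def percent_correct(self):
        return (self._c[0, 0] + self._c[1, 1]) / self._c.sum()

    def kappa(self):
        n = self._c.sum()
        observed = (self._c[0, 0] + self._c[1, 1]) / n
        expected = (self._c[0].sum() * self._c[:, 0].sum() +
                    self._c[1].sum() * self._c[:, 1].sum()) / (n * n)
        return (observed - expected) / (1.0 - expected)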
Example #45
0
    def calculate_neighbors_at_ids(self, id_x_year, id_field='FCID'):
        """
        Run ordination model over the list of IDs sent in and return neighbors
        and distances for each plot

        Parameters
        ----------
        id_x_year : dict
            Dictionary of plot IDs to associated imagery year to know what
            year to run the model

        id_field : str
            Name of the ID field - should be either 'FCID' or 'PLTID'.
            Defaults to 'FCID'

        Returns
        -------
        None (neighbor data stored as self attribute)
        """

        # Alias for self.parameter_parser
        p = self.parameter_parser

        # Ensure the parameter parser is not a PROTOTYPE
        if p.parameter_set not in ('FULL', 'MINIMUM'):
            err_msg = 'Parameter set must be "FULL" or "MINIMUM"'
            raise ValueError(err_msg)

        # Get footprint file
        fp_file = p.footprint_file

        # Check ID field
        if id_field not in ('FCID', 'PLTID'):
            err_msg = id_field + ' accuracy assessment is not currently '
            err_msg += 'supported'
            raise NotImplementedError(err_msg)

        # Get a list of the unique IDs
        ids = np.unique(id_x_year.keys())

        # Get a list of the years over which we need to run models
        years = np.unique(id_x_year.values())

        # Create a dictionary of all plots associated with each model year
        year_ids = {}
        for (k, v) in id_x_year.iteritems():
            try:
                year_ids[v].append(k)
            except KeyError:
                year_ids[v] = [k]

        # This section extracts the ordination variable information from the
        # model XML files and creates a dict of year/variable combinations.
        # Once this dict is created, we only need to extract the spatial data
        # from the unique set of values in this dict and use this crosswalk
        # to get to those values.  This should be efficient from GDAL's
        # perspective to avoid cache thrashing.
        #
        # However, because we don't need all ordination variables' values for
        # all plots (i.e. temporally varying ordination variables), at this
        # point we only want to extract footprints for those variables that are
        # common across all years.  We track the count of times a variable
        # appears across all lists (raster_counts) and if equal to
        # len(years), we extract footprints at this point.
        #
        # For all other variables, we wait until we have a subset of the coords
        # to extract the spatial data

        ord_year_var_dict = {}
        raster_counts = {}
        raster_dict = {}

        for year in years:
            ord_year_var_dict[year] = {}

            # Get the ordination variables specialized for this year
            ord_vars = p.get_ordination_variables(year)

            for (var, path) in ord_vars:
                # For this year, variable combination, store the path to the
                # variable
                ord_year_var_dict[year][var] = path

                # Record this variable in the counts and push to the raster
                # list if it's a new variable
                try:
                    raster_counts[path] += 1
                except KeyError:
                    ds = gdal.Open(path, gdalconst.GA_ReadOnly)
                    raster_dict[path] = [ds, False]
                    raster_counts[path] = 1

        # Retrieve all coordinate records as a recarray
        coords = utilities.csv2rec(p.coordinate_file)

        # Subset this list to just those plots in the model
        id_arr = getattr(coords, id_field)
        coord_list = coords[np.in1d(id_arr, ids)]

        # Retrieve the footprint configurations.  Footprint offsets store the
        # row and column tuples of each pixel within a given footprint.
        # Footprint windows store the upper left coordinate and window size for
        # extraction from GDAL datasets
        fp_parser = footprint.FootprintParser()
        fp_dict = fp_parser.parse(fp_file)
        fp_offsets = {}
        fp_windows = {}
        for (id, data_source, x, y) in coord_list:
            fp_offsets[id] = fp_dict[data_source].offsets
            fp_windows[id] = fp_dict[data_source].window((x, y))

        # Extract footprint information for every ordination variable that is
        # common to all years and store in a dict keyed by ID and raster
        # file name
        fp_value_dict = {}
        for (fn, count) in raster_counts.iteritems():
            if count == len(years):
                print fn
                ds, processed = raster_dict[fn]

                # Get the footprint window values for this dataset
                fp_values = self.get_footprint_values(ds, fp_windows)

                # Change the flag for this dataset to 'processed'
                raster_dict[fn][1] = True

                # Store these footprint values in a dictionary keyed by
                # id and variable file name
                for (id, fp) in fp_values.iteritems():
                    try:
                        fp_value_dict[id][fn] = fp
                    except KeyError:
                        fp_value_dict[id] = {}
                        fp_value_dict[id][fn] = fp

                # Close this dataset - no longer needed
                raster_dict[fn][0] = None

        # Get the ordination model and read it in
        ord_file = p.get_ordination_file()
        lop = lemma_ordination_parser.LemmaOrdinationParser()
        ord_model = lop.parse(ord_file, delimiter=',')

        # Create the imputation model based on the ordination model and the
        # imputation parameters
        imp_model = im.ImputationModel(ord_model,
                                       n_axes=p.number_axes,
                                       use_weightings=p.use_axis_weighting,
                                       max_neighbors=p.max_neighbors)

        # Main loop to iterate over all years
        for year in years:
            print year

            # Get the subset of footprint offsets and windows for this year
            offsets = dict((x, fp_offsets[x]) for x in year_ids[year])
            windows = dict((x, fp_windows[x]) for x in year_ids[year])

            # Extract footprints for any variables that are not common to all
            # years, but specialized for this year
            for (var, fn) in ord_year_var_dict[year].iteritems():
                ds, processed = raster_dict[fn]
                if not processed:
                    print fn

                    # Extract footprint values for this dataset
                    fp_values = self.get_footprint_values(ds, windows)

                    # Set the processed flag to True
                    raster_dict[fn][1] = True

                    # Store these values
                    for (id, fp) in fp_values.iteritems():
                        try:
                            fp_value_dict[id][fn] = fp
                        except KeyError:
                            fp_value_dict[id] = {}
                            fp_value_dict[id][fn] = fp

                    # Close the dataset - no longer needed
                    raster_dict[fn][0] = None

            # At this point, we have all the footprint information needed for
            # this year stored in fp_value_dict.  Now, iterate over each plot
            # in this year and run the imputation for each pixel.  Output is
            # captured at the pixel scale (within zonal_pixel_dict) and
            # for each attribute at the plot scale (within predicted_dict).
            for id in sorted(windows.keys()):

                # Get the footprint values for this plot
                fp_values = []
                for var in ord_model.var_names:
                    fn = ord_year_var_dict[year][var]
                    fp_values.append(fp_value_dict[id][fn])

                # Set up an output instance to capture each pixel's neighbors
                # and distances
                obj = NNFootprint(id)

                # Run the imputation for each pixel in the footprint
                for o in offsets[id]:

                    # Get the ordination variable values for this offset
                    # Store in (1xv) array
                    v = np.array(self.get_values_from_offset(fp_values, o))
                    v = v[np.newaxis, :]

                    # Run the imputation
                    nn_ids, nn_dists = imp_model.get_neighbors(v, id=id)

                    # Append this pixel to the NNFootprint object
                    obj.append(NNPixel(nn_ids, nn_dists))

                # Store the neighbor information
                self.neighbor_data[id] = copy.deepcopy(obj)
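The long comment block in calculate_neighbors_at_ids describes counting how many model years reference each raster path, so that only rasters common to every year are extracted up front while year-specific rasters wait for per-year coordinate subsets. The standalone sketch below mirrors only that counting/crosswalk idea; the function name, variable names, and paths are made up for illustration.

# Standalone sketch of the year/variable crosswalk strategy described above;
# only the counting logic mirrors the approach in calculate_neighbors_at_ids.
def split_common_rasters(ordination_vars_by_year):
    """ordination_vars_by_year: {year: [(var_name, raster_path), ...]}"""
    years = list(ordination_vars_by_year)
    counts = {}
    for year in years:
        for (_, path) in ordination_vars_by_year[year]:
            counts[path] = counts.get(path, 0) + 1
    common = [p for (p, c) in counts.items() if c == len(years)]
    per_year = [p for (p, c) in counts.items() if c < len(years)]
    return common, per_year

# Example with hypothetical paths: the elevation raster is referenced by every
# year and would be extracted once; the yearly imagery bands would wait for
# the per-year coordinate subsets.
# vars_by_year = {
#     2000: [('ELEV', 'dem.img'), ('B5', 'tm2000_b5.img')],
#     2005: [('ELEV', 'dem.img'), ('B5', 'tm2005_b5.img')],
# }
# common, per_year = split_common_rasters(vars_by_year)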
    def run_diagnostic(self):
        # Read the observed and predicted files into numpy recarrays
        obs = utilities.csv2rec(self.observed_file)
        prd = utilities.csv2rec(self.predicted_file)

        # Subset the observed data just to the IDs that are in the
        # predicted file
        obs_keep = np.in1d(
            getattr(obs, self.id_field), getattr(prd, self.id_field))
        obs = obs[obs_keep]

        # Read in the stand attribute metadata
        mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

        # Open the stats file and print out the header lines
        stats_fh = open(self.statistics_file, 'w')
        out_list = [
            'SPECIES',
            'OP_PP',
            'OP_PA',
            'OA_PP',
            'OA_PA',
            'PREVALENCE',
            'SENSITIVITY',
            'FALSE_NEGATIVE_RATE',
            'SPECIFICITY',
            'FALSE_POSITIVE_RATE',
            'PERCENT_CORRECT',
            'ODDS_RATIO',
            'KAPPA',
        ]
        stats_fh.write(','.join(out_list) + '\n')

        # For each variable, calculate the statistics
        for v in obs.dtype.names:

            # Get the metadata for this field
            try:
                fm = mp.get_attribute(v)
            except Exception:
                err_msg = v + ' is missing metadata.'
                print err_msg
                continue

            # Only continue if this is a continuous species variable
            if fm.field_type != 'CONTINUOUS' or fm.species_attr == 0:
                continue

            obs_vals = getattr(obs, v)
            prd_vals = getattr(prd, v)

            # Create a binary error matrix from the obs and prd data
            stats = statistics.BinaryErrorMatrix(obs_vals, prd_vals)
            counts = stats.counts()

            # Build the list of items for printing
            out_list = [
                v,
                '%d' % counts[0, 0],
                '%d' % counts[0, 1],
                '%d' % counts[1, 0],
                '%d' % counts[1, 1],
                '%.4f' % stats.prevalence(),
                '%.4f' % stats.sensitivity(),
                '%.4f' % stats.false_negative_rate(),
                '%.4f' % stats.specificity(),
                '%.4f' % stats.false_positive_rate(),
                '%.4f' % stats.percent_correct(),
                '%.4f' % stats.odds_ratio(),
                '%.4f' % stats.kappa(),
            ]
            stats_fh.write(','.join(out_list) + '\n')

        stats_fh.close()
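Several of these diagnostics subset the observed recarray to the IDs present in the predicted file using np.in1d. The short self-contained sketch below shows that pattern on a small structured array; the field names and values are made up for illustration, and bracket indexing (obs['FCID']) is used in place of the getattr style seen above.

import numpy as np

# Self-contained sketch of the np.in1d subsetting pattern used throughout
# these diagnostics; field names and values are made up for illustration.
obs = np.array([(1, 10.0), (2, 20.0), (3, 30.0)],
               dtype=[('FCID', int), ('BA', float)])
prd = np.array([(2, 19.5), (3, 31.2)],
               dtype=[('FCID', int), ('BA', float)])

# Keep only the observed rows whose ID also appears in the predicted data
obs_keep = np.in1d(obs['FCID'], prd['FCID'])
obs = obs[obs_keep]
print(obs['FCID'])    # -> [2 3]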