Example 1
def raw(h5, files, fields=None):
    """
    Take a list of .fits files and store them in the raw group.

    fields - list of fields to keep. Use a subset for smaller file size.
    """
    raw = h5.create_group('/raw')
    hduL = []
    kicL = []
    qL = []
    for f in files:
        h = fits.open(f)
        hduL += [h]
        kicL += [h[0].header['KEPLERID']]
        qL += [h[0].header['QUARTER']]

    assert np.unique(kicL).size == 1, 'KEPLERID not the same'
    assert np.unique(qL).size == len(qL), 'duplicate quarters'

    h5.attrs['KEPLERID'] = kicL[0]
    for h, q in zip(hduL, qL):
        r = np.array(h[1].data)
        r = modcols(r)
        if fields:
            # Keep only the requested fields so the stored dataset is smaller.
            r = mlab.rec_keep_fields(r, fields)
        raw['Q%i' % q] = r
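These examples lean on the record-array helpers that used to live in matplotlib.mlab (rec_keep_fields, rec_drop_fields, rec_append_fields, csv2rec, rec2csv). Those helpers were removed from modern matplotlib, so on a current install a substitute is needed. A minimal stand-in for rec_keep_fields, sketched with numpy.lib.recfunctions and assuming only the behavior visible in the examples here:

import numpy as np
from numpy.lib import recfunctions as rfn

def rec_keep_fields(rec, names):
    # Multi-field indexing returns a view that keeps the original
    # itemsize; repack_fields drops the padding bytes.
    return rfn.repack_fields(rec[list(names)])

r = np.rec.fromrecords([(1, 2.0, 'a'), (3, 4.0, 'b')], names=['x', 'y', 'z'])
print(rec_keep_fields(r, ['x', 'z']))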
Example 2
def load_aeronet(fname, keep_fields='all', header=False):
    """Load an AERONET level 2.0 CSV file.
    fname: data file name
    keep_fields: 'all' or a list of fields
    header: whether to return header information along with the data.
    """
    std_day = datetime(1900,1,1,0,0,0)
    def date2daynum(datestr):
        the_day = datetime.strptime(datestr, '%d:%m:%Y')
        return float((the_day - std_day).days)

    def time2seconds(timestr):
        h, m, s = [int(t) for t in timestr.split(':')]
        return float(h * 3600 + m * 60 + s)

    def daynum_seconds2datetime(daynum, seconds):
        return std_day + timedelta(days=int(daynum), seconds=int(seconds))

    headlines = []
    with open(fname, 'r') as f:
        for line_i, line in enumerate(f):
            line = line.rstrip()
            if line.startswith('Date(dd-mm-yy'):
                datefield, timefield = [re.sub(r'\W', '', tk)
                                        for tk in line.split(',')[0:2]]
                break
            headlines.append(line)
    skip_header_lines = line_i

    if header:
        headline = ','.join(headlines)
        headerd = dict()
        for attrname, converter in [('location', str), ('long', float), ('lat', float), ('elev', float), ('nmeas', int), ('PI', str), ('email', str)]:
            m = re.search(r'%s.{0,1}=([^,\s]*)' % attrname, headline, flags=re.I)
            if m:
                try:
                    headerd[attrname] = converter(m.group(1))
                except Exception:
                    pass

    rawd = np.genfromtxt(fname, skip_header=skip_header_lines, delimiter=',', names=True, converters={0:date2daynum, 1:time2seconds})
    lend = len(rawd)
    dates = np.zeros(len(rawd), dtype='O')
    for i in range(lend):
        dates[i] = daynum_seconds2datetime(rawd[datefield][i], rawd[timefield][i])

    newd = mlab.rec_append_fields(rawd, 'datetime', dates)
    newd = mlab.rec_drop_fields(newd, [datefield, timefield, 'Last_Processing_Date'])

    if keep_fields != 'all':
        keep_fields = ['datetime'] + keep_fields
        newd = mlab.rec_keep_fields(newd, keep_fields)
    if header:
        return newd, headerd
    else:
        return newd
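A hypothetical call, assuming an AERONET level 2.0 file on disk (the file name and optical-depth field names below are placeholders, not taken from the source):

data, info = load_aeronet('site.lev20',
                          keep_fields=['AOT_500', 'AOT_675'],
                          header=True)
print(info.get('location'), data['datetime'][0])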
Example 3
def qdt(r):
    """
    Small wrapper around rdt that keeps only the fields rdt adds,
    dropping any names already present in the input record array.
    """

    rawFields = list(r.dtype.names)
    r = rdt(r)
    dtFields = list(r.dtype.names)
    dtFields = [f for f in dtFields if f not in rawFields]
    return mlab.rec_keep_fields(r, dtFields)
Example 4
def interesting_out(opts, interesting, data):
    """
    Take a list of interesting fields and a list of record arrays;
    write the records as CSV to opts["out"] (e.g. --out).
    """
    header = True
    from matplotlib import mlab
    for d in data:
        cleaned = mlab.rec_keep_fields(d, interesting)
        # Write the header only for the first record array.
        mlab.rec2csv(cleaned, opts["out"], withheader=header)
        header = False
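A hedged usage sketch: opts['out'] is passed as an open file handle so the successive rec2csv calls append to one file (an assumption about the intended use; a bare path would be reopened, and overwritten, on every iteration):

import numpy as np
recs = [np.rec.fromrecords([(1, 2.0)], names=['a', 'b']),
        np.rec.fromrecords([(3, 4.0)], names=['a', 'b'])]
with open('out.csv', 'w') as fh:
    interesting_out({'out': fh}, ['a'], recs)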
Example 5
def atpy2h5(files, out, diff='all', name='ds'):
    """
    Convert tables in atpy format to an HDF5 file.

    Parameters
    ----------
    files : list of input file names
    out   : output h5 file. If none exists, we create it.
    diff  : list of fields that are stored as stacked arrays. For fields
            that do not differ between files, we store only the first
            element.
    """
    nfiles = len(files)
    t0 = atpy.Table(files[0])
    h5 = File(out)
    ds, ds1d = diffDS(t0.table_name,
                      t0.data.dtype, (nfiles, t0.data.size),
                      h5,
                      diff=diff)

    kicL = []
    nFail = 0
    for i in range(nfiles):
        if np.mod(i, 100) == 0:
            print(i)
        try:
            hdu = pyfits.open(files[i])
            data = hdu[1].data
            kic = hdu[1].header['KEPLERID']
            assert isinstance(kic, int)
            kicL.append(kic)

            if diff != 'all':
                # Store the shared (non-differing) fields once, then keep
                # only the differing fields for the stacked dataset.
                ds1d[:] = mlab.rec_drop_fields(data, diff)
                data = mlab.rec_keep_fields(data, diff)

            ds[i - nFail] = data

        except Exception:
            print(sys.exc_info()[1])
            nFail += 1

    ds.resize(ds.shape[0] - nFail, axis=0)
    kicL = np.array(kicL)
    h5.create_dataset('KIC', data=kicL)
    print("%i files failed" % nFail)
    h5.close()
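A hypothetical invocation (the glob pattern and field names are placeholders; atpy, pyfits, diffDS, and File come from the surrounding project):

import glob
files = sorted(glob.glob('kplr*_llc.fits'))
atpy2h5(files, 'lightcurves.h5', diff=['TIME', 'SAP_FLUX'])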
Example 6
    def create_mag_table(self, outputPath, isocType="pdva", specType="basel"):
        """Create an HDF5 table that describes a set of magnitudes."""
        if os.path.exists(outputPath):
            os.remove(outputPath)
        title = os.path.splitext(os.path.basename(outputPath))[0]
        h5file = tables.openFile(outputPath, mode="w", title=title)
        table = h5file.createTable("/", 'mags', MagTableDef, "Mag Model Table")
        print(h5file)
        docs = self.collection.find({"compute_complete": True,
            "np_data": {"$exists": 1}})
        print("working on %i docs to read" % docs.count())
        lut = get_metallicity_LUT(isocType, specType)
        for doc in docs:
            print("reading", doc['_id'])
            npData = doc['np_data']
            nRows = len(npData)
            # Append model information (about SFH, dust, etc.)
            zmet = doc['pset']['zmet']
            Z = lut[zmet - 1]
            zmets = np.ones(nRows, dtype=float) * Z
            tau = doc['pset']['tau']
            taus = np.ones(nRows, dtype=float) * tau
            npDataAll = mlab.rec_append_fields(npData, ['Z', 'tau'],
                [zmets, taus])
            # Trim the recarray to just the desired fields
            npDataTrim = mlab.rec_keep_fields(npDataAll,
                ['Z', 'tau', 'age', 'mass', 'lbol', 'sfr', 'TMASS_J', 'TMASS_H',
                 'TMASS_Ks', 'MegaCam_u', 'MegaCam_g', 'MegaCam_r', 'MegaCam_i',
                 'MegaCam_z', 'GALEX_NUV', 'GALEX_FUV'])
            for i in range(nRows):
                row = npDataTrim[i]
                print(row['Z'], row['tau'], row['TMASS_J'], row['TMASS_Ks'])
            # Append to HDF5
            table.append(npDataTrim)
        h5file.flush()
        h5file.close()
Example 7
def match_files(pattern, query):
    """
    Return a list of file names of CSV files that satisfy the query
    conditions, where
      pattern ... a file name pattern for files that are to be tested
      query ... a dictionary of key=value conditions
    A file is a match if any row contains entries for all query conditions,
    where each entry in the column labeled with key is identical to the
    corresponding value.
    Specify query as a dictionary in one of these forms:
      dict(k1=v1, k2=v2, k3=v3, ...)
      dict({"k1":v1, "k2":v2, "k3":v3, ...})
      {"k1":v1, "k2":v2, "k3":v3, ...}
    """
    # get all file names that match pattern
    infiles = glob.glob(pattern)
    infiles.sort()
    mlist = []
    # determine the query keys (in lowercase because csv2rec lowercases headers)
    # and query values
    qkeys = list(query.keys())
    qlckeys = [x.lower() for x in qkeys]
    qvalues = list(query.values())
    # check files for patterns
    for f in infiles:
        try:
            d = mlab.csv2rec(f, delimiter='\t')
        except ValueError:
            print(str(f) + " cannot be read by csv2rec")
        else:
            # check if the data contain the necessary columns
            if set(qlckeys) <= set(d.dtype.names):
                darray = mlab.rec_keep_fields(d, qlckeys)
                for row in darray:
                    if list(row) == qvalues:
                        # a match has been found
                        mlist.append(f)
                        break
    # return a list of file names for files with matches
    return mlist
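A hypothetical query (the pattern and column names are placeholders): collect the tab-separated CSV files under data/ in which some row has subject == 'S1' and session == 2:

hits = match_files('data/*.csv', {'subject': 'S1', 'session': 2})
print(hits)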
Example 8
    def run(self):
        # Convert the species and environment matrices to numpy rec arrays
        spp_ra = utilities.csv2rec(self.spp_file)
        env_ra = utilities.csv2rec(self.env_file)

        # Extract the plot IDs from both the species and environment matrices
        # and ensure that they are equal
        spp_plot_ids = getattr(spp_ra, self.id_field)
        env_plot_ids = getattr(env_ra, self.id_field)
        if not np.all(spp_plot_ids == env_plot_ids):
            err_msg = "Species and environment plot IDs do not match"
            raise ValueError(err_msg)

        # Drop the ID column from both arrays
        spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
        env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

        # For the environment matrix, only keep the variables specified
        env_ra = mlab.rec_keep_fields(env_ra, self.variables)

        # Convert these matrices to pure floating point arrays
        spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
        env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

        # Apply transformation if desired
        if self.species_transform == "SQRT":
            spp = np.sqrt(spp)
        elif self.species_transform == "LOG":
            spp = np.log(spp)

        # Create the RDA object
        cca = numpy_ordination.NumpyRDA(spp, env)

        # Open the output file
        numpy_fh = open(self.ord_file, "w")

        # Eigenvalues
        numpy_fh.write("### Eigenvalues ###\n")
        for (i, e) in enumerate(cca.eigenvalues):
            numpy_fh.write("RDA" + str(i + 1) + "," + "%.10f" % e + "\n")
        numpy_fh.write("\n")

        # Print out variable means
        numpy_fh.write("### Variable Means ###\n")
        for (i, m) in enumerate(cca.env_means):
            numpy_fh.write("%s,%.10f\n" % (self.variables[i], m))
        numpy_fh.write("\n")

        # Print out environmental coefficients loadings
        numpy_fh.write("### Coefficient Loadings ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
        numpy_fh.write("VARIABLE," + header_str + "\n")
        for (i, c) in enumerate(cca.coefficients()):
            coeff = ",".join(["%.10f" % x for x in c])
            numpy_fh.write("%s,%s\n" % (self.variables[i], coeff))
        numpy_fh.write("\n")

        # Print out biplot scores
        numpy_fh.write("### Biplot Scores ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
        numpy_fh.write("VARIABLE," + header_str + "\n")
        for (i, b) in enumerate(cca.biplot_scores()):
            scores = ",".join(["%.10f" % x for x in b])
            numpy_fh.write("%s,%s\n" % (self.variables[i], scores))
        numpy_fh.write("\n")

        # Print out species centroids
        numpy_fh.write("### Species Centroids ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
        numpy_fh.write("SPECIES," + header_str + "\n")
        for (i, c) in enumerate(cca.species_centroids()):
            scores = ",".join(["%.10f" % x for x in c])
            numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
        numpy_fh.write("\n")

        # Print out species tolerances
        numpy_fh.write("### Species Tolerances ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
        numpy_fh.write("SPECIES," + header_str + "\n")
        for (i, t) in enumerate(cca.species_tolerances()):
            scores = ",".join(["%.21f" % x for x in t])
            numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
        numpy_fh.write("\n")

        # Print out miscellaneous species information
        numpy_fh.write("### Miscellaneous Species Information ###\n")
        numpy_fh.write("SPECIES,WEIGHT,N2\n")
        species_weights, species_n2 = cca.species_information()
        for i in range(len(species_weights)):
            numpy_fh.write("%s,%.10f,%.10f\n" % (spp_ra.dtype.names[i], species_weights[i], species_n2[i]))
        numpy_fh.write("\n")

        # Print out site LC scores
        numpy_fh.write("### Site LC Scores ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
        numpy_fh.write("ID," + header_str + "\n")
        for (i, s) in enumerate(cca.site_lc_scores()):
            scores = ",".join(["%.10f" % x for x in s])
            numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
        numpy_fh.write("\n")

        # Print out site WA scores
        numpy_fh.write("### Site WA Scores ###\n")
        header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
        numpy_fh.write("ID," + header_str + "\n")
        for (i, s) in enumerate(cca.site_wa_scores()):
            scores = ",".join(["%.10f" % x for x in s])
            numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
        numpy_fh.write("\n")

        # Miscellaneous site information
        numpy_fh.write("### Miscellaneous Site Information ###\n")
        numpy_fh.write("ID,WEIGHT,N2\n")
        site_weights, site_n2 = cca.site_information()
        for i in range(len(site_weights)):
            numpy_fh.write("%s,%.10f,%.10f\n" % (spp_plot_ids[i], site_weights[i], site_n2[i]))

        # Close the file
        numpy_fh.close()
Example 9
    def run_diagnostic(self):

        # Shortcut to the parameter parser
        p = self.parameter_parser

        # ID field
        id_field = p.summary_level + 'ID'

        # Root directory for Riemann files
        root_dir = p.riemann_output_folder

        # Read in hex input file
        obs_data = utilities.csv2rec(self.hex_attribute_file)

        # Get the hexagon levels and ensure that the fields exist in the
        # hex_attribute file
        hex_resolutions = p.riemann_hex_resolutions
        hex_fields = [x[0] for x in hex_resolutions]
        for field in hex_fields:
            if field not in obs_data.dtype.names:
                err_msg = 'Field ' + field + ' does not exist in the '
                err_msg += 'hex_attribute file'
                raise ValueError(err_msg)

        # Create the directory structure based on the hex levels
        hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
        all_levels = ['plot_pixel'] + hex_levels
        for level in all_levels:
            sub_dir = os.path.join(root_dir, level)
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir)

        # Get the values of k
        k_values = p.riemann_k_values

        # Create a dictionary of plot ID to image year (or model_year for
        # non-imagery models) for these plots
        if p.model_type in p.imagery_model_types:
            id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
        else:
            id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

        # Create a PredictionRun instance
        pr = prediction_run.PredictionRun(p)

        # Get the neighbors and distances for these IDs
        pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

        # Create the lookup of id_field to LOC_ID for the hex plots
        nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

        # Create a dictionary between id_field and no_self_assign_field
        # for the model plots
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)
        model_nsa_id_dict = dict((getattr(x, id_field), x.LOC_ID)
            for x in env_data)

        # Stitch the two dictionaries together
        for plot_id in sorted(model_nsa_id_dict.keys()):
            if plot_id not in nsa_id_dict:
                nsa_id_dict[plot_id] = model_nsa_id_dict[plot_id]

        # Get the stand attribute metadata and retrieve only the
        # continuous accuracy attributes
        stand_metadata_file = p.stand_metadata_file
        mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
        attrs = [x.field_name for x in mp.attributes
            if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1]

        # Subset the attributes for fields that are in the
        # hex_attribute file
        attrs = [x for x in attrs if x in obs_data.dtype.names]
        plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

        # Write out the plot_pixel observed file
        file_name = 'plot_pixel_observed.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        utilities.rec2csv(plot_pixel_obs, output_file)

        # Iterate over values of k
        for k in k_values:

            # Construct the output file name
            file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
            file_name += '.csv'
            output_file = os.path.join(root_dir, 'plot_pixel', file_name)
            out_fh = open(output_file, 'w')

            # For the plot/pixel scale, retrieve the independent predicted
            # data for this value of k.  Even though attributes are being
            # returned from this function, we want to use the attribute list
            # that we've already found above.
            prediction_generator = pr.calculate_predictions_at_k(
                k=k, id_field=id_field, independent=True,
                nsa_id_dict=nsa_id_dict)

            # Write out the field names
            out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

            # Write out the predictions for this k
            for plot_prediction in prediction_generator:

                # Write this record to the predicted attribute file
                pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

            # Close this file
            out_fh.close()

        # Create the fields for which to extract statistics at the hexagon
        # levels
        mean_fields = [(id_field, len, 'PLOT_COUNT')]
        mean_fields.extend([(x, np.mean, x) for x in attrs])
        mean_fields = tuple(mean_fields)

        sd_fields = [(id_field, len, 'PLOT_COUNT')]
        sd_fields.extend([(x, np.std, x) for x in attrs])
        sd_fields = tuple(sd_fields)

        stat_sets = {
            'mean': mean_fields,
            'std': sd_fields,
        }

        # For each hexagon level, associate the plots with their hexagon ID
        # and find observed and predicted statistics for each hexagon
        for hex_resolution in hex_resolutions:

            (hex_id_field, hex_distance) = hex_resolution[0:2]
            min_plots_per_hex = hex_resolution[3]
            prefix = 'hex_' + str(hex_distance)

            # Create a crosswalk between the id_field and the hex_id_field
            id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.items():

                # Get the output file name
                obs_out_file = \
                    '_'.join((prefix, 'observed', stat_name)) + '.csv'
                obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

                # Write out the observed file
                self.write_hex_stats(obs_data, hex_id_field, stat_fields,
                    min_plots_per_hex, obs_out_file)

            # Iterate over values of k for the predicted values
            for k in k_values:

                # Open the plot_pixel predicted file for this value of k
                # and join the hex_id_field to the recarray
                prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
                prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
                prd_data = utilities.csv2rec(prd_file)
                prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

                # Iterate over all sets of statistics and write a unique file
                # for each set
                for (stat_name, stat_fields) in stat_sets.items():

                    # Get the output file name
                    prd_out_file = '_'.join((
                        prefix, 'predicted', 'k' + str(k), stat_name)) + '.csv'
                    prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                    # Write out the predicted file
                    self.write_hex_stats(prd_data, hex_id_field, stat_fields,
                        min_plots_per_hex, prd_out_file)

        # Calculate the ECDF and AC statistics
        # For ECDF and AC, it is a paired comparison between the observed
        # and predicted data.  We do this at each value of k and for each
        # hex resolution level.

        # Open the stats file
        stats_file = p.hex_statistics_file
        stats_fh = open(stats_file, 'w')
        header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
        stats_fh.write(','.join(header_fields) + '\n')

        # Create a list of RiemannComparison instances which store the
        # information needed to do comparisons between observed and predicted
        # files for any level or value of k
        compare_list = []
        for hex_resolution in hex_resolutions:
            (hex_id_field, hex_distance) = hex_resolution[0:2]
            prefix = 'hex_' + str(hex_distance)
            obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
            obs_file = os.path.join(root_dir, prefix, obs_file)
            for k in k_values:
                prd_file = '_'.join((
                    prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
                prd_file = os.path.join(root_dir, prefix, prd_file)
                r = RiemannComparison(
                    prefix, obs_file, prd_file, hex_id_field, k)
                compare_list.append(r)

        # Add the plot_pixel comparisons to this list
        prefix = 'plot_pixel'
        obs_file = 'plot_pixel_observed.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
            compare_list.append(r)

        # Do all the comparisons
        for c in compare_list:

            # Open the observed file
            obs_data = utilities.csv2rec(c.obs_file)

            # Open the predicted file
            prd_data = utilities.csv2rec(c.prd_file)

            # Ensure that the IDs between the observed and predicted
            # data line up
            ids1 = getattr(obs_data, c.id_field)
            ids2 = getattr(prd_data, c.id_field)
            if not np.all(ids1 == ids2):
                err_msg = 'IDs do not match between observed and '
                err_msg += 'predicted data'
                raise ValueError(err_msg)

            for attr in attrs:
                arr1 = getattr(obs_data, attr)
                arr2 = getattr(prd_data, attr)
                rv = RiemannVariable(arr1, arr2)

                gmfr_stats = rv.gmfr_statistics()
                for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                        attr, stat.upper(), gmfr_stats[stat])
                    stats_fh.write(stat_line)

                ks_stats = rv.ks_statistics()
                for stat in ('ks_max', 'ks_mean'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                        attr, stat.upper(), ks_stats[stat])
                    stats_fh.write(stat_line)
Example 10
def extract_data(pattern, query, headers, ctypes=None, fname=''):
    """
    Extract data from CSV files whose name matches pattern.
    Every record in a given file is checked to see whether it satisfies the
    query condition(s). If the query condition(s) are satisfied, data from the
    columns specified by headers are extracted from that record. Collected
    records are returned in a numpy record array and, if a filename fname is
    specified, they are also written to fname in tab-separated CSV format.
    If no matching records are found, an empty record array of type bool is
    returned.
    argument:       comment:
    pattern         a file name pattern for files from which records are to be
                    extracted
    query           conditions in the form of a dictionary (list of key-value
                    pairs) that need to be fulfilled for a record to be
                    extracted;
                    the query dictionary is specified in one of these forms:
                      dict(k1=v1, k2=v2, k3=v3, ...)
                      dict({"k1":v1, "k2":v2, "k3":v3, ...})
                      {"k1":v1, "k2":v2, "k3":v3, ...}
    headers         a list of strings specifying the column headers for the
                    columns which are to be extracted
    ctypes          if not None, is a dictionary mapping column number or
                    munged column name to a converter function;
                    the column type converter dictionary can be specified as:
                      {"k1":t1, "k2":t2, "k3":t3, ...}
                    where the t can be, e.g., str, int, float, bool.
    fname           if defined, the name of the CSV file (tab-separated) to
                    which extracted records are written
    """

    # get all file names that match pattern
    infiles = glob.glob(pattern)
    infiles.sort()

    # determine the query and header keys (in lowercase because csv2rec
    # lowercases headers), and query values
    qkeys = list(query.keys())
    qlckeys = [x.lower() for x in qkeys]
    qvalues = list(query.values())
    hlckeys = [x.lower() for x in headers]
    if ctypes:
        ctypes_lc = dict(
            (key.lower(), value) for (key, value) in ctypes.items())
    else:
        ctypes_lc = None
    mkeys = set(qlckeys)
    mkeys = mkeys.union(hlckeys)
    mrows = []

    # check files for query patterns
    for f in infiles:
        d = mlab.csv2rec(f, delimiter='\t', converterd=ctypes_lc)
        # check if the data contain the necessary columns
        if mkeys <= set(d.dtype.names):
            # find the records that match the query
            darray = mlab.rec_keep_fields(d, qlckeys)
            imatch = np.array([False] * darray.size)
            for i in range(darray.size):
                if list(darray[i]) == qvalues:
                    imatch[i] = True
            # get data from records that matched the query
            if any(imatch):
                marray = mlab.rec_keep_fields(d, hlckeys)[imatch]
                for row in marray:
                    mrows.append(row.tolist())

    # write data from matching records to file if requested and return results
    if mrows:
        # The following does not work because the mlab.csv2rec() converterd
        # data type specifications are different from the
        # np.core.records.fromrecords() dtype data type specifications ...
        #results = np.core.records.fromrecords(mrows, dtype=ctypes_lc)
        # ... so, for now we cross our fingers and hope that
        # np.core.records.fromrecords() intuits the data types correctly, which
        # it seems to do (most of the time)
        results = np.core.records.fromrecords(mrows, names=headers)
    else:
        dt = [(h, bool) for h in headers]
        results = np.recarray(0, dtype=dt)
    if fname != '':
        mlab.rec2csv(results, fname, delimiter='\t')
    return results
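A hypothetical extraction (the file pattern, query key, and headers are placeholders): pull the time and temperature columns from rows where sensor == 'A3', writing the matches to a tab-separated file as well:

rows = extract_data('logs/*.csv', {'sensor': 'A3'},
                    ['time', 'temperature'], fname='sensor_a3.csv')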
Example 11
def write_baseline_file(recarray, track):
    '''Write a simple ASCII file with date and baseline columns.'''
    subset = mlab.rec_keep_fields(recarray, ['roidate', 'bperp'])
    # Placeholder added so the track appears in the file name (assumed intent).
    mlab.rec2csv(subset, 'baselines_{}.txt'.format(track),
                 withheader=False, delimiter=' ')
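A hypothetical call ('T100' is a placeholder track identifier; igram_rec is assumed to be a record array carrying 'roidate' and 'bperp' fields):

write_baseline_file(igram_rec, 'T100')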