Example 1
def build_masksample_indices(period, timeframe, mask, min_valid=1):
    '''
    Build resample indices for the supplied timeframe with a validity threshold
    '''
    tc = period_to_tc(period)
    tf = dt.validate_timeframe(timeframe)

    new_p = dt.resample_dti(period, tf, as_period=True)

    _, end_of_p, _ = dt.boundary_funcs(tf)

    s_mask = mask.copy()
    sample_indices = []

    cur_s = 0
    cur_e = 0

    for p in new_p:
        # Positions in the source index covered by this resampled period
        cur_idx = tc.get_index(slice(p.start_time, end_of_p(p.start_time)))
        v_count = (~mask[cur_idx]).sum()
        if v_count >= min_valid:
            cur_e += v_count
            sample_indices.append(slice(cur_s, cur_e))
            cur_s = cur_e
        else:
            # Too few valid samples; mask out the whole period
            s_mask[cur_idx] = True

    return sample_indices, np.where(~s_mask)[0]
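A minimal usage sketch (inputs are hypothetical; this assumes `period_to_tc` accepts a pandas DatetimeIndex and that 'm' is the monthly timeframe string, as in the other examples):

import numpy as np
import pandas as pd

period = pd.date_range('2000-01-01', '2000-12-31', freq='D')
mask = np.zeros(len(period), dtype=bool)  # True marks an invalid sample
mask[5:40] = True                         # e.g. a gap spanning part of Jan/Feb

# One slice per month retaining at least 10 valid days; months below the
# threshold are masked out entirely and excluded from the valid positions.
slices, valid_idx = build_masksample_indices(period, 'm', mask, min_valid=10)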
Example 2
    def stat_percentiles(self, statistic='pearsons_r', freq='m', pctiles=None):
        '''
        Return a summary of percentiles for the specified statistic and timeframe
        '''
        if pctiles is None:
            pctiles = [0, 5, 25, 50, 75, 95, 100]
        tf = dt.validate_timeframe(freq).lower()
        df = pd.DataFrame()

        for m in self._iter_models(freq):
            if statistic == "grand_f":
                m_data = m.stats[tf].loc['fobj', m.stats[tf].columns != 'all']
                try:
                    stats = standard_percentiles(m_data)
                    df[m.name] = pd.Series(
                        index=['grand_f'],
                        data=[(stats['25%'] + stats['50%'] + stats['75%'] +
                               stats['100%']) / 4])
                except IndexError:
                    logger.warning("no stats for model: %s", m.name)
            else:
                m_data = m.stats[tf].loc[statistic,
                                         m.stats[tf].columns != 'all']
                try:
                    df[m.name] = standard_percentiles(m_data, pctiles)
                except IndexError:
                    logger.warning("no stats for model: %s", m.name)

        return df.transpose()
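A hypothetical call, assuming `bench` is an instance of the benchmarking class these methods belong to:

# Percentiles of monthly Pearson's r per model (models become rows)
summary = bench.stat_percentiles(statistic='pearsons_r', freq='m',
                                 pctiles=[5, 25, 50, 75, 95])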
Example 3
def build_resample_index(period, timeframe, window=None):
    '''
    Return (slice/integer) indices matching the boundaries of a resampled period.
    Optionally supply a window period (i.e. only produce indices within the window)
    '''
    tc = period_to_tc(period)
    tf = dt.validate_timeframe(timeframe)

    if window is None:
        window = period

    new_p = dt.resample_dti(window, tf, as_period=True)

    _, end_of_p, _ = dt.boundary_funcs(tf)

    # Match query timestamps to the dtype of the source index
    if isinstance(period, pd.PeriodIndex):
        def enforce_freq(ts):
            return ts.to_period(period.freq)
    else:
        def enforce_freq(ts):
            return ts

    indices = []
    for p in new_p:
        s = enforce_freq(p.start_time)
        e = enforce_freq(end_of_p(p.start_time))
        indices.append(tc.get_index(slice(s, e)))

    return indices
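A usage sketch under the same assumptions as Example 1 (daily source index, 'm' as the monthly timeframe); the optional window restricts the output to a single year:

import pandas as pd

period = pd.date_range('2000-01-01', '2002-12-31', freq='D')
window = pd.date_range('2001-01-01', '2001-12-31', freq='D')

# One entry per month of 2001, each indexing daily positions in `period`
monthly_idx = build_resample_index(period, 'm', window=window)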
Example 4
    def data_percentiles(self, freq='m', pctiles=None):
        '''
        Return a summary of percentiles for the actual data values
        '''
        if pctiles is None:
            pctiles = [0, 5, 25, 50, 75, 95, 100]
        tf = dt.validate_timeframe(freq).lower()
        df = pd.DataFrame()

        if freq == 'd':  # obs won't match model.obs since different obs.valid_idx for each model
            obs_series = self.obs.data.mean().values.flatten()
        else:
            pd_tf = dt.pandas_tf_dict[tf]
            obs_series = self.obs.data.resample(
                rule=pd_tf, how=self.aggr_how).mean().values.flatten()

        df[self.ref_name] = standard_percentiles(obs_series, pctiles)

        for m in self._iter_models(freq):
            m_data = pd.DataFrame.from_dict(m.data[tf]).mean().values.flatten()
            try:
                df[m.name] = standard_percentiles(m_data, pctiles)
            except IndexError:
                logger.warning("no stats for model: %s", m.name)

        return df.transpose()
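A hypothetical call on the same `bench` instance, comparing the distributions of observed and modelled values at the monthly timeframe:

pct = bench.data_percentiles(freq='m', pctiles=[10, 50, 90])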
Example 5
    def plot_box(self, statistic, freq='m', **kwargs):
        '''
        Show a box-plot for the specified statistic and timeframe
        '''
        tf = dt.validate_timeframe(freq).lower()

        ax = self._get_ax(kwargs)

        data = []
        colours = []
        names = []

        for m in self._iter_models(freq):
            data.append(m.stats[tf].loc[statistic,
                                        m.stats[tf].columns != 'all'])
            colours.append(m.colour)
            names.append(m.name)

        box = ax.boxplot(data, patch_artist=True)

        ax.set_ylabel(statistic)

        for patch, colour in zip(box['boxes'], colours):
            patch.set_facecolor(colour)

        ax.set_xticklabels(names, rotation=90, fontsize=8)

        # Forward remaining kwargs to the axes, skipping any it rejects
        for k, v in kwargs.items():
            try:
                ax.set(**{k: v})
            except Exception:
                pass
        ax.grid()
        return ax, box
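A hypothetical call; extra keyword arguments are forwarded to ax.set() and silently skipped if the axes rejects them:

ax, box = bench.plot_box('pearsons_r', freq='m', ylim=(0, 1))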
Example 6
    def stat(self, statistic='mean', freq='m'):
        '''
        Return a DataFrame of the specified statistic for each model
        '''
        tf = dt.validate_timeframe(freq).lower()
        df = pd.DataFrame()

        m = None
        for m in self._iter_models(freq):
            df[m.name] = m.stats[tf].loc[statistic]
        if statistic == 'mean' and m is not None:
            # obs_mean is read from the last iterated model's stats table
            df[self.ref_name] = m.stats[tf].loc['obs_mean']

        return df
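A hypothetical call returning a sites-by-models DataFrame of monthly means, with the observed mean appended as a final column:

monthly_means = bench.stat('mean', freq='m')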
Example 7
    def plot_regression(self,
                        site=None,
                        freq='m',
                        title="",
                        size=20,
                        **kwargs):
        '''
        Plot the model regression(s) for the specified site and frequency
        '''
        if site is None:
            site = list(self.obs.data.columns)
            stats_index = 'all'
        else:
            stats_index = site
            site = [site]

        tf = dt.validate_timeframe(freq).lower()

        ax = self._get_ax(kwargs)

        for m in self._iter_models(freq):
            # Only plot sites this model actually has data for
            _site_list = [_site for _site in site if _site in m.data[tf]]

            model_data = pd.DataFrame.from_dict(m.data[tf])[_site_list]
            obs_data = pd.DataFrame.from_dict(m.obs[tf])[_site_list]
            ax.scatter(obs_data, model_data, color=m.colour, s=size)

        ax.set_ylabel('model ' + self.var_name)
        ax.set_xlabel(str(self.ref_name))
        if stats_index == 'all':
            ax.set_title(title)
        else:
            # A single site was requested; include it in the title
            ax.set_title(title + " %s" % stats_index)

        ax.set(**kwargs)
        ax.grid()

        # plot regression lines and 1:1 line
        rl = get_ax_limit(ax)

        for m in self._iter_models(freq):
            try:
                mstats = m.stats[tf][stats_index]
            except KeyError:
                continue
            regress_line = mstats.loc[
                'r_intercept'] + rl * mstats.loc['r_slope']
            ax.plot(rl, regress_line, color=m.colour, label=m.name)
        ax.plot(rl, rl, linestyle='--', color='black', label='1:1')

        ax.legend(loc='best')
        return ax
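Hypothetical calls; the site ID is illustrative only:

# All sites pooled; regression lines come from the 'all' stats column
ax = bench.plot_regression(freq='m', title='Monthly regression')

# A single site; stats are looked up under that site's column
ax = bench.plot_regression(site='421001', freq='y', title='Annual regression')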
Example 8
    def plot_timeseries(self, site, freq='m', model=None, **kwargs):
        '''
        Plot timeseries of data at the specified site and frequency
        '''
        ax = self._get_ax(kwargs)

        def plot(series, label, colour):
            # +++ fix for pandas 0.16.1 legend label bug
            # (see https://github.com/pydata/pandas/issues/10119)
            series.name = label
            series.plot(legend=True, color=colour)

        if freq == 'raw':
            plot(series=self.obs.data[site], label=self.ref_name,
                 colour='black')
            for name in self.selection():
                m = self.models[name]
                plot(series=m.data.raw[site], label=m.name, colour=m.colour)
        else:
            tf = dt.validate_timeframe(freq).lower()
            _freq = 'A' if freq == 'y' else freq

            if model is not None:
                if model not in self.models:
                    logger.critical("%s not found in %s", model, self.models)
                    return None
                m = self.models[model]
                plot(series=m.obs[tf][site].resample(_freq).asfreq(),
                     label=self.ref_name, colour='black')
                plot(series=m.data[tf][site].resample(_freq).asfreq(),
                     label=m.name, colour=m.colour)
            else:
                if freq == 'd':
                    plot(series=self.obs.data[site].resample(_freq).asfreq(),
                         label=self.ref_name, colour='black')
                elif freq == 'm':
                    plot(series=self.obs.monthly[site].resample(_freq).asfreq(),
                         label=self.ref_name, colour='black')
                elif freq == 'y':
                    plot(series=self.obs.annual[site].resample(_freq).asfreq(),
                         label=self.ref_name, colour='black')

                for m in self._iter_models(freq):
                    try:
                        plot(series=m.data[tf][site].resample(_freq).asfreq(),
                             label=m.name, colour=m.colour)
                    except Exception:
                        logger.warning("no data to plot for %s site %s",
                                       m.name, site)

        ax.legend(loc='best')
        ax.set_title("%s" % site)
        ax.set_ylabel(self.var_name)
        ax.set(**kwargs)
        ax.grid()
        return ax
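Hypothetical calls showing the three modes (raw data, resampled comparison across all models, and a single model's paired obs/data; the model name is illustrative):

ax = bench.plot_timeseries('421001', freq='raw')
ax = bench.plot_timeseries('421001', freq='m')
ax = bench.plot_timeseries('421001', freq='m', model='awral')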
Example 9
    def __init__(self, freq, method, coordinates):
        self.in_coords = coordinates
        self.freq = dt.validate_timeframe(freq)
        if method == 'sum':
            self.method = np.sum
        elif method == 'mean':
            self.method = np.mean
        else:
            raise ValueError("method must be 'sum' or 'mean'")
        out_period = dt.resample_dti(self.in_coords.time.index, freq)
        self.out_coords = CoordinateSet([
            TimeCoordinates(awrams_time, out_period), coordinates.latitude,
            coordinates.longitude
        ])
Example 10
    def plot_cdf(self, statistic='pearsons_r', freq='m', **kwargs):
        '''
        Plot the empirical CDF for the specified statistic and frequency
        '''
        tf = dt.validate_timeframe(freq).lower()

        ax = self._get_ax(kwargs)

        for m in self._iter_models(freq):
            # Drop NaNs before sorting (temporary fix for broken CDFs)
            y = sorted(m.stats[tf].loc[statistic,
                                       m.stats[tf].columns != 'all'].dropna())
            ax.plot(np.linspace(0, 1., len(y)), y, color=m.colour, label=m.name)

        ax.set_xlabel("Catchments below (%)")
        ax.set_ylabel(statistic)
        ax.legend(loc='best')
        ax.set(**kwargs)
        ax.grid()
        return ax
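A hypothetical call; note the x values run from 0 to 1 (fraction of catchments below each statistic value):

ax = bench.plot_cdf(statistic='pearsons_r', freq='m', ylim=(-1, 1))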
Example 11
def resample_data(in_path,
                  in_pattern,
                  variable,
                  period,
                  out_path,
                  to_freq,
                  method,
                  mode='w',
                  enforce_mask=True,
                  extent=None,
                  use_weights=False):
    '''
    method is 'sum' or 'mean'
    if no extent is supplied then the full (unmasked) input will be used
    'use_weights' should be set for unequally binned conversions (monthly->annual means, for example)
    '''
    from glob import glob
    import multiprocessing as mp
    import time

    import numpy as np

    from awrams.utils.messaging import reader as nr
    from awrams.utils.messaging import writer as nw
    from awrams.utils.messaging.brokers import OrderedFanInChunkBroker, FanOutChunkBroker
    from awrams.utils.messaging.general import message
    from awrams.utils.messaging.buffers import create_managed_buffers
    from awrams.utils.processing.chunk_resampler import ChunkedTimeResampler
    from awrams.utils.catchments import subdivide_extent
    from awrams.utils import datetools as dt
    from awrams.utils import mapping_types as mt
    from awrams.utils.io import data_mapping as dm

    start = time.time()

    NWORKERS = 2
    read_ahead = 3
    writemax = 3
    BLOCKSIZE = 128
    nbuffers = (NWORKERS * 2) + read_ahead + writemax

    '''
    Build the 'standard queues'
    This should be wrapped up somewhere else for
    various topologies...
    '''

    # Receives all messages from clients
    control_master = mp.Queue()

    worker_q = mp.Queue()
    for i in range(NWORKERS):
        worker_q.put(i)

    #Reader Queues
    chunk_out_r = mp.Queue(read_ahead)
    reader_in = dict(control=mp.Queue())
    reader_out = dict(control=control_master, chunks=chunk_out_r)

    #Writer Queues
    chunk_in_w = mp.Queue(writemax)
    writer_in = dict(control=mp.Queue(), chunks=chunk_in_w)
    writer_out = dict(control=control_master)

    #FanIn queues
    fanout_in = dict(control=mp.Queue(), chunks=chunk_out_r, workers=worker_q)
    fanout_out = dict(control=control_master)

    fanin_in = dict(control=mp.Queue())
    fanin_out = dict(control=control_master, out=chunk_in_w, workers=worker_q)

    #Worker Queues
    work_inq = []
    work_outq = []

    for i in range(NWORKERS):
        work_inq.append(mp.Queue())
        fanout_out[i] = work_inq[-1]

        work_outq.append(mp.Queue())
        fanin_in[i] = work_outq[-1]
    '''
    End standard queues...
    '''

    infiles = glob(in_path + '/' + in_pattern)
    if len(infiles) > 1:
        ff = dm.filter_years(period)
    else:
        ff = None

    sfm = dm.SplitFileManager.open_existing(in_path,
                                            in_pattern,
                                            variable,
                                            ff=ff)
    in_freq = sfm.get_frequency()

    split_periods = [period]
    if hasattr(in_freq, 'freqstr'):
        if in_freq.freqstr == 'D':
            #Force splitting so that flat files don't end up getting loaded entirely into memory!
            #Also a bit of a hack to deal with PeriodIndex/DTI issues...
            split_periods = dt.split_period(
                dt.resample_dti(period, 'd', as_period=False), 'a')

    in_periods = [dt.resample_dti(p, in_freq) for p in split_periods]
    in_pmap = sfm.get_period_map_multi(in_periods)

    out_periods = []
    for p in in_periods:
        out_periods.append(dt.resample_dti(p, to_freq))

    if extent is None:
        extent = sfm.ref_ds.get_extent(True)
        if extent.mask.size == 1:
            # Broadcast a scalar mask to the full grid shape
            extent.mask = (np.ones(extent.shape) * extent.mask).astype(bool)

    sub_extents = subdivide_extent(extent, BLOCKSIZE)
    chunks = [nr.Chunk(*s.indices()) for s in sub_extents]

    out_period = dt.resample_dti(period, to_freq)
    out_cs = mt.gen_coordset(out_period, extent)

    v = mt.Variable.from_ncvar(sfm.ref_ds.awra_var)
    in_dtype = sfm.ref_ds.awra_var.dtype

    sfm.close_all()

    if method == 'mean':
        if dt.validate_timeframe(in_freq) == 'MONTHLY':
            # Monthly bins are unequal in length, so means must be weighted
            use_weights = True
    '''
    Need a way of formalising multiple buffer pools for different classes of
    work..
    '''

    max_inplen = max([len(p) for p in in_periods])
    bufshape = (max_inplen, BLOCKSIZE, BLOCKSIZE)

    shared_buffers = {}
    shared_buffers['main'] = create_managed_buffers(nbuffers,
                                                    bufshape,
                                                    build=False)

    mvar = mt.MappedVariable(v, out_cs, in_dtype)
    sfm = dm.FlatFileManager(out_path, mvar)

    CLOBBER = mode == 'w'

    sfm.create_files(False, CLOBBER, chunksize=(1, BLOCKSIZE, BLOCKSIZE))

    outfile_maps = {
        v.name:
        dict(nc_var=v.name, period_map=sfm.get_period_map_multi(out_periods))
    }
    infile_maps = {v.name: dict(nc_var=v.name, period_map=in_pmap)}

    reader = nr.StreamingReader(reader_in, reader_out, shared_buffers,
                                infile_maps, chunks, in_periods)
    writer = nw.MultifileChunkWriter(writer_in,
                                     writer_out,
                                     shared_buffers,
                                     outfile_maps,
                                     sub_extents,
                                     out_periods,
                                     enforce_mask=enforce_mask)

    fanout = FanOutChunkBroker(fanout_in, fanout_out)
    fanin = OrderedFanInChunkBroker(fanin_in, fanin_out, NWORKERS, len(chunks))

    fanout.start()
    fanin.start()

    workers = []
    w_control = []
    for i in range(NWORKERS):
        w_in = dict(control=mp.Queue(), chunks=work_inq[i])
        w_out = dict(control=control_master, chunks=work_outq[i])
        w = ChunkedTimeResampler(w_in,
                                 w_out,
                                 shared_buffers,
                                 sub_extents,
                                 in_periods,
                                 to_freq,
                                 method,
                                 enforce_mask=enforce_mask,
                                 use_weights=use_weights)
        workers.append(w)
        w_control.append(w_in['control'])
        w.start()

    writer.start()
    reader.start()

    writer.join()

    fanout_in['control'].put(message('terminate'))
    fanin_in['control'].put(message('terminate'))

    for i in range(NWORKERS):
        w_control[i].put(message('terminate'))

    for x in range(4):
        control_master.get()

    for i in range(NWORKERS):
        workers[i].join()
        control_master.get()

    reader.join()
    fanout.join()
    fanin.join()

    end = time.time()
    logger.info("elapsed time: %ss", end - start)
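A minimal invocation sketch (paths, file pattern and variable name are hypothetical; assumes `period` is a pandas DatetimeIndex covering the daily input data):

import pandas as pd

period = pd.date_range('2000-01-01', '2010-12-31', freq='D')
resample_data('/data/awra/daily', 'qtot*.nc', 'qtot', period,
              '/data/awra/monthly', to_freq='m', method='mean')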