def __init__(self):
    """Set up default patterns, lookup tables and parsing options."""
    # Record and header-approval patterns start out unset.
    self._record_regex = None
    self._header_approval_regexes = None
    # One header pattern: a ``key = value`` pair made of word characters,
    # where either side may be wrapped in double quotes.
    self._header_regexs = [
        r'"?(?P<key>\w*)"?\s?=\s?"?(?P<value>\w*)"?',
    ]
    # Lookup tables for property keys and values; only "m" -> "meter" is
    # registered by default.
    self._prop_key_table = {}
    self._prop_value_table = {"m": "meter"}
    self.n_lines_to_check = 300
    # Default timezone table
    self._timezone_table = {
        'PST': vtt.hours(0),
        'PDT': vtt.hours(-1),
        'LST': vtt.hours(0),
    }
    self._comment_indicators = '#'
def plot_metrics_to_figure(fig, tss, title=None, window_inst=None,
                           window_avg=None, labels=None,
                           max_shift=hours(2),
                           period=minutes(int(12.24 * 60)),
                           label_loc=1, legend_size=12):
    """
    Draw a metrics plot on an existing figure.

    Parameters
    ----------
    fig: matplotlib.figure.Figure
        figure that receives one subplot per entry of the metrics grid
    tss: sequence of time series
        series to plot; the unit is taken from the first one and the
        metrics are labeled with ``labels[1:]``
    title: str, optional
        title set on the 'inst' axes
    window_inst, window_avg: optional
        windows handed to the instantaneous/averaged plot; when
        ``window_avg`` is given the series are also clipped to it before
        lags and metrics are calculated
    labels: list, optional
        labels of the series. If None, the 'label' property of each series
        is used.
    max_shift, period: optional
        passed to the lag calculation
    label_loc, legend_size: optional
        passed to the instantaneous/averaged plot

    Returns
    -------
    matplotlib.figure.Figure
    """
    grid_specs = gen_metrics_grid()
    axes = {name: fig.add_subplot(spec)
            for name, spec in grid_specs.items()}

    if labels is None:
        labels = [ts.props.get('label') for ts in tss]
    plot_inst_and_avg(axes, tss, window_inst, window_avg, labels, label_loc,
                      legend_size)
    if title is not None:
        axes['inst'].set_title(title)

    # Lags and metrics are evaluated over the averaging window when given
    if window_avg is None:
        tss_clipped = tss
    else:
        tss_clipped = [safe_window(ts, window_avg) for ts in tss]
    lags = calculate_lag_of_tss(tss_clipped, max_shift, period)
    metrics, tss_scatter = calculate_metrics(tss_clipped, lags)
    if tss_scatter is not None:
        plot_scatter(axes['scatter'], tss_scatter)

    first_ts = tss[0]
    if first_ts is not None:
        unit = first_ts.props.get('unit')
    else:
        unit = None
    metrics_text = gen_metrics_string(metrics, labels[1:], unit)
    write_metrics_string(axes['inst'], metrics_text)
    return fig
def separate_species(ts,noise_thresh_min=40):
    """Split a series into subtidal, diurnal, semidiurnal and noise parts.

    Input:
         ts: timeseries to be decomposed into species, assumed to be
         at six minute intervals. The filters used are long, so avoid
         missing data and allow for four extra days worth of data on each
         end.
         noise_thresh_min: cutoff period, in minutes, of the first
         (denoising) filter.

    Output:
          four regular time series, representing subtidal, diurnal,
          semi-diurnal and noise
    """
    # Whatever the first low-pass filter removes is treated as noise.
    denoised = cosine_lanczos(ts, cutoff_period=minutes(noise_thresh_min))
    noise = ts - denoised

    # filter_len=900 below is a sample count chosen for 6 minute data,
    # i.e. a 90 hour filter. That is much longer than the default because
    # the species have to be separated sharply.
    assert ts.index.freq == minutes(6)

    # 14.5 hours = 870 min: diurnal and slower stay, the remainder is
    # semidiurnal and faster.
    diurnal_and_lower = cosine_lanczos(denoised,
                                       cutoff_period=minutes(870),
                                       filter_len=900)
    semidiurnal_and_higher = denoised - diurnal_and_lower

    # The subtidal part is filtered from the denoised series, not from the
    # diurnal-and-lower band, again with the 90 hour filter.
    subtidal = cosine_lanczos(denoised, cutoff_period=hours(40),
                              filter_len=900)
    diurnal = diurnal_and_lower - subtidal
    return subtidal, diurnal, semidiurnal_and_higher, noise
def test_lanczos_cos_filter_len(self):
    """ Test the filter length api of the cosine lanczos filter.

    A filter length given as a number of samples (24) and as a time
    interval (one day, on hourly data) must give identical results, and a
    filter length of an unsupported type must raise TypeError.
    """
    # numpy.math was only an alias of the standard math module and has been
    # removed from recent NumPy releases, so the signal is built with the
    # numpy.cos ufunc over an index array instead.
    i = numpy.arange(1000)
    data = (2.0 * numpy.cos(2 * pi * i / 5 + 0.8)
            + 3.0 * numpy.cos(2 * pi * i / 45 + 0.1)
            + 7.0 * numpy.cos(2 * pi * i / 55 + 0.3))
    st = datetime.datetime(year=1990, month=2, day=3, hour=11, minute=15)
    delta = time_interval(hours=1)
    ts = rts(data, st, delta, {})

    # Filter length as a number of samples
    filter_len = 24
    t1 = cosine_lanczos(ts, cutoff_period=hours(30), filter_len=filter_len)

    # Same filter length expressed as a time interval
    filter_len = days(1)
    t2 = cosine_lanczos(ts, cutoff_period=hours(30), filter_len=filter_len)
    assert_array_equal(t1.data, t2.data)

    # A string is not an acceptable filter length
    filter_len = "invalid"
    self.assertRaises(TypeError, cosine_lanczos, ts,
                      cutoff_period=hours(30), filter_len=filter_len)
def test_lanczos_cos_filter_len_api(self):
    """ Check the default filter length against an explicit one"""
    ## Signal: sum of two sine waves with frequency of 5 and 250HZ,
    ## sampled at 2000HZ
    sample_t = numpy.linspace(0, 1.0, 2001)
    low = numpy.sin(2 * numpy.pi * 5 * sample_t)
    high = numpy.sin(2 * numpy.pi * 250 * sample_t)
    signal = low + high
    start = datetime.datetime(year=1990, month=2, day=3, hour=11,
                              minute=15)
    step = time_interval(hours=1)
    ts = rts(signal, start, step, {})

    ## No filter length given: the filter picks its own default
    default_len_result = cosine_lanczos(ts, cutoff_period=hours(40),
                                        padtype="even")
    self.assertTrue(default_len_result.is_regular())

    ## The default length is expected to be 40*1.25=50; passing it
    ## explicitly must reproduce the default result exactly
    explicit_len_result = cosine_lanczos(ts, cutoff_period=hours(40),
                                         filter_len=50, padtype="even")
    largest_diff = numpy.abs(default_len_result.data
                             - explicit_len_result.data).max()
    self.assertEqual(largest_diff, 0)
def fill_gaps(ts, max_gap_to_fill=None):
    """Fill gaps in a time series by time-based interpolation.

    Parameters
    ----------
    ts:
        time series to fill. It needs an index with a ``freq``, an
        ``interpolate`` method and a ``unit`` attribute.
        (presumably a pandas Series/DataFrame -- TODO confirm)
    max_gap_to_fill: optional
        longest gap to fill, in a type that can be divided by the index
        frequency. If None or zero hours, ``ts`` is returned untouched.

    Returns
    -------
    the input itself when nothing is to be filled, otherwise the
    interpolated series carrying the ``unit`` of the input

    Raises
    ------
    ValueError
        if ``max_gap_to_fill`` cannot be divided by the index frequency, or
        if it is shorter than one time step
    """
    if max_gap_to_fill is None or max_gap_to_fill == hours(0):
        return ts
    freq = ts.index.freq
    try:
        # Number of consecutive missing steps that may be filled
        limit = int(max_gap_to_fill / freq)
    except Exception as err:
        # Unlike a bare except, this leaves KeyboardInterrupt and SystemExit
        # alone; the original failure is kept as the cause.
        raise ValueError(
            "could not divide max_gap_to_fill by freq: {}".format(
                freq)) from err
    if limit < 1:
        raise ValueError("max_gap_to_fill must be longer than time step")
    # The unit is re-attached to the interpolated result explicitly
    unit = ts.unit
    ts = ts.interpolate(method='time', limit=limit)
    ts.unit = unit
    return ts
def plot_metrics_to_figure(fig, tss, title=None, window_inst=None,
                           window_avg=None, labels=None,
                           max_shift=hours(4),
                           period=minutes(int(12.24 * 60)),
                           label_loc=1, legend_size=12):
    """
    Draw a metrics plot on an existing figure.

    Parameters
    ----------
    fig: matplotlib.figure.Figure
        figure that receives one subplot per entry of the metrics grid
    tss: sequence of time series
        series to plot; the unit is read from the second one
    title: str, optional
        title set on the 'inst' axes
    window_inst, window_avg: optional
        windows handed to the instantaneous/averaged plot; when
        ``window_avg`` is given the series are also clipped to it before
        lags and metrics are calculated
    labels: list
        labels of the series; ``labels[1:]`` are used for the metrics text
    max_shift: optional
        passed to the lag calculation
    period: optional
        accepted but not used in this function
    label_loc, legend_size: optional
        passed to the instantaneous/averaged plot

    Returns
    -------
    matplotlib.figure.Figure
    """
    grid_specs = gen_metrics_grid()
    axes = {name: fig.add_subplot(spec)
            for name, spec in grid_specs.items()}
    plot_inst_and_avg(axes, tss, window_inst, window_avg, labels, label_loc,
                      legend_size)
    if title is not None:
        axes['inst'].set_title(title)

    # Lags and metrics are evaluated over the averaging window when given
    if window_avg is None:
        tss_clipped = tss
    else:
        tss_clipped = [safe_window(ts, window_avg) for ts in tss]
    lags = calculate_lag_of_tss(tss_clipped, max_shift, minutes(1))
    metrics, tss_scatter = calculate_metrics(tss_clipped, lags)

    unit = tss[1].unit  # Get from the simulation
    if tss_scatter is not None:
        # Tag the scatter pair with the unit before drawing it
        if tss_scatter[0] is not None:
            for scatter_ts in (tss_scatter[0], tss_scatter[1]):
                scatter_ts.unit = unit
        plot_scatter(axes['scatter'], tss_scatter)

    metrics_text = gen_metrics_string(metrics, labels[1:], unit)
    write_metrics_string(axes['inst'], metrics_text)
    return fig
def test_lanczos_cos_filter_nan(self):
    """ Test how a single nan spreads through the cosine lanczos filter.

    One nan is planted in the input; the filtered output is expected to be
    nan exactly on the indices nanloc - 2*m through nanloc + 2*m
    (inclusive), where m is the filter length.
    """
    # numpy.math was only an alias of the standard math module and has been
    # removed from recent NumPy releases, so the signal is built with the
    # numpy.cos ufunc over an index array instead.
    i = numpy.arange(1000)
    data = (2.0 * numpy.cos(2 * pi * i / 5 + 0.8)
            + 3.0 * numpy.cos(2 * pi * i / 45 + 0.1)
            + 7.0 * numpy.cos(2 * pi * i / 55 + 0.3))
    nanloc = 336
    data[nanloc] = numpy.nan
    st = datetime.datetime(year=1990, month=2, day=3, hour=11, minute=15)
    delta = time_interval(hours=1)
    ts = rts(data, st, delta, {})
    m = 20
    nt1 = cosine_lanczos(ts, cutoff_period=hours(30), filter_len=m,
                         padtype="even")
    ## result should have nan from nanloc-2*m to nanloc+2*m, both included
    nanidx = numpy.where(numpy.isnan(nt1.data))[0]
    nanidx_should_be = numpy.arange(nanloc - 2 * m, nanloc + 2 * m + 1)
    assert_array_equal(nanidx, nanidx_should_be)
def test_lanczos_cos_filter_phase_neutral(self):
    """ Test the phase neutrality of cosine lanczos filter"""
    ## Signal: sum of two sine waves with frequency of 5 and 250HZ,
    ## sampled at 2000HZ
    sample_t = numpy.linspace(0, 1.0, 2001)
    low = numpy.sin(2 * numpy.pi * 5 * sample_t)
    high = numpy.sin(2 * numpy.pi * 250 * sample_t)
    signal = low + high
    start = datetime.datetime(year=1990, month=2, day=3, hour=11,
                              minute=15)
    step = time_interval(hours=1)
    ts = rts(signal, start, step, {})

    ## With a cutoff period of 30 hours the filtered result should be
    ## approximately the low frequency component, with no phase shift
    filtered = cosine_lanczos(ts, cutoff_period=hours(30), filter_len=20,
                              padtype="odd")
    largest_diff = numpy.abs(filtered.data - low).max()
    self.assertAlmostEqual(largest_diff, 0, places=1)
def filter_timeseries(tss, cutoff_period=hours(40)):
    """
    Filter time series with the cosine-Lanczos filter.

    Parameters
    ----------
    tss: iterable of time series or None
        series to filter; None entries are passed through as None
    cutoff_period: optional
        cutoff period handed to the filter

    Returns
    -------
    list of vtools.data.timeseries.TimeSeries
        filtered time series, in input order. Each one is tagged with
        ``filtered = 'cosine_lanczos'`` and the unit of its source.
    """
    def _filter_one(ts):
        # Keep missing series as placeholders so positions stay aligned
        if ts is None:
            return None
        smoothed = cosine_lanczos(ts, cutoff_period=cutoff_period)
        smoothed.filtered = 'cosine_lanczos'
        smoothed.unit = ts.unit
        return smoothed

    return [_filter_one(ts) for ts in tss]
def test_lanczos_cos_filter_period_freq_api(self):
    """ Check that cutoff period and cutoff frequency are equivalent"""
    ## Signal: sum of two sine waves with frequency of 5 and 250HZ,
    ## sampled at 2000HZ
    sample_t = numpy.linspace(0, 1.0, 2001)
    low = numpy.sin(2 * numpy.pi * 5 * sample_t)
    high = numpy.sin(2 * numpy.pi * 250 * sample_t)
    signal = low + high
    start = datetime.datetime(year=1990, month=2, day=3, hour=11,
                              minute=15)
    step = time_interval(hours=1)
    ts = rts(signal, start, step, {})

    ## Cutoff given as a period
    by_period = cosine_lanczos(ts, cutoff_period=hours(30), filter_len=20,
                               padtype="even")

    ## Same cutoff given as a frequency, expressed as a ratio of the
    ## nyquist frequency, which is 1/0.5/hours
    cutoff_frequency = 2.0 / 30
    by_frequency = cosine_lanczos(ts, cutoff_frequency=cutoff_frequency,
                                  filter_len=20, padtype="even")

    largest_diff = numpy.abs(by_period.data - by_frequency.data).max()
    self.assertEqual(largest_diff, 0)
def read(self, fpath, start=None, end=None, force_regular=True,
         selector=None):
    """ Read a text file with the given pattern and parsers.
        Parsers and a pattern must be defined and set in the child class.

        Parameters
        ----------
        fpath: str
            file to read
        start: datetime.datetime, optional
            datetime to start reading in.
            If None, read from the start of the file
        end: datetime.datetime, optional
            datetime to finish reading in.
            If None, read till the end of the file
        force_regular: boolean, optional
            If it is true, it returns a regular time series
        selector: optional
            passed to process_header; this method asks no questions about
            what it is

        Returns
        -------
        vtools.data.timeseries.TimeSeries
            time series from the file, or None if no record was read

        Raises
        ------
        ValueError
            if force_regular is set and no interval can be inferred from
            the first time stamps
    """
    import warnings

    # The selector (if it exists) can probably be precalculated or at least
    # recorded. Almost always this amounts to picking variables out of a
    # list of column names and recording indexes, but here we don't ask any
    # questions about what "selector" is.
    # Only the number of header lines is used from process_header; the
    # metadata comes from read_metadata_from_header below.
    n_headerlines, _ = self.process_header(fpath, selector)
    metadata = dict()
    if self._header_regexs is not None:
        metadata = self.read_metadata_from_header(fpath)
    # Single-argument call form prints the same on Python 2 and 3
    print("Here we are working on %s" % fpath)

    times = list()
    values = list()
    with open(fpath, 'r') as f_in:
        # fast forward past header
        for _ in range(n_headerlines):
            f_in.readline()
        # process lines starting from current file pointer
        for line in f_in:
            if self.is_comment(line):
                continue
            timestamp, vals = self.parse_record(line)
            if start and timestamp < start:
                continue
            if end and timestamp > end:
                # Stop at the first record past the requested end
                break
            times.append(timestamp)
            values.append(vals)

    if len(times) < 1:
        return None

    # Here I assume that it is more effective to retrieve too much
    # in the reading stage and then do this with numpy fancy indexing.
    # But you can override this function.
    arr = self.cull_using_selector(numpy.array(values))
    # Build the series from the culled array. Previously the culled result
    # was dropped and the raw values were converted a second time.
    ts = vts.its(times, arr)

    if force_regular:
        interval = vt_infer_interval(times[:11],
                                     fraction=0.5,
                                     standard=[
                                         vtt.minutes(6),
                                         vtt.minutes(10),
                                         vtt.minutes(15),
                                         vtt.hours(1)
                                     ])
        if not interval:
            # Show the time stamps that defeated the inference
            for t in times[:10]:
                print(t.strftime("%Y-%m-%d %H:%M:%S"))
            raise ValueError(
                "Interval could not be inferred from first time steps in %s"
                % fpath)
        # todo: this really should be an option
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ts = vts.its2rts(ts, interval)
        # Make the regular series span the requested window: extend it
        # when it falls short, trim it otherwise.
        if start is not None:
            if start < ts.start:
                ts = vts.extrapolate_ts(ts, start=start)
            else:
                ts = ts.window(start=start)
        if end is not None:
            if end > ts.end:
                ts = vts.extrapolate_ts(ts, end=end)
            else:
                ts = ts.window(end=end)

    # items() works on both Python 2 and 3
    for k, v in metadata.items():
        ts.props[k] = v
    return ts
def plot(self):
    """ Generate metrics plots

        Reads the run parameters from ``self.params``, loads the station
        and observation link tables and the simulation outputs, then loops
        over the stations of the first simulation output. For each station
        the observation is retrieved, optionally cleaned of outliers,
        adjusted by the datum adjustment in the table, converted to SI and
        gap-filled. It is then plotted against the simulation series and
        saved as a PNG in the destination directory.

        Raises
        ------
        ValueError
            if the old style 'stations_input' parameter is present
        Exception
            if the unit lookup or unit conversion of a station fails
            (chained to the original error)
    """
    # Process input parameters
    params = self.params
    variable = params['variable']
    outputs_dir = params['outputs_dir']
    if isinstance(outputs_dir, str):
        outputs_dir = outputs_dir.split()
    time_basis = process_time_str(params['time_basis'])
    if params.get('stations_input') is not None:
        raise ValueError(
            "Old style input file. \nUse 'station_input' and 'flow_station_input' respectively for staout* and flow.dat"
        )
    if variable == "flow":
        stations_input = params.get('flow_station_input')
    else:
        stations_input = params.get('station_input')
    if isinstance(stations_input, str):
        stations_input = stations_input.split()
    db_stations = station.read_station_dbase(params['stations_csv'])
    db_obs = station.read_obs_links(params['obs_links_csv'])
    excluded_stations = params.get('excluded_stations')
    selected_stations = params.get('selected_stations')
    start_avg = process_time_str(params["start_avg"])
    end_avg = process_time_str(params["end_avg"])
    start_inst = process_time_str(params["start_inst"])
    end_inst = process_time_str(params["end_inst"])
    labels = params['labels']
    dest_dir = params.get('dest_dir')
    if dest_dir is None:
        dest_dir = '.'
    elif not os.path.exists(dest_dir):
        os.mkdir(dest_dir)
    plot_format = params.get('plot_format')
    # Read four extra days on both sides of the combined plot windows
    padding = days(4)
    window_common = (min(start_inst, start_avg), max(end_inst, end_avg))
    window_to_read = (window_common[0] - padding,
                      window_common[1] + padding)
    plot_all = read_optional_flag_param(params, 'plot_all')
    remove_outliers = read_optional_flag_param(params, 'remove_outliers')
    adjust_datum = read_optional_flag_param(params, 'auto_adjustment')
    fill_gap = read_optional_flag_param(params, 'fill_gap')
    if 'max_gap_to_fill' in params:
        max_gap_to_fill = pd.tseries.frequencies.to_offset(
            params['max_gap_to_fill'])
    else:
        max_gap_to_fill = hours(1)

    # Prepare readers of simulation outputs
    sim_outputs = self.read_simulation_outputs(variable, outputs_dir,
                                               time_basis, stations_input)
    assert len(sim_outputs) > 0
    assert sim_outputs[0] is not None

    # Iterate through the stations in the first simulation outputs
    for stn in sim_outputs[0].columns:
        # Column labels are either (station id, sublocation) tuples or
        # plain station ids
        station_id = stn[0] if isinstance(stn, tuple) else stn
        # Prepare
        self.logger.info(
            "==================================================")
        self.logger.info(
            "Start processing station:: {}".format(station_id))
        if station_id not in db_stations.index:
            self.logger.warning(
                "Station id {} not found in station listings".format(
                    station_id))
            continue
        alias = db_stations.loc[station_id, 'alias']
        if selected_stations is not None:
            if station_id not in selected_stations:
                self.logger.info(
                    "Skipping..."
                    " Not in the list of the selected stations: %s",
                    station_id)
                continue
        if excluded_stations is not None:
            if station_id in excluded_stations:
                self.logger.info(
                    "Skipping... "
                    "In the list of the excluded stations: %s",
                    station_id)
                continue
        # Flow has no sublocation; otherwise it is the second element of
        # the column label. It also serves as the vertical position.
        subloc = 'default' if variable == 'flow' else stn[1]
        vert_pos = subloc
        obs_key = (station_id, subloc, variable)
        # Total adjustment applied to the observation, shown in its label
        adj_obs = 0.

        # Read Obs
        ts_obs = self.retrieve_ts_obs(station_id, subloc, variable,
                                      window_to_read, db_stations, db_obs)
        if ts_obs is None or ts_obs.isnull().all():
            self.logger.warning("No observation data: %s.", station_id)
            if plot_all is False:
                self.logger.warning("Skipping this station")
                continue
        else:
            if remove_outliers is True:
                self.logger.info("Removing outliers...")
                ts_obs = med_outliers(ts_obs, level=3, copy=False)
            adj = db_obs.loc[obs_key, 'datum_adj']
            if adj is not None and adj != 0.:
                self.logger.info(
                    "Adjusting obs value with the value in the table...")
                # The adjustment is applied before the unit conversion
                # below, i.e. to the observation as read
                ts_obs += adj
                # Look the unit up for THIS station before using it.
                # Previously obs_unit was read here before it was assigned:
                # an UnboundLocalError on the first station, and the SI
                # unit of the previous station afterwards.
                table_unit = db_obs.loc[obs_key, 'unit']
                if table_unit == 'ft':
                    adj = ft_to_m(adj)
                # NOTE(review): for any other unit the original only built
                # a ValueError("Not supported unit for adjustment.")
                # without raising it, so the adjustment was used
                # unconverted. That is kept; decide whether other units
                # should really be rejected.
                adj_obs += adj
            try:
                obs_unit = db_obs.loc[obs_key, 'unit']
                ts_obs = self.convert_unit_of_ts_obs_to_SI(
                    ts_obs, obs_unit)
                # From here on obs_unit is the unit after conversion
                obs_unit = ts_obs.unit
            except Exception as e:
                raise Exception("Station {}".format(station_id)) from e

        # Read simulation
        if variable == "flow":
            # A simulation without any flow data here is kept as None
            tss_sim = [
                None if simout[station_id].isnull().all() else
                simout[station_id] for simout in sim_outputs
            ]
        else:
            tss_sim = [
                simout.loc[:, (station_id, subloc)].iloc[:, 0]
                for simout in sim_outputs
            ]
        for ts in tss_sim:
            if ts is None:
                continue
            # Without usable observations fall back to the default unit
            # of the variable
            if ts_obs is None or ts_obs.isnull().all():
                ts.unit = self.variable_units[variable]
            else:
                ts.unit = obs_unit

        # Adjust datum if necessary
        if adjust_datum and ts_obs is not None:
            ts_obs, adj = self.adjust_obs_datum(ts_obs, tss_sim[0],
                                                station_id, variable,
                                                db_obs)
            adj_obs += adj
        if ts_obs is not None and fill_gap is True:
            self.logger.info("Filling gaps in the data.")
            ts_obs = fill_gaps(ts_obs, max_gap_to_fill)

        # Plot
        if check_if_all_tss_are_bad([ts_obs] + tss_sim):
            self.logger.error("None of time series have data.")
            continue
        self.logger.info("Start plotting...")
        source = db_obs.loc[obs_key, "agency"].upper()
        figtitle = self.create_title(db_stations, station_id, source,
                                     variable, vert_pos)
        title = None
        # The plot functions receive the simulation series as a tuple
        if isinstance(tss_sim, list):
            tss_sim = tuple(tss_sim)

        # labels: show the total adjustment on the observation label
        labels_to_plot = deepcopy(labels)
        if adj_obs != 0.:
            if adj_obs > 0.:
                labels_to_plot[0] += " + {:g}".format(adj_obs)
            else:
                labels_to_plot[0] += " - {:g}".format(-adj_obs)

        plot_func = plot_comparison if plot_format == 'simple' \
            else plot_metrics
        fig = plot_func(ts_obs,
                        tss_sim,
                        window_inst=(start_inst, end_inst),
                        window_avg=(start_avg, end_avg),
                        labels=labels_to_plot,
                        title=title)
        fname_output = self.set_fname_out(alias, variable, station_id,
                                          vert_pos)
        fpath_output = os.path.join(dest_dir, fname_output + '.png')
        fig.suptitle(figtitle, fontsize=14)
        fig.savefig(fpath_output, dpi=300)
        self.logger.info("Done for the station.")
def plot(self):
    """ Generate metrics plots

        Reads the run parameters from ``self.params``, loads the station
        database and the observation links, reads the simulation outputs
        and loops over the stations of the first simulation output. For
        each station the observation is retrieved, optionally cleaned,
        adjusted and converted to SI, then plotted against the simulation
        series and saved as a PNG in the destination directory.

        Raises
        ------
        ValueError
            if the old style 'stations_input' parameter is present
    """
    # Process input parameters
    params = self.params
    variable = params['variable']
    outputs_dir = params['outputs_dir']
    # A single string is split on whitespace into a list
    if isinstance(outputs_dir, str):
        outputs_dir = outputs_dir.split()
    time_basis = process_time_str(params['time_basis'])
    # 'stations_input' is the retired key; its presence is an error
    stations_input = params.get('stations_input')
    if stations_input is None:
        stations_input = params.get(
            'flow_station_input') if variable == "flow" else params.get(
                'station_input')
    else:
        raise ValueError(
            "Old style input file. \nUse 'station_input' and 'flow_station_input' respectively for staout* and flow.dat"
        )
    if isinstance(stations_input, str):
        stations_input = stations_input.split()
    db_stations = station_db.StationDB(params['stations_csv'])
    db_obs = obs_links.ObsLinks(params['obs_links_csv'])
    excluded_stations = params.get('excluded_stations')
    selected_stations = params.get('selected_stations')
    start_avg = process_time_str(params["start_avg"])
    end_avg = process_time_str(params["end_avg"])
    start_inst = process_time_str(params["start_inst"])
    end_inst = process_time_str(params["end_inst"])
    labels = params['labels']
    # Default to the current directory; create the directory if missing
    dest_dir = params.get('dest_dir')
    if dest_dir is None:
        dest_dir = '.'
    else:
        if not os.path.exists(dest_dir):
            os.mkdir(dest_dir)
    plot_format = params.get('plot_format')
    # Read four extra days on both sides of the combined plot windows
    padding = days(4)
    window_common = (min(start_inst, start_avg), max(end_inst, end_avg))
    window_to_read = (window_common[0] - padding,
                      window_common[1] + padding)
    plot_all = read_optional_flag_param(params, 'plot_all')
    remove_outliers = read_optional_flag_param(params, 'remove_outliers')
    adjust_datum = read_optional_flag_param(params, 'auto_adjustment')
    fill_gap = read_optional_flag_param(params, 'fill_gap')
    # One hour unless the parameters say otherwise
    max_gap_to_fill = hours(1)
    if 'max_gap_to_fill' in params:
        max_gap_to_fill = parse_interval(params['max_gap_to_fill'])

    # Prepare readers of simulation outputs
    sim_outputs = self.read_simulation_outputs(variable, outputs_dir,
                                               time_basis, stations_input)
    assert len(sim_outputs) > 0
    assert sim_outputs[0] is not None

    # Iterate through the stations in the first simulation outputs
    for station in sim_outputs[0].stations:
        # Prepare
        self.logger.info(
            "==================================================")
        self.logger.info("Start processing a station: %s",
                         station["name"])
        station_id = station['name']
        alias = db_stations.alias(station_id)
        # Honor the optional include and exclude lists
        if selected_stations is not None:
            if station_id not in selected_stations:
                self.logger.info(
                    "Skipping..."
                    " Not in the list of the selected stations: %s",
                    station_id)
                continue
        if excluded_stations is not None:
            if station_id in excluded_stations:
                self.logger.info(
                    "Skipping... "
                    "In the list of the excluded stations: %s",
                    station_id)
                continue
        # Flow uses vertical position 0; other variables take it from the
        # station entry
        if not variable == 'flow':
            vert_pos = station['vert_pos']
        else:
            vert_pos = 0
        # Total adjustment applied to the observation, shown in its label
        adj_obs = 0.

        # Read Obs
        ts_obs = self.retrieve_ts_obs(station_id, variable, window_to_read,
                                      db_stations, db_obs, vert_pos)
        if ts_obs is None:
            self.logger.warning("No observation data: %s.", station_id)
            # Without observation the station is plotted only when
            # plot_all is not False
            if plot_all is False:
                self.logger.warning("Skipping this station")
                continue
        else:
            if remove_outliers is True:
                self.logger.info("Removing outliers...")
                ts_obs, filtered = med_outliers(ts_obs, copy=False)
            adj = db_obs.adjustment(station_id, variable)
            if adj is not None and adj != 0.:
                self.logger.info(
                    "Adjusting obs value with the value in the table...")
                # Applied before the SI conversion below
                ts_obs += adj
                obs_unit = db_obs.unit(station_id, variable, vert_pos)
                if obs_unit == 'ft':
                    adj = ft_to_m(adj)
                else:
                    # NOTE(review): this builds a ValueError but never
                    # raises it, so other units pass through unconverted
                    ValueError("Not supported unit for adjustment.")
                adj_obs += adj
            # Fall back to the unit in the obs links when the series
            # carries none
            if 'unit' not in ts_obs.props:
                ts_obs.props['unit'] = db_obs.unit(station_id, variable)
            ts_obs = self.convert_unit_of_ts_obs_to_SI(ts_obs)

        # Read simulation
        tss_sim = self.retrieve_tss_sim(sim_outputs, station_id, variable,
                                        vert_pos)

        # Adjust datum if necessary
        if adjust_datum is True and ts_obs is not None:
            ts_obs, adj = self.adjust_obs_datum(ts_obs, tss_sim[0],
                                                station_id, variable,
                                                db_obs)
            adj_obs += adj
        if ts_obs is not None and fill_gap is True:
            self.logger.info("Filling gaps in the data.")
            # NOTE(review): the return value is not used; presumably this
            # fill_gaps works in place on the series in the tuple -- confirm
            fill_gaps((ts_obs, ), max_gap_to_fill)

        # Plot
        if check_if_all_tss_are_bad([
                ts_obs,
        ] + tss_sim):
            self.logger.error("None of time series have data.")
            continue
        self.logger.info("Start plotting...")
        source = db_obs.agency(station_id, variable).upper()
        figtitle = self.create_title(db_stations, station_id, source,
                                     variable, vert_pos)
        title = None
        # labels: show the total adjustment on the observation label
        labels_to_plot = deepcopy(labels)
        if adj_obs != 0.:
            if adj_obs > 0.:
                labels_to_plot[0] += " + %g" % adj_obs
            else:
                labels_to_plot[0] += " - %g" % (-adj_obs)
        # 'simple' selects the comparison plot, anything else the metrics
        # plot
        if plot_format == 'simple':
            fig = plot_comparison(ts_obs,
                                  *tss_sim,
                                  window_inst=(start_inst, end_inst),
                                  window_avg=(start_avg, end_avg),
                                  labels=labels_to_plot,
                                  title=title)
        else:
            fig = plot_metrics(ts_obs,
                               *tss_sim,
                               window_inst=(start_inst, end_inst),
                               window_avg=(start_avg, end_avg),
                               labels=labels_to_plot,
                               title=title)
        fname_output = self.set_fname_out(alias, variable, station_id,
                                          vert_pos)
        fpath_output = os.path.join(dest_dir, fname_output + '.png')
        fig.suptitle(figtitle, fontsize=14)
        fig.savefig(fpath_output, dpi=300)
        self.logger.info("Done for the station.")
# -*- coding: utf-8 -*-
"""Plot a synthetic tide series together with a copy shifted by two hours."""
from vtools.functions.shift import *
from vtools.data.sample_series import *
from vtools.data.vtime import hours
import matplotlib.pyplot as plt

# Sample series and its copy shifted by two hours
ts = synthetic_tide_series()
ts_shifted = shift(ts, hours(2))

fig = plt.figure()
ax0 = fig.add_subplot(111)
ax0.set_ylabel("surface (feet)")
# Original in green, shifted in red
p0 = ax0.plot(ts.times, ts.data, color='g', linewidth=1.0)
p1 = ax0.plot(ts_shifted.times, ts_shifted.data, color='r', linewidth=1.0)
plt.legend(["Surface", "Shifted"])
# The on/off flag is passed positionally: its keyword was renamed from
# ``b`` to ``visible`` in newer Matplotlib, and the positional form works
# with both.
plt.grid(True, which='major', color='0.9', linestyle='-', linewidth=0.5)
fig.autofmt_xdate()
plt.show()