def setUp(self):
    data = pd.read_csv(
        StringIO(tenmin_test_timeseries),
        parse_dates=[0],
        usecols=["date", "value", "flags"],
        index_col=0,
        header=None,
        names=("date", "value", "flags"),
        dtype={"value": np.float64, "flags": str},
    ).asfreq("10T")
    self.reference_ts = HTimeseries(data=data)
    self.reference_ts.unit = "°C"
    self.reference_ts.title = "A test 10-min time series"
    self.reference_ts.precision = 1
    self.reference_ts.time_step = "10min"
    self.reference_ts.timezone = "EET (UTC+0200)"
    self.reference_ts.variable = "temperature"
    self.reference_ts.comment = (
        "This timeseries is extremely important\n"
        "because the comment that describes it\n"
        "spans five lines.\n\n"
        "These five lines form two paragraphs."
    )
    self.reference_ts.location = {
        "abscissa": 24.6789,
        "ordinate": 38.12345,
        "srid": 4326,
        "altitude": 219.22,
        "asrid": None,
    }
def test_write(self):
    anp = np.array(
        [
            [parse_date("2005-08-23 18:53"), 93, ""],
            [parse_date("2005-08-24 19:52"), 108.7, ""],
            [parse_date("2005-08-25 23:59"), 28.3, "HEARTS SPADES"],
            [parse_date("2005-08-26 00:02"), float("NaN"), ""],
            [parse_date("2005-08-27 00:02"), float("NaN"), "DIAMONDS"],
        ]
    )
    data = pd.DataFrame(anp[:, [1, 2]], index=anp[:, 0], columns=("value", "flags"))
    ts = HTimeseries(data=data)
    s = StringIO()
    ts.write(s)
    self.assertEqual(
        s.getvalue(),
        textwrap.dedent(
            """\
            2005-08-23 18:53,93,\r
            2005-08-24 19:52,108.7,\r
            2005-08-25 23:59,28.3,HEARTS SPADES\r
            2005-08-26 00:02,,\r
            2005-08-27 00:02,,DIAMONDS\r
            """
        ),
    )
def _read_timeseries_from_cache_file(self):
    try:
        with open(self.filename, newline="\n") as f:
            return HTimeseries(f)
    except (FileNotFoundError, ValueError):
        # If file is corrupted or nonexistent, continue with empty time series
        return HTimeseries()
def _prepare_resulting_htimeseries_object(self):
    self.pet = HTimeseries()
    self.pet.time_step = self.config.time_step
    self.pet.unit = "mm"
    self.pet.timezone = self.timezone
    self.pet.variable = "Potential Evapotranspiration"
    self.pet.precision = 2 if self.config.time_step == "H" else 1
    self.pet.location = self.location
def _get_hts_object(timeseries_id, start_date):
    timeseries_top = HTimeseries(
        StringIO(test_timeseries["{}_top".format(timeseries_id)])
    )
    # dt.datetime(1, 1, 1, 0, 1) acts as a "no start date" sentinel, treated
    # the same as None.
    if start_date is None or start_date == dt.datetime(1, 1, 1, 0, 1):
        return timeseries_top
    assert start_date == timeseries_top.data.index[-1] + dt.timedelta(minutes=1)
    result = HTimeseries(
        StringIO(test_timeseries["{}_bottom".format(timeseries_id)])
    )
    return result
def _prepare_resulting_htimeseries_object(self):
    self.pet = HTimeseries()
    minutes = int(self.config.step.total_seconds() / 60)
    self.pet.time_step = str(minutes) + ",0"
    self.pet.unit = "mm"
    self.pet.timezone = self.timezone
    self.pet.variable = "Potential Evapotranspiration"
    self.pet.precision = 2 if self.config.step == dt.timedelta(hours=1) else 1
    self.pet.location = self.location
def setUp(self):
    ahtimeseries = HTimeseries()
    ahtimeseries.data = pd.DataFrame(
        index=[datetime(2017, 11, 23, 17, 23), datetime(2018, 11, 25, 1, 0)],
        data={"value": [1.0, 2.0], "flags": ["", ""]},
        columns=["value", "flags"],
    )
    station = mommy.make(models.Station)
    timeseries = mommy.make(
        models.Timeseries, gentity=station, time_zone__utc_offset=120
    )
    with patch("enhydris.models.Timeseries.get_data", return_value=ahtimeseries):
        self.response = self.client.get(
            "/api/stations/{}/timeseries/{}/data/".format(station.id, timeseries.id)
        )
def get(self):
    result = HTimeseries()
    for filename in self.filenames:
        f = gdal.Open(filename)
        try:
            isostring = f.GetMetadata()["TIMESTAMP"]
            timestamp = iso8601.parse_date(isostring, default_timezone=None)
            value = extract_point_from_raster(self.point, f)
            result.data.loc[timestamp, "value"] = value
            result.data.loc[timestamp, "flags"] = ""
        finally:
            f = None  # Close the GDAL dataset
    result.data = result.data.sort_index()
    return result
def test_daily(self):
    self.setup_daily_input_files()
    self.setup_config_file("D")

    # Verify the output file doesn't exist yet
    result_filename = os.path.join(self.tempdir, "evaporation.hts")
    assert not os.path.exists(result_filename)

    # Execute
    cli.App(self.config_file).run()

    # Check that it has created a file and that the file is correct
    with open(result_filename) as f:
        t = HTimeseries(f)
    expected_result = pd.DataFrame(
        data={"value": [3.9], "flags": [""]},
        columns=["value", "flags"],
        index=[dt.datetime(2014, 7, 6)],
    )
    expected_result.index.name = "date"
    pd.testing.assert_frame_equal(t.data, expected_result, check_less_precise=1)
def _get_input_timeseries_for_var(self, var):
    filename = os.path.join(
        self.config.base_dir, getattr(self.config, var + "_prefix") + ".hts"
    )
    if not os.path.exists(filename):
        return
    with open(filename, "r") as f:
        self.input_timeseries[var] = HTimeseries(f)
def setUpTestData(cls):
    cls._create_test_timeseries()
    # Each record is "date,value,flags"; the first record had a stray extra
    # comma, which would give it four fields instead of three.
    ahtimeseries = HTimeseries(
        StringIO("2020-09-08 20:00,15.7,\n2020-09-08 21:00,,\n")
    )
    models.TimeseriesRecord.bulk_insert(cls.timeseries, ahtimeseries)
    cls.timeseries_records = models.TimeseriesRecord.objects.all()
def _get_htimeseries_from_data(self, data):
    if isinstance(data, HTimeseries):
        return data
    elif isinstance(data, pd.DataFrame):
        return HTimeseries(data)
    else:
        return HTimeseries.read(data)
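# --- Illustration (not part of the original module): a minimal sketch of the
# input forms the dispatcher above accepts. The import path and the sample
# data are assumptions made for this example.
import pandas as pd
from io import StringIO
from htimeseries import HTimeseries  # assumed import path

df = pd.DataFrame(
    data={"value": [15.7], "flags": [""]},
    columns=["value", "flags"],
    index=[pd.Timestamp("2020-09-08 20:00")],
)
ts_a = HTimeseries(df)  # a DataFrame is wrapped directly
ts_b = HTimeseries.read(StringIO("2020-09-08 20:00,15.7,\n"))  # a stream is parsed
assert isinstance(ts_a, HTimeseries)  # an HTimeseries would be returned unchanged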
def _read_timeseries_from_stream(self, stream):
    try:
        return HTimeseries(stream)
    except UnicodeDecodeError as e:
        # Keep the translatable part static; gettext can't translate strings
        # assembled at runtime.
        raise forms.ValidationError(
            _("The file does not seem to be a valid UTF-8 file: ") + str(e)
        )
def create_timeseries(self):
    self.htimeseries = HTimeseries()
    self.htimeseries.data = pd.DataFrame(
        index=[dt.datetime(2017, 11, 23, 17, 23), dt.datetime(2018, 11, 25, 1, 0)],
        data={"value": [1.0, 2.0], "flags": ["", ""]},
        columns=["value", "flags"],
    )
    self.station = mommy.make(
        models.Station,
        name="Komboti",
        geom=Point(x=21.00000, y=39.00000, srid=4326),
        original_srid=4326,
    )
    self.time_zone = mommy.make(models.TimeZone, code="EET", utc_offset=120)
    self.variable = models.Variable()
    with switch_language(self.variable, "en"):
        self.variable.descr = "Beauty"
        self.variable.save()
    self.timeseries_group = mommy.make(
        models.TimeseriesGroup,
        gentity=self.station,
        time_zone=self.time_zone,
        precision=2,
        variable=self.variable,
    )
    self.timeseries = mommy.make(
        models.Timeseries,
        type=models.Timeseries.RAW,
        timeseries_group=self.timeseries_group,
    )
    self.timeseries.set_data(self.htimeseries.data)
def _get_timeseries_if_file_is_up_to_date_else_none(self, dest):
    with open(dest, "r", newline="") as f:
        ts = HTimeseries(f)
    for filename in self.filenames:
        if self.filename_format.get_date(filename) not in ts.data.index:
            return None
    return ts
def _get_timeseries_without_moving_file_position(self, datastream):
    original_position = datastream.tell()
    wrapped_datastream = TextIOWrapper(datastream, encoding="utf-8", newline="\n")
    result = HTimeseries.read(wrapped_datastream)
    wrapped_datastream.detach()  # If we don't do this, the datastream will be closed
    datastream.seek(original_position)
    return result
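# --- Illustration (not part of the original module): the detach() trick in
# isolation, with BytesIO standing in for e.g. an uploaded binary stream.
from io import BytesIO, TextIOWrapper

datastream = BytesIO(b"2020-09-08 20:00,15.7,\n")
position = datastream.tell()
wrapped = TextIOWrapper(datastream, encoding="utf-8", newline="\n")
wrapped.read()  # consume the stream through the text wrapper
wrapped.detach()  # without this, discarding `wrapped` would close `datastream`
datastream.seek(position)  # the underlying stream is still open and reusable
assert not datastream.closed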
def test_read_csv_with_duplicates_raises_error(self):
    s = StringIO(self.csv_with_duplicates)
    s.seek(0)
    msg = (
        "Can't read time series: the following timestamps appear more than once: "
        "2020-02-23 12:00:00, 2020-02-23 13:00:00"
    )
    with self.assertRaisesRegex(ValueError, msg):
        HTimeseries(s)
def get_data(self, start_date=None, end_date=None):
    if self.datafile:
        with open(self.datafile.path, "r", newline="\n") as f:
            result = HTimeseries.read(f, start_date=start_date, end_date=end_date)
    else:
        result = HTimeseries()
    self._set_extra_timeseries_properties(result)
    return result
def setUp(self):
    s = StringIO(tenmin_test_timeseries)
    s.seek(0)
    self.ts = HTimeseries(
        s,
        start_date=dt.datetime(2008, 2, 7, 11, 30),
        end_date=dt.datetime(2008, 2, 7, 11, 55),
    )
def process_timeseries(self):
    self.source_end_date = self.htimeseries.data.index[-1]
    try:
        regularized = self._regularize_time_series(self.htimeseries)
    except RegularizeError as e:
        logging.getLogger("enhydris.autoprocess").error(str(e))
        return HTimeseries()
    aggregated = self._aggregate_time_series(regularized)
    return self._trim_last_record_if_not_complete(aggregated)
def _upload_all_new_data(self):
    station_id = self._meteologger_storage.station_id
    sorted_ts_end_dates = sorted(self._ts_end_dates.items(), key=lambda x: x[1])
    for cts_id, ts_end_date in sorted_ts_end_dates:
        new_data = self._meteologger_storage.get_recent_data(
            cts_id.timeseries_group_id, ts_end_date
        )
        if len(new_data):
            self.client.post_tsdata(station_id, *cts_id, HTimeseries(new_data))
def setUp(self):
    source_timeseries = pd.DataFrame(
        data={"value": [42], "flags": [""]},
        columns=["value", "flags"],
        index=[dt.datetime(2019, 5, 21, 11, 20)],
    )
    self.aggregation._htimeseries = HTimeseries(source_timeseries)
    self.aggregation._htimeseries.time_step = ""
def test_execute(self):
    self.range_check = mommy.make(
        RangeCheck,
        lower_bound=2,
        upper_bound=5,
        soft_lower_bound=3,
        soft_upper_bound=4,
    )
    self.range_check.checks._htimeseries = HTimeseries(self.source_timeseries)
    result = self.range_check.checks.process_timeseries()
    pd.testing.assert_frame_equal(result, self.expected_result)
def test_execute(self):
    self.roc_check = mommy.make(RateOfChangeCheck)
    mommy.make(
        RateOfChangeThreshold,
        rate_of_change_check=self.roc_check,
        delta_t="10min",
        allowed_diff=7.0,
    )
    self.roc_check.checks._htimeseries = HTimeseries(self.source_timeseries)
    result = self.roc_check.checks.process_timeseries()
    pd.testing.assert_frame_equal(result, self.expected_result)
def get_data(self, start_date=None, end_date=None):
    data = cache.get_or_set(f"timeseries_data_{self.id}", self._get_all_data_as_pd)
    if start_date:
        start_date = start_date.astimezone(self.time_zone.as_tzinfo)
        start_date = start_date.replace(tzinfo=None)
    if end_date:
        end_date = end_date.astimezone(self.time_zone.as_tzinfo)
        end_date = end_date.replace(tzinfo=None)
    data = data.loc[start_date:end_date]
    result = HTimeseries(data)
    self._set_extra_timeseries_properties(result)
    return result
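# --- Illustration (not part of the original module): the timezone
# normalization above, in isolation. An aware bound is shifted into the
# series' zone, then stripped of tzinfo so it can slice the naive index.
# The UTC+0200 offset here is just an example value.
import datetime as dt

tz = dt.timezone(dt.timedelta(minutes=120))  # stands in for self.time_zone.as_tzinfo
start_date = dt.datetime(2020, 9, 8, 17, 0, tzinfo=dt.timezone.utc)
start_date = start_date.astimezone(tz).replace(tzinfo=None)
assert start_date == dt.datetime(2020, 9, 8, 19, 0)  # naive, in the local zone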
def test_file_is_not_recreated(self):
    hspatial.PointTimeseries(self.point, prefix=self.prefix).get_cached(self.dest)

    # Make existing file read-only
    os.chmod(self.dest, S_IREAD | S_IRGRP | S_IROTH)

    # Try again; it shouldn't try to write, therefore it shouldn't raise an
    # exception
    hspatial.PointTimeseries(self.point, prefix=self.prefix).get_cached(self.dest)

    with open(self.dest, "r", newline="\n") as f:
        self._check_against_expected(HTimeseries(f))
def test_execute(self):
    station = mommy.make(Station)
    self.curve_interpolation = mommy.make(
        CurveInterpolation,
        timeseries_group__gentity=station,
        target_timeseries_group__gentity=station,
    )
    self._setup_period1()
    self._setup_period2()
    self.curve_interpolation._htimeseries = HTimeseries(self.source_timeseries)
    result = self.curve_interpolation.process_timeseries()
    pd.testing.assert_frame_equal(result, self.expected_result)
def _execute(self, max_missing):
    station = mommy.make(Station)
    self.aggregation = mommy.make(
        Aggregation,
        timeseries_group__gentity=station,
        timeseries_group__variable__descr="Hello",
        target_time_step="H",
        method="sum",
        max_missing=max_missing,
        resulting_timestamp_offset="1min",
    )
    self.aggregation._htimeseries = HTimeseries(self.source_timeseries)
    self.aggregation._htimeseries.time_step = "10min"
    return self.aggregation.process_timeseries().data
def test_execute(self):
    station = mommy.make(Station)
    self.range_check = mommy.make(
        RangeCheck,
        lower_bound=2,
        upper_bound=5,
        soft_lower_bound=3,
        soft_upper_bound=4,
        station=station,
        source_timeseries__gentity=station,
        target_timeseries__gentity=station,
    )
    self.range_check.htimeseries = HTimeseries(self.source_timeseries)
    result = self.range_check.process_timeseries()
    pd.testing.assert_frame_equal(result, self.expected_result)
def create_timeseries(self):
    self.htimeseries = HTimeseries()
    self.htimeseries.data = pd.DataFrame(
        index=[datetime(2017, 11, 23, 17, 23), datetime(2018, 11, 25, 1, 0)],
        data={"value": [1.0, 2.0], "flags": ["", ""]},
        columns=["value", "flags"],
    )
    self.station = mommy.make(models.Station)
    self.timeseries = mommy.make(
        models.Timeseries,
        id=42,
        gentity=self.station,
        time_zone__utc_offset=120,
        precision=2,
    )
def _time_step(self):
    """Return the time step common to all time series.

    If the time step is not the same for all time series, raise an
    exception.
    """
    time_step = None
    for filename in self.config.files:
        with open(filename) as f:
            t = HTimeseries(f, start_date="0001-01-01 00:00")
        item_time_step = t.time_step
        if time_step and (item_time_step != time_step):
            raise click.ClickException("Not all time series have the same step")
        time_step = item_time_step
    return time_step
def h_integrate(
    mask, stations_layer, date, output_filename_prefix, date_fmt, funct, kwargs
):
    # Make the formatted date safe for use in a filename (it contains no
    # strftime directives at this point, so it is used directly).
    date_fmt_for_filename = date.strftime(date_fmt).replace(" ", "-").replace(":", "-")
    output_filename = "{}-{}.tif".format(output_filename_prefix, date_fmt_for_filename)
    if not _needs_calculation(output_filename, date, stations_layer):
        return

    # Read the time series values and add the 'value' attribute to
    # stations_layer
    stations_layer.CreateField(ogr.FieldDefn("value", ogr.OFTReal))
    input_files = []
    stations_layer.ResetReading()
    for station in stations_layer:
        filename = station.GetField("filename")
        with open(filename, newline="\n") as f:
            t = HTimeseries(f)
        try:
            value = t.data.loc[date.replace(tzinfo=None), "value"]
        except KeyError:
            value = np.nan
        station.SetField("value", value)
        if not isnan(value):
            input_files.append(filename)
        stations_layer.SetFeature(station)
    if not input_files:
        return

    # Create destination data source
    output = gdal.GetDriverByName("GTiff").Create(
        output_filename, mask.RasterXSize, mask.RasterYSize, 1, gdal.GDT_Float32
    )
    output.SetMetadataItem("TIMESTAMP", date.strftime(date_fmt))
    output.SetMetadataItem("INPUT_FILES", "\n".join(input_files))

    try:
        # Set geotransform and projection in the output data source
        output.SetGeoTransform(mask.GetGeoTransform())
        output.SetProjection(mask.GetProjection())

        # Do the integration
        integrate(mask, stations_layer, output.GetRasterBand(1), funct, kwargs)
    finally:
        # Close the dataset
        output = None
def _get_all_data_as_pd(self):
    tzoffsetstring = self._get_tzoffsetstring_for_pg()
    with connection.cursor() as cursor:
        cursor.execute(
            """
            SELECT STRING_AGG(
                TO_CHAR(timestamp at time zone %s, 'YYYY-MM-DD HH24:MI')
                    || ',' || value || ',' || flags,
                E'\n'
                ORDER BY timestamp
            ) || E'\n'
            FROM enhydris_timeseriesrecord
            WHERE timeseries_id=%s;
            """,
            [tzoffsetstring, self.id],
        )
        return HTimeseries(StringIO(cursor.fetchone()[0])).data
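# --- Illustration (not part of the original module): the SQL above emits one
# newline-terminated "timestamp,value,flags" line per record, i.e. the CSV
# dialect that HTimeseries parses. Sample data and import path are assumed.
from io import StringIO
from htimeseries import HTimeseries  # assumed import path

csv_text = "2020-09-08 20:00,15.7,\n2020-09-08 21:00,16.1,RANGE\n"
data = HTimeseries(StringIO(csv_text)).data
assert list(data.columns) == ["value", "flags"]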
def _needs_calculation(output_filename, date, stations_layer):
    """
    Used by h_integrate to check whether the output file needs to be
    calculated or not. It does not need to be calculated if it already
    exists and has been calculated from all available data.
    """
    # Return immediately if output file does not exist
    if not os.path.exists(output_filename):
        return True

    # Get list of files which were used to calculate the output file
    fp = gdal.Open(output_filename)
    try:
        actual_input_files = fp.GetMetadataItem("INPUT_FILES")
        if actual_input_files is None:
            raise IOError(
                "{} does not contain the metadata item INPUT_FILES".format(
                    output_filename
                )
            )
    finally:
        fp = None  # Close file
    actual_input_files = set(actual_input_files.split("\n"))

    # Get list of files available for calculating the output file
    stations_layer.ResetReading()
    available_input_files = {
        station.GetField("filename")
        for station in stations_layer
        if os.path.exists(station.GetField("filename"))
    }

    # Which of these files have not been used?
    unused_files = available_input_files - actual_input_files

    # For each one of these files, check whether it has newly available data.
    # Upon finding one that does, the verdict is made: return True
    for filename in unused_files:
        with open(filename, newline="\n") as f:
            t = HTimeseries(f)
        try:
            value = t.data.loc[date.replace(tzinfo=None), "value"]
            if not isnan(value):
                return True
        except KeyError:
            continue

    # We were unable to find data that had not already been used
    return False
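# --- Illustration (not part of the original module): the set difference at
# the heart of _needs_calculation, with made-up filenames standing in for the
# GDAL metadata and the stations layer.
actual_input_files = {"a.hts", "b.hts"}  # recorded in the INPUT_FILES metadata
available_input_files = {"a.hts", "b.hts", "c.hts"}  # present on disk now
unused_files = available_input_files - actual_input_files
assert unused_files == {"c.hts"}  # only c.hts must be checked for new data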