def demo():
    """Pick one station, interpolate its temperature from its k nearest
    neighbours, and print the squared-error results per method."""
    start_date = '2016-01-01T00:00'
    end_date = '2016-03-31T23:59'

    from filter_weather_data.filters import StationRepository
    from .nearest_k_finder import NearestKFinder

    station_repository = StationRepository()
    station_dicts = station_repository.load_all_stations(start_date, end_date, limit=20)

    chosen_index = 9
    search_for = station_dicts[chosen_index]
    print("picked", search_for["name"], "to look for")
    # otherwise triangles will contain the searched point as well.
    del station_dicts[chosen_index]

    k_nearest_finder = NearestKFinder(station_dicts, start_date, end_date)
    t = search_for["data_frame"].index.values[0]

    # score with the three nearest neighbours first ...
    neighbours = k_nearest_finder.find_k_nearest_neighbours(search_for, t, 3)
    t_actual = search_for["data_frame"].loc[t].temperature
    result = get_interpolation_results(neighbours, t_actual)

    # ... then with all neighbours (-1 = no limit)
    neighbours = k_nearest_finder.find_k_nearest_neighbours(search_for, t, -1)
    result.update(get_interpolation_results(neighbours, t_actual))

    print("actual measurement:", t_actual)
    for temperature, distance in neighbours:
        print("measured", temperature, "°C in", distance, "meters distance")

    # report the root of each accumulated squared error, sorted by method name
    for method, value in sorted(result.items(), key=lambda item: item[0]):
        print("method", method, "value", value ** .5)
def run():
    """Split the filtered stations 70/30 into training and evaluation sets
    and write one big feature vector CSV for each."""
    start_date = "2016-01-01T00:00"
    end_date = "2016-12-31T23:59"

    eddh_df = load_eddh(start_date, end_date)

    repository_parameters = get_repository_parameters(RepositoryParameter.ONLY_OUTDOOR_AND_SHADED)
    station_repository = StationRepository(*repository_parameters)
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        # limit=5,  # for testing purposes
        limit_to_temperature=False
    )

    # shuffle before splitting so the 70/30 split is random
    random.shuffle(station_dicts)
    split_point = int(len(station_dicts) * .7)
    training_dicts = station_dicts[:split_point]
    evaluation_dicts = station_dicts[split_point:]
    logging.info("training stations: %s" % [station["name"] for station in training_dicts])
    logging.info("evaluation stations: %s" % [station["name"] for station in evaluation_dicts])

    training_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks", "training_data_filtered.csv")
    join_to_big_vector(training_csv_file, training_dicts, eddh_df)

    evaluation_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks", "evaluation_data_filtered.csv")
    join_to_big_vector(evaluation_csv_file, evaluation_dicts, eddh_df)
def get_station_dicts(start_date, end_date):
    """Load all station dicts of the chosen repository for the given period.

    :param start_date: first day to load
    :param end_date: last day to load
    :return: list of station dicts
    """
    # repository_parameter = RepositoryParameter.START
    repository_parameter = RepositoryParameter.ONLY_OUTDOOR_AND_SHADED
    repository = StationRepository(*get_repository_parameters(repository_parameter))
    return repository.load_all_stations(start_date, end_date)
def score_algorithm(start_date, end_date, repository_parameters, limit=0, interpolation_name="NONE"):
    """Score an interpolation method against the HUSCONET reference network.

    70 % of the crowdsourced stations serve as interpolation neighbours; all
    HUSCONET stations serve as targets.  Per-target scores are concatenated,
    an overall RMSE per method is logged, and the raw scores are written to a
    timestamped CSV file.

    :param start_date: first day of the scoring period
    :param end_date: last day of the scoring period
    :param repository_parameters: positional arguments for the crowdsourcing
        station repository (e.g. from ``get_repository_parameters``)
    :param limit: maximum number of stations to load (0 presumably means
        "no limit" — TODO confirm against the repository implementation)
    :param interpolation_name: label used for the logger and the result file
    """
    station_repository = CrowdsoucingStationRepository(*repository_parameters)
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        limit=limit
    )
    # shuffle, then keep a random 70 % of the stations as neighbours
    random.shuffle(station_dicts)
    neighbour_station_dicts = station_dicts[:int(.7 * len(station_dicts))]  # only use 70%
    target_station_dicts = HusconetStationRepository().load_all_stations(
        start_date,
        end_date,
        limit=limit
    )
    setup_logger(interpolation_name)
    logging.info("General Overview")
    logging.info("targets: " + " ".join([station_dict["name"] for station_dict in target_station_dicts]))
    logging.info("neighbours: " + " ".join([station_dict["name"] for station_dict in neighbour_station_dicts]))
    logging.info("End overview")
    logging.info("Several Runs")
    target_station_dicts_len = str(len(target_station_dicts))
    # lazily score every target station against the neighbour set;
    # starmap unpacks each inner list into do_interpolation_scoring's arguments
    overall_result = itertools.starmap(do_interpolation_scoring, [
        [
            target_station_dict,
            j,
            target_station_dicts_len,
            neighbour_station_dicts,
            start_date,
            end_date
        ]
        for j, target_station_dict in enumerate(target_station_dicts)
    ])
    logging.info("end targets")
    logging.info("overall result")
    overall_result_df = pandas.concat(overall_result)
    # column names follow the "<method>--total" / "<method>--n" convention;
    # collect the distinct method names
    column_names = overall_result_df.columns.values.tolist()
    methods = set()
    for column_name in column_names:
        method, value = column_name.split("--")
        methods.update([method])
    for method in methods:
        # RMSE = sqrt(sum of squared errors / number of samples), NaN-safe
        overall_total = numpy.nansum(overall_result_df[method + "--total"])
        overall_n = int(numpy.nansum(overall_result_df[method + "--n"]))
        overall_rmse = numpy.sqrt(overall_total / overall_n)
        score_str = "%.3f" % overall_rmse
        # pad the method name to 12 characters so the scores line up
        logging.info(method + " " * (12 - len(method)) + score_str + " n=" + str(overall_n))
    logging.info("end overall result")
    # ':' and '.' are not safe in file names on every platform
    overall_result_df.to_csv("interpolation_result_husconet_median_5_{date}_{interpolation_name}.csv".format(
        date=datetime.datetime.now().isoformat().replace(":", "-").replace(".", "-"),
        interpolation_name=interpolation_name
    ))
def gather_statistics(repository_parameter, start_date, end_date):
    """Collect per-station data availability and write it to a CSV report.

    For every station of the repository the data frame is sampled up to the
    full period and the available share is computed; one row per station
    (name, position, availability) is written to
    ``log/calculate_available_data_<parameter>.csv`` next to this module.

    :param repository_parameter: which station repository to examine
    :param start_date: first day of the examined period
    :param end_date: last day of the examined period
    """
    logging.info("repository: %s" % repository_parameter.value)
    station_repository = StationRepository(*get_repository_parameters(repository_parameter))
    availabilities = []
    station_dicts = station_repository.load_all_stations(start_date=start_date, end_date=end_date)
    logging.info("total: %i" % len(station_dicts))
    # pop stations one by one so each data frame can be freed early
    # (idiomatic replacement for the original `while True: ... break` loop)
    while station_dicts:
        station_dict = station_dicts.pop()
        position = station_dict["meta_data"]["position"]
        station_dict["data_frame"] = sample_up(station_dict["data_frame"], start_date, end_date)
        row_result = {
            "station_name": station_dict["name"],
            "lat": position["lat"],
            "lon": position["lon"],
            "available_data": get_available_data(station_dict)
        }
        availabilities.append(row_result)
        logging.debug("{station_name}: {lat} {lon} -- {available_data}".format(**row_result))
    df = pandas.DataFrame(availabilities)
    result_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "log",
        "calculate_available_data_%s.csv" % repository_parameter.value
    )
    df.to_csv(result_file)
def plot_station(station, start_date, end_date):
    """
    Plots measured values in the foreground and the average of all HUSCONET
    weather stations in the background.

    :param station: The station name which station should be plotted
    :param start_date: The start date of the plot
    :type start_date: str | datetime.datetime
    :param end_date: The end date of the plot
    :type end_date: str | datetime.datetime
    """
    station_repository = StationRepository()
    station_dict = station_repository.load_station(station, start_date, end_date, GermanWinterTime())
    station_df = station_dict['data_frame']
    station_df = insert_nans(station_df)  # discontinue the line where gaps are too big
    pyplot.plot(station_df.index, station_df.temperature, label=station)
    logging.debug("plotting {station} from {start} to {end}"
                  .format(station=station, start=station_df.index.min(), end=station_df.index.max()))

    husconet_station_df = load_husconet_temperature_average(start_date, end_date)
    pyplot.plot(husconet_station_df.index, husconet_station_df.temperature, alpha=0.3, label="Referenznetzwerk")
    # BUG FIX: the original logged the private station's date range here;
    # report the HUSCONET frame's own range instead.
    logging.debug("plotting HUSCONET from {start} to {end}"
                  .format(start=husconet_station_df.index.min(), end=husconet_station_df.index.max()))

    style_year_2016_plot(pyplot.gca())
    pyplot.legend()
    pyplot.show()
def plot_stations(data, start_date, end_date, time_zone=None, limit=0):
    """Box-plot the temperature distributions of several station collections
    side by side, one box per (title, weather_station, summary_dir) entry.

    :param data: iterable of (title, weather_station, summary_dir) triples
    :param start_date: first day to load
    :param end_date: last day to load
    :param time_zone: optional time zone passed to the repository
    :param limit: maximum number of stations per collection (0 = no limit,
        presumably — confirm against the repository implementation)
    """
    plot_df = pandas.DataFrame()
    fig = pyplot.figure()
    fig.canvas.set_window_title("vor_und_nach_dem_filtern")
    for title, weather_station, summary_dir in data:
        repository = StationRepository(weather_station, summary_dir)
        loaded_dicts = repository.load_all_stations(
            start_date, end_date, time_zone=time_zone, limit=limit)
        temperatures = [entry['data_frame'].temperature for entry in loaded_dicts]
        plot_df[title] = pandas.concat(temperatures, ignore_index=True)
    logging.debug("start plotting")
    ax = seaborn.boxplot(data=plot_df, width=.5)
    ax.set(ylabel="Temperatur (°C)")
    ax.yaxis.set_major_locator(mticker.MultipleLocator(10))  # draw line every 10 °C
    pyplot.grid(color='.8')  # a very light gray
    pyplot.show()
def gather_statistics(private_weather_stations_file_name):
    """Log how often each station software type occurs among all stations.

    :param private_weather_stations_file_name: file listing the stations
    """
    repository = StationRepository(private_weather_stations_file_name)
    stations_df = repository.get_all_stations()
    logging.info("total: %i" % len(stations_df))
    software_types = [get_software_type(station) for station in stations_df.index]
    for software_type, count in collections.Counter(software_types).items():
        logging.info(" %s : %i" % (software_type, count))
def plot_station(title, weather_stations, summary_dir, start_date, end_date):
    """
    Plots measured values in the foreground and the average of all HUSCONET weather stations in the background.

    :param title: The window title
    :type title: str
    :param weather_stations: path to file with list of weather stations
    :type weather_stations: str
    :param summary_dir: directory with all necessary summaries, possibly pre-filtered
    :type summary_dir: str
    :param start_date: The start date of the plot
    :type start_date: str | datetime.datetime
    :param end_date: The end date of the plot
    :type end_date: str | datetime.datetime
    """
    fig = pyplot.figure()
    fig.canvas.set_window_title(title)
    pyplot.rcParams['savefig.dpi'] = 300
    station_repository = StationRepository(weather_stations, summary_dir)
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        # limit=10  # for testing purposes
    )
    # one thin gray line per private weather station
    for station_dict in station_dicts:
        logging.debug("prepare plotting " + station_dict["name"])
        station_df = station_dict['data_frame']
        station_df = insert_nans(station_df)  # discontinue line if gap is too big
        pyplot.plot(station_df.index, station_df.temperature, linewidth=.4, color='gray', alpha=.8)
    logging.debug("load husconet")
    husconet_station_df = load_husconet_temperature_average(start_date, end_date)
    logging.debug("start plotting")
    # reference network average as a single blue line on top
    pyplot.plot(husconet_station_df.index, husconet_station_df.temperature, color="blue", linewidth=.4,
                label="Referenznetzwerk")
    # upper_line = (husconet_station_df.temperature + husconet_station_df.temperature_std * 3)
    # ax = upper_line.plot(color="green", alpha=0.4, label="avg(HUSCONET) + 3 $\sigma$(HUSCONET)")
    ax = pyplot.gca()
    style_year_2016_plot(ax)
    logging.debug("show plot")
    # proxy artists so the legend shows one entry per group instead of
    # one per plotted station line
    gray_line = mlines.Line2D([], [], color='gray', label="private Wetterstationen")  # only one entry for many
    blue_line = mlines.Line2D([], [], color="blue", label="Referenznetzwerk")  # proper line width to see color
    ax.legend(
        [blue_line, gray_line],
        [blue_line.get_label(), gray_line.get_label()],
        loc='best'
    )
    pyplot.show()
def demo():
    """Pick one station, draw a map of all stations, and print the
    temperatures measured by its Delaunay neighbours."""
    start_date = '2016-01-01T00:00'
    end_date = '2016-03-31T23:59'

    from filter_weather_data.filters import StationRepository

    station_repository = StationRepository()
    station_dicts = station_repository.load_all_stations(start_date, end_date, limit=20)
    meta_data_df = station_repository.get_all_stations()

    chosen_index = 9
    search_for = station_dicts[chosen_index]
    print("picked", search_for["name"], "to look for")

    draw_map([(lat, lon, label) for label, (lat, lon) in meta_data_df.iterrows()])

    # otherwise triangles will contain the searched point as well.
    del station_dicts[chosen_index]

    triangulator = DelaunayTriangulator(station_dicts, start_date, end_date)
    for temperature, distance in triangulator.find_delaunay_neighbours(search_for, "2016-02-05T03:01"):
        print("measured", temperature, "°C in", distance, "meters distance")
def plot_station(station, start_date, end_date):
    """
    Plots the regression of the temperature difference on solar radiation

    :param station: The station name which station should be plotted
    :param start_date: The start date of the plot
    :param end_date: The end date of the plot
    """
    summary_dir = os.path.join(
        PROCESSED_DATA_DIR,
        "filtered_station_summaries_frequent"
    )
    outdoor_stations = os.path.join(
        PROCESSED_DATA_DIR,
        "filtered_stations",
        "station_dicts_outdoor.csv"
    )
    station_repository = StationRepository(outdoor_stations, summary_dir)
    station_dict = station_repository.load_station(station, start_date, end_date)
    station_df = station_dict["data_frame"]

    reference_temperature_df = load_husconet_temperature_average(start_date, end_date)
    reference_radiation_df = load_husconet_radiation_average(start_date, end_date)

    # temperature difference between the crowdsourced station and the reference
    temp_df = station_df.join(reference_temperature_df, how='inner', rsuffix="_reference_temperature")
    delta_temperature = (temp_df.temperature - temp_df.temperature_reference_temperature).rename("temperature_delta")
    delta_df = pandas.concat([temp_df, delta_temperature], axis=1)
    delta_df = delta_df.join(reference_radiation_df, how='left')

    # restrict the regression to sunny periods
    df_only_sunshine = delta_df[(delta_df.radiation > SUNSHINE_MINIMUM_THRESHOLD)]
    df_only_sunshine = df_only_sunshine.dropna(axis=0, how='any')

    X = df_only_sunshine.temperature_delta
    # BUG FIX: the y axis is labelled as radiation and the docstring promises a
    # regression on solar radiation, but the original plotted the reference
    # temperature here; use the radiation column instead.
    Y = df_only_sunshine.radiation

    fig = pyplot.figure()
    # BUG FIX: missing space between station name and description in the title
    fig.canvas.set_window_title(station + " temperature regressed on radiation")
    pyplot.scatter(X, Y, marker="x", color="gray")
    # first-degree polynomial fit == linear regression line
    pyplot.plot(X, numpy.poly1d(numpy.polyfit(X, Y, 1))(X), color="gray", alpha=.8, label=station)
    pyplot.gca().set_ylabel(r'Globalstrahlung ($\frac{W}{m^2}$)')
    pyplot.gca().set_xlabel('Temperaturdifferenz Crowdsourced - Referenznetzwerk (°C)')
    pyplot.show()
def run(testing=False):
    """Build training and evaluation vector CSVs, splitting the HUSCONET
    stations 70/30 into training and evaluation targets.

    :param testing: if True, shorten the period and limit the station counts
    """
    start_date = "2016-01-01T00:00"
    end_date = "2016-03-31" if testing else "2016-12-31T23:59"

    eddh_df = load_eddh(start_date, end_date)

    station_repository = StationRepository(*get_repository_parameters(RepositoryParameter.START_FULL_SENSOR))
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        limit_to_temperature=False,
        limit=15 if testing else 0  # for testing purposes
    )

    husconet_dicts = HusconetStationRepository().load_all_stations(
        start_date,
        end_date,
        limit=3 if testing else 0  # for testing purposes
    )
    # random 70/30 split of the HUSCONET stations
    random.shuffle(husconet_dicts)
    split_point = int(len(husconet_dicts) * .7)
    training_dicts = husconet_dicts[:split_point]
    evaluation_dicts = husconet_dicts[split_point:]
    logging.info("training stations: %s" % [station["name"] for station in training_dicts])
    logging.info("evaluation stations: %s" % [station["name"] for station in evaluation_dicts])

    logging.debug("prepare evaluation")
    evaluation_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks", "evaluation_data_husconet.csv")
    # pass a copy of station_dicts so the second join still sees every station
    join_to_big_vector(evaluation_csv_file, station_dicts[:], evaluation_dicts, eddh_df)

    logging.debug("prepare training")
    training_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks", "training_data_husconet.csv")
    join_to_big_vector(training_csv_file, station_dicts, training_dicts, eddh_df)
    logging.debug("done")
def demo():
    """Load all stations for a single moment, up-sample their data frames,
    grid the values, and plot the result."""
    from filter_weather_data.filters import StationRepository
    from gather_weather_data.husconet import GermanWinterTime

    date = '2016-12-01T00:00'
    start_date = date  # first value to load
    end_date = date  # load until this date (plus some margin)
    t = date  # check the values at this given time
    bins_per_step = 10000

    station_repository = StationRepository()
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        time_zone=GermanWinterTime(),
        # limit=300
    )
    for station_dict in station_dicts:
        # 30 minutes decay
        station_dict["data_frame"] = sample_up(station_dict["data_frame"], start_date, end_date, 30)

    margin = 0.01
    plot(*grid_data(margin, station_dicts, t, bins_per_step))
def run_clustering(repository_parameter_name, start_date, end_date, limit):
    """
    Cluster the stations' time series hierarchically and display the result.

    :param repository_parameter_name: One of the types from ``RepositoryParameter``
    :param start_date: First day
    :param end_date: Last day
    :param limit: Limit the number of examined stations
    :return: Show clustering
    """
    params = get_repository_parameters(repository_parameter_name)
    repository = StationRepository(*params)
    station_dicts = repository.load_all_stations(start_date, end_date, limit=limit)
    comparator = StationTimeSeriesComparator(station_dicts)
    stations = [Station(station_dict) for station_dict in station_dicts]
    cluster = HierarchicalClustering(
        stations,
        comparator.compare_time_series,
        num_processes=4
    )
    cluster.cluster()
    cluster.display(print_function=logging.debug)
    # NOTE(review): reaches into the private _data attribute of the clustering
    logging.info(cluster._data)
def plot_station(title, weather_stations, summary_dir, start_date, end_date):
    """
    Plots measured values in the foreground and the average of all HUSCONET weather stations in the background.

    :param title: The window title
    :type title: str
    :param weather_stations: path to file with list of weather stations
    :type weather_stations: str
    :param summary_dir: directory with all necessary summaries, possibly pre-filtered
    :type summary_dir: str
    :param start_date: The start date of the plot
    :type start_date: str | datetime.datetime
    :param end_date: The end date of the plot
    :type end_date: str | datetime.datetime
    """
    station_repository = StationRepository(weather_stations, summary_dir)
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        limit=10,  # for testing the design
        limit_to_temperature=False)
    dwd_station_df = load_dwd_precipitation(start_date, end_date)
    # aggregate the daily DWD values to monthly sums (used to clean station data)
    monthly_dwd_df = dwd_station_df.groupby(pandas.TimeGrouper("M")).sum()
    figure = pyplot.figure()
    figure.canvas.set_window_title(title)
    # second y axis (right-hand side) for the DWD daily precipitation
    axis_2 = figure.add_subplot(111)
    axis_2.yaxis.tick_right()
    axis_2.yaxis.set_label_position("right")
    axis_2.set_ylabel("Niederschlag (mm/Tag)")
    axis_2.plot(dwd_station_df.index, dwd_station_df.precipitation, label="DWD Tageswerte", alpha=.8)
    axis_2.fill_between(dwd_station_df.index, dwd_station_df.precipitation, facecolors="b", interpolate=True,
                        alpha=.8)
    # precipitation cannot be negative, so pin the lower bound to zero
    _, max_precipitation = axis_2.get_ylim()
    axis_2.set_ylim((0, max_precipitation))
    # overlay axis sharing the x axis for the private stations' hourly values
    axis_1 = figure.add_subplot(111, sharex=axis_2, frameon=False)
    for station_dict in station_dicts:
        logging.debug("prepare plotting " + station_dict["name"])
        station_df = station_dict['data_frame']
        station_df = insert_nans(
            station_df)  # discontinue line if gap is too big
        station_df = clean_data(station_df, monthly_dwd_df)
        axis_1.plot(station_df.index, station_df.precipitation, ".", alpha=.6)
    axis_1.set_xlabel('2016')
    axis_1.xaxis.set_major_locator(mdates.MonthLocator())
    axis_1.xaxis.set_major_formatter(mdates.DateFormatter('%m'))
    axis_1.set_ylabel('Niederschlag (mm/Stunde)')
    _, max_precipitation = axis_1.get_ylim()
    axis_1.set_ylim((0, max_precipitation))
    axis_1.margins(x=0)  # remove margins for both axes
    # proxy artists so the legend has one entry per data source
    blue_patch = mpatches.Patch(color='blue', label='DWD Tageswerte', alpha=.8)
    grey_dot = mlines.Line2D([], [], color='grey', marker='.', linestyle=" ",
                             label='private Wetterstation Stundenwerte')
    pyplot.legend(handles=[blue_patch, grey_dot])
    pyplot.show()
def _save_station_positions(station_repository, station_names, csv_path):
    """Write one row per station (index ``station``, columns ``lat``/``lon``)
    to *csv_path*, looking the positions up in *station_repository*."""
    df_data = []
    for station_name in station_names:
        meta_info = station_repository.get_meta_info(station_name)
        df_data.append({
            "station": station_name,
            "lat": meta_info.lat,
            "lon": meta_info.lon,
        })
    df = pandas.DataFrame(df_data)
    df.set_index("station", inplace=True)
    df.to_csv(csv_path)


def gather_statistics(repository_parameter, start_date, end_date):
    """Report which stations provide precipitation and wind data.

    Writes per-month availability CSVs for precipitation and wind, plus one
    position CSV per sensor type listing the stations that have such data.

    :param repository_parameter: which station repository to examine
    :param start_date: first day of the examined period
    :param end_date: last day of the examined period
    """
    logging.info("repository: %s" % repository_parameter.value)
    station_repository = StationRepository(*get_repository_parameters(repository_parameter))
    available_precipitation = {}
    available_wind = {}
    station_dicts = station_repository.load_all_stations(
        start_date=start_date,
        end_date=end_date,
        limit_to_temperature=False,
        # limit=10  # for testing purposes
    )
    logging.info("total: %i" % len(station_dicts))
    stations_with_precipitation = set()
    stations_with_wind = set()
    # pop stations one by one so their data frames can be freed early
    # (idiomatic replacement for the original `while True: ... break` loop)
    while station_dicts:
        station_dict = station_dicts.pop()  # free memory whenever you can
        precipitation = get_available_precipitation(station_dict)
        if len(precipitation):
            available_precipitation[station_dict["name"]] = precipitation
            stations_with_precipitation.add(station_dict["name"])
        wind = get_available_wind(station_dict)
        if len(wind):
            available_wind[station_dict["name"]] = wind
            stations_with_wind.add(station_dict["name"])

    # per-month availability, one column per station
    df_precipitation = pandas.DataFrame(available_precipitation)
    df_wind = pandas.DataFrame(available_wind)
    result_file_precipitation = os.path.join(
        PROCESSED_DATA_DIR,
        "misc",
        "precipitation_per_month_%s.csv" % repository_parameter.value)
    df_precipitation.to_csv(result_file_precipitation)
    result_file_wind = os.path.join(
        PROCESSED_DATA_DIR,
        "misc",
        "wind_per_month_%s.csv" % repository_parameter.value)
    df_wind.to_csv(result_file_wind)

    # station position lists, one file per sensor type (deduplicated via helper)
    station_dicts_wind = os.path.join(PROCESSED_DATA_DIR, "filtered_stations", "station_dicts_wind.csv")
    _save_station_positions(station_repository, stations_with_wind, station_dicts_wind)
    station_dicts_precipitation = os.path.join(
        PROCESSED_DATA_DIR,
        "filtered_stations",
        "station_dicts_precipitation.csv")
    _save_station_positions(station_repository, stations_with_precipitation, station_dicts_precipitation)