Example #1
def demo():
    start_date = '2016-01-01T00:00'
    end_date = '2016-03-31T23:59'
    from filter_weather_data.filters import StationRepository
    from .nearest_k_finder import NearestKFinder
    station_repository = StationRepository()
    station_dicts = station_repository.load_all_stations(start_date,
                                                         end_date,
                                                         limit=20)
    chosen_index = 9
    search_for = station_dicts[chosen_index]
    print("picked", station_dicts[chosen_index]["name"], "to look for")
    del station_dicts[chosen_index]  # otherwise the searched station would be its own nearest neighbour
    k_nearest_finder = NearestKFinder(station_dicts, start_date, end_date)
    t = search_for["data_frame"].index.values[0]
    neighbours = k_nearest_finder.find_k_nearest_neighbours(search_for, t, 3)
    t_actual = search_for["data_frame"].loc[t].temperature
    result = get_interpolation_results(neighbours, t_actual)
    neighbours = k_nearest_finder.find_k_nearest_neighbours(search_for, t, -1)
    result.update(get_interpolation_results(neighbours, t_actual))
    print("actual measurement:", t_actual)
    for neighbour in neighbours:
        temperature, distance = neighbour
        print("measured", temperature, "°C in", distance, "meters distance")
    items = list(result.items())
    items.sort(key=lambda el: el[0])
    for method, value in items:
        print("method", method, "value", value**.5)
Example #2
def run():
    start_date = "2016-01-01T00:00"
    end_date = "2016-12-31T23:59"

    eddh_df = load_eddh(start_date, end_date)
    station_repository = StationRepository(*get_repository_parameters(
        RepositoryParameter.ONLY_OUTDOOR_AND_SHADED))
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        # limit=5,  # for testing purposes
        limit_to_temperature=False)

    random.shuffle(station_dicts)
    split_point = int(len(station_dicts) * .7)
    training_dicts, evaluation_dicts = (station_dicts[:split_point],
                                        station_dicts[split_point:])

    logging.info("training stations: %s" %
                 [station["name"] for station in training_dicts])
    logging.info("evaluation stations: %s" %
                 [station["name"] for station in evaluation_dicts])

    training_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks",
                                     "training_data_filtered.csv")
    join_to_big_vector(training_csv_file, training_dicts, eddh_df)

    evaluation_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks",
                                       "evaluation_data_filtered.csv")
    join_to_big_vector(evaluation_csv_file, evaluation_dicts, eddh_df)
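The shuffle-and-slice 70/30 split above recurs in several of these examples; a hypothetical helper capturing the idiom (the name and the seed parameter are illustrative additions):

import random

def split_train_eval(station_dicts, training_share=.7, seed=None):
    # Shuffle a copy so the caller's list order is preserved.
    shuffled = list(station_dicts)
    random.Random(seed).shuffle(shuffled)
    split_point = int(len(shuffled) * training_share)
    return shuffled[:split_point], shuffled[split_point:]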
def get_station_dicts(start_date, end_date):
    # repository_parameter = RepositoryParameter.START
    repository_parameter = RepositoryParameter.ONLY_OUTDOOR_AND_SHADED
    station_repository = StationRepository(
        *get_repository_parameters(repository_parameter))
    station_dicts = station_repository.load_all_stations(start_date, end_date)
    return station_dicts
def score_algorithm(start_date, end_date, repository_parameters, limit=0, interpolation_name="NONE"):
    station_repository = CrowdsoucingStationRepository(*repository_parameters)

    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        limit=limit
    )

    random.shuffle(station_dicts)
    neighbour_station_dicts = station_dicts[:int(.7 * len(station_dicts))]  # only use 70%

    target_station_dicts = HusconetStationRepository().load_all_stations(
        start_date,
        end_date,
        limit=limit
    )

    setup_logger(interpolation_name)
    logging.info("General Overview")
    logging.info("targets: " + " ".join([station_dict["name"] for station_dict in target_station_dicts]))
    logging.info("neighbours: " + " ".join([station_dict["name"] for station_dict in neighbour_station_dicts]))
    logging.info("End overview")

    logging.info("Several Runs")
    target_station_dicts_len = str(len(target_station_dicts))

    overall_result = itertools.starmap(do_interpolation_scoring, [
        [
            target_station_dict,
            j,
            target_station_dicts_len,
            neighbour_station_dicts,
            start_date,
            end_date
        ] for j, target_station_dict in enumerate(target_station_dicts)
    ])

    logging.info("end targets")

    logging.info("overall result")
    overall_result_df = pandas.concat(overall_result)
    column_names = overall_result_df.columns.values.tolist()
    methods = set()
    for column_name in column_names:
        method, _ = column_name.split("--")
        methods.add(method)
    for method in methods:
        overall_total = numpy.nansum(overall_result_df[method + "--total"])
        overall_n = int(numpy.nansum(overall_result_df[method + "--n"]))
        overall_rmse = numpy.sqrt(overall_total / overall_n)
        score_str = "%.3f" % overall_rmse
        logging.info(method + " " * (12 - len(method)) + score_str + " n=" + str(overall_n))

    logging.info("end overall result")

    overall_result_df.to_csv("interpolation_result_husconet_median_5_{date}_{interpolation_name}.csv".format(
        date=datetime.datetime.now().isoformat().replace(":", "-").replace(".", "-"),
        interpolation_name=interpolation_name
    ))
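The per-method aggregation pools the squared-error sums and counts across all stations before taking the root, i.e. RMSE = sqrt(sum(total) / sum(n)); pooling this way weights every single measurement equally instead of averaging per-station scores. A small worked example with made-up numbers:

import numpy

totals = numpy.array([12.5, numpy.nan, 7.3])  # "<method>--total" columns
counts = numpy.array([50, numpy.nan, 30])     # "<method>--n" columns
overall_rmse = numpy.sqrt(numpy.nansum(totals) / numpy.nansum(counts))
print("%.3f" % overall_rmse)  # 0.497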
def gather_statistics(repository_parameter, start_date, end_date):
    logging.info("repository: %s" % repository_parameter.value)
    station_repository = StationRepository(
        *get_repository_parameters(repository_parameter))
    availabilities = []
    station_dicts = station_repository.load_all_stations(start_date=start_date,
                                                         end_date=end_date)
    logging.info("total: %i" % len(station_dicts))
    while station_dicts:
        station_dict = station_dicts.pop()  # free memory whenever you can
        position = station_dict["meta_data"]["position"]
        station_dict["data_frame"] = sample_up(station_dict["data_frame"],
                                               start_date, end_date)
        row_result = {
            "station_name": station_dict["name"],
            "lat": position["lat"],
            "lon": position["lon"],
            "available_data": get_available_data(station_dict)
        }
        availabilities.append(row_result)
        logging.debug("{station_name}: {lat} {lon} -- {available_data}".format(
            **row_result))
    df = pandas.DataFrame(availabilities)
    result_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "log",
        "calculate_available_data_%s.csv" % repository_parameter.value)
    df.to_csv(result_file)
def plot_station(station, start_date, end_date):
    """
    Plots measured values in the foreground and the average of all HUSCONET weather stations in the background.
    
    :param station: The name of the station to plot
    :param start_date: The start date of the plot
    :type start_date: str | datetime.datetime
    :param end_date: The end date of the plot
    :type end_date: str | datetime.datetime
    """
    station_repository = StationRepository()
    station_dict = station_repository.load_station(station, start_date, end_date, GermanWinterTime())
    station_df = station_dict['data_frame']
    station_df = insert_nans(station_df)
    pyplot.plot(station_df.index, station_df.temperature, label=station)
    logging.debug("plotting {station} from {start} to {end}"
                  .format(station=station, start=station_df.index.min(), end=station_df.index.max()))

    husconet_station_df = load_husconet_temperature_average(start_date, end_date)
    pyplot.plot(husconet_station_df.index, husconet_station_df.temperature, alpha=0.3, label="Referenznetzwerk")
    logging.debug("plotting HUSCONET from {start} to {end}"
                  .format(start=station_df.index.min(), end=station_df.index.max()))

    style_year_2016_plot(pyplot.gca())
    pyplot.legend()
    pyplot.show()
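`insert_nans` itself is not shown in these examples; a later comment says it should "discontinue the line if the gap is too big". A guess at how such a helper can be written, inserting a NaN row inside every oversized gap so matplotlib breaks the line there (the name and threshold are assumptions):

import numpy
import pandas

def insert_nans_sketch(station_df, max_gap=pandas.Timedelta(hours=1)):
    gaps = station_df.index.to_series().diff() > max_gap  # detect big gaps
    fillers = station_df.index[gaps] - max_gap / 2        # timestamps inside the gaps
    filler_df = pandas.DataFrame(numpy.nan, index=fillers, columns=station_df.columns)
    return pandas.concat([station_df, filler_df]).sort_index()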
Example #7
def plot_stations(data, start_date, end_date, time_zone=None, limit=0):
    """
    """
    plot_df = pandas.DataFrame()

    fig = pyplot.figure()
    fig.canvas.set_window_title("vor_und_nach_dem_filtern")

    for title, weather_station, summary_dir in data:
        station_repository = StationRepository(weather_station, summary_dir)
        station_dicts = station_repository.load_all_stations(
            start_date, end_date, time_zone=time_zone, limit=limit)
        temperatures = [
            station_dict['data_frame'].temperature
            for station_dict in station_dicts
        ]
        plot_df[title] = pandas.concat(temperatures, ignore_index=True)

    logging.debug("start plotting")
    ax = seaborn.boxplot(data=plot_df, width=.5)
    ax.set(ylabel="Temperatur (°C)")
    ax.yaxis.set_major_locator(
        mticker.MultipleLocator(10))  # draw line every 10 °C
    pyplot.grid(color='.8')  # a very light gray
    pyplot.show()
def gather_statistics(private_weather_stations_file_name):
    station_repository = StationRepository(private_weather_stations_file_name)
    software_types = []
    stations_df = station_repository.get_all_stations()
    logging.info("total: %i" % len(stations_df))
    for station in stations_df.index:
        software_types.append(get_software_type(station))
    for software_type, count in collections.Counter(software_types).items():
        logging.info("  %s : %i" % (software_type, count))
def plot_station(title, weather_stations, summary_dir, start_date, end_date):
    """
    Plots measured values in the foreground and the average of all HUSCONET weather stations in the background.

    :param title: The window title
    :type title: str
    :param weather_stations: path to file with list of weather stations
    :type weather_stations: str
    :param summary_dir: directory with all necessary summaries, possibly pre-filtered
    :type summary_dir: str
    :param start_date: The start date of the plot
    :type start_date: str | datetime.datetime
    :param end_date: The end date of the plot
    :type end_date: str | datetime.datetime
    """

    fig = pyplot.figure()
    fig.canvas.set_window_title(title)
    pyplot.rcParams['savefig.dpi'] = 300

    station_repository = StationRepository(weather_stations, summary_dir)
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        # limit=10  # for testing purposes
    )
    for station_dict in station_dicts:
        logging.debug("prepare plotting " + station_dict["name"])
        station_df = station_dict['data_frame']
        station_df = insert_nans(station_df)  # discontinue line if gap is too big
        pyplot.plot(station_df.index, station_df.temperature, linewidth=.4, color='gray', alpha=.8)

    logging.debug("load husconet")
    husconet_station_df = load_husconet_temperature_average(start_date, end_date)

    logging.debug("start plotting")
    pyplot.plot(husconet_station_df.index, husconet_station_df.temperature, color="blue", linewidth=.4,
                label="Referenznetzwerk")
    # upper_line = (husconet_station_df.temperature + husconet_station_df.temperature_std * 3)
    # ax = upper_line.plot(color="green", alpha=0.4, label="avg(HUSCONET) + 3 $\sigma$(HUSCONET)")

    ax = pyplot.gca()
    style_year_2016_plot(ax)

    logging.debug("show plot")
    gray_line = mlines.Line2D([], [], color='gray', label="private Wetterstationen")  # only one entry for many
    blue_line = mlines.Line2D([], [], color="blue", label="Referenznetzwerk")  # proper line width to see color
    ax.legend(
        [blue_line, gray_line],
        [blue_line.get_label(), gray_line.get_label()],
        loc='best'
    )
    pyplot.show()
def demo():
    start_date = '2016-01-01T00:00'
    end_date = '2016-03-31T23:59'
    from filter_weather_data.filters import StationRepository
    station_repository = StationRepository()
    station_dicts = station_repository.load_all_stations(start_date, end_date, limit=20)
    meta_data_df = station_repository.get_all_stations()
    chosen_index = 9
    search_for = station_dicts[chosen_index]
    print("picked", station_dicts[chosen_index]["name"], "to look for")
    draw_map([(lat, lon, label)
              for label, (lat, lon) in meta_data_df.iterrows()])
    del station_dicts[chosen_index]  # otherwise triangles will contain the searched point as well.
    delaunay_triangulation = DelaunayTriangulator(station_dicts, start_date, end_date)
    neighbours = delaunay_triangulation.find_delaunay_neighbours(search_for, "2016-02-05T03:01")
    for neighbour in neighbours:
        temperature, distance = neighbour
        print("measured", temperature, "°C in", distance, "meters distance")
def plot_station(station, start_date, end_date):
    """
    Plots the regression of the temperature difference on solar radiation

    :param station: The name of the station to plot
    :param start_date: The start date of the plot
    :param end_date: The end date of the plot
    """
    summary_dir = os.path.join(
        PROCESSED_DATA_DIR,
        "filtered_station_summaries_frequent"
    )
    outdoor_stations = os.path.join(
        PROCESSED_DATA_DIR,
        "filtered_stations",
        "station_dicts_outdoor.csv"
    )
    station_repository = StationRepository(outdoor_stations, summary_dir)
    station_dict = station_repository.load_station(station, start_date, end_date)
    station_df = station_dict["data_frame"]

    reference_temperature_df = load_husconet_temperature_average(start_date, end_date)
    reference_radiation_df = load_husconet_radiation_average(start_date, end_date)

    temp_df = station_df.join(reference_temperature_df, how='inner', rsuffix="_reference_temperature")
    delta_temperature = (temp_df.temperature - temp_df.temperature_reference_temperature).rename("temperature_delta")
    delta_df = pandas.concat([temp_df, delta_temperature], axis=1)

    delta_df = delta_df.join(reference_radiation_df, how='left')
    df_only_sunshine = delta_df[(delta_df.radiation > SUNSHINE_MINIMUM_THRESHOLD)]
    df_only_sunshine = df_only_sunshine.dropna(axis=0, how='any')

    X = df_only_sunshine.temperature_delta
    Y = df_only_sunshine.radiation  # global radiation, matching the y-axis label below

    fig = pyplot.figure()
    fig.canvas.set_window_title(station + " temperature regressed on radiation")

    pyplot.scatter(X, Y, marker="x", color="gray")
    pyplot.plot(X, numpy.poly1d(numpy.polyfit(X, Y, 1))(X), color="gray", alpha=.8, label=station)
    pyplot.gca().set_ylabel(r'Globalstrahlung ($\frac{W}{m^2}$)')
    pyplot.gca().set_xlabel('Temperaturdifferenz Crowdsourced - Referenznetzwerk (°C)')
    pyplot.show()
Example #12
def run(testing=False):
    start_date = "2016-01-01T00:00"
    end_date = "2016-12-31T23:59" if not testing else "2016-03-31"

    eddh_df = load_eddh(start_date, end_date)
    station_repository = StationRepository(
        *get_repository_parameters(RepositoryParameter.START_FULL_SENSOR))
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        limit_to_temperature=False,
        limit=0 if not testing else 15  # for testing purposes
    )

    husconet_dicts = HusconetStationRepository().load_all_stations(
        start_date,
        end_date,
        limit=0 if not testing else 3  # for testing purposes
    )
    random.shuffle(husconet_dicts)
    split_point = int(len(husconet_dicts) * .7)
    training_dicts, evaluation_dicts = (husconet_dicts[:split_point],
                                        husconet_dicts[split_point:])
    logging.info("training stations: %s" %
                 [station["name"] for station in training_dicts])
    logging.info("evaluation stations: %s" %
                 [station["name"] for station in evaluation_dicts])

    logging.debug("prepare evaluation")
    evaluation_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks",
                                       "evaluation_data_husconet.csv")
    join_to_big_vector(evaluation_csv_file, station_dicts[:], evaluation_dicts,
                       eddh_df)

    logging.debug("prepare training")
    training_csv_file = os.path.join(PROCESSED_DATA_DIR, "neural_networks",
                                     "training_data_husconet.csv")
    join_to_big_vector(training_csv_file, station_dicts, training_dicts,
                       eddh_df)
    logging.debug("done")
Example #13
def demo():
    from filter_weather_data.filters import StationRepository
    from gather_weather_data.husconet import GermanWinterTime
    date = '2016-12-01T00:00'
    start_date = date  # first value to load
    end_date = date  # load until this date (plus some margin)
    t = date  # check the values at this given time
    bins_per_step = 10000
    station_repository = StationRepository()
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        time_zone=GermanWinterTime(),
        # limit=300
    )
    for station_dict in station_dicts:
        station_dict["data_frame"] = sample_up(station_dict["data_frame"],
                                               start_date, end_date,
                                               30)  # 30 minutes decay
    margin = 0.01
    values = grid_data(margin, station_dicts, t, bins_per_step)
    plot(*values)
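`sample_up` is only hinted at by the "30 minutes decay" comment. A guess at its behaviour, re-indexing onto a regular one-minute grid and forward-filling across gaps of at most the decay length (the real implementation may differ):

import pandas

def sample_up_sketch(data_frame, start_date, end_date, decay_minutes=30):
    minute_index = pandas.date_range(start_date, end_date, freq="1T")
    # On a one-minute grid, limit=decay_minutes lets a value persist
    # for at most `decay_minutes` minutes.
    return data_frame.reindex(minute_index).ffill(limit=decay_minutes)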
def run_clustering(repository_parameter_name, start_date, end_date, limit):
    """

    :param repository_parameter_name: One of the types from ``RepositoryParameter``
    :param start_date: First day
    :param end_date: Last day
    :param limit: Limit the number of examined stations
    :return: Show clustering
    """
    params = get_repository_parameters(repository_parameter_name)
    station_repository = StationRepository(*params)
    station_dicts = station_repository.load_all_stations(start_date,
                                                         end_date,
                                                         limit=limit)
    station_time_series_comparator = StationTimeSeriesComparator(station_dicts)
    stations = [Station(station_dict) for station_dict in station_dicts]

    cluster = HierarchicalClustering(
        stations,
        station_time_series_comparator.compare_time_series,
        num_processes=4)
    cluster.cluster()
    cluster.display(print_function=logging.debug)
    logging.info(cluster._data)
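`StationTimeSeriesComparator.compare_time_series` is passed as the distance function, so HierarchicalClustering apparently expects a callable taking two items and returning a distance. A sketch of one plausible metric, RMSE on shared timestamps (this is an assumption about the comparator, not its actual implementation):

import pandas

def temperature_distance(series_a, series_b):
    # Align the two series on their common timestamps, then compute RMSE.
    joined = pandas.concat([series_a, series_b], axis=1, join="inner").dropna()
    difference = joined.iloc[:, 0] - joined.iloc[:, 1]
    return float((difference ** 2).mean() ** .5)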
def plot_station(title, weather_stations, summary_dir, start_date, end_date):
    """
    Plots measured values in the foreground and the average of all HUSCONET weather stations in the background.

    :param title: The window title
    :type title: str
    :param weather_stations: path to file with list of weather stations
    :type weather_stations: str
    :param summary_dir: directory with all necessary summaries, possibly pre-filtered
    :type summary_dir: str
    :param start_date: The start date of the plot
    :type start_date: str | datetime.datetime
    :param end_date: The end date of the plot
    :type end_date: str | datetime.datetime
    """

    station_repository = StationRepository(weather_stations, summary_dir)
    station_dicts = station_repository.load_all_stations(
        start_date,
        end_date,
        limit=10,  # for testing the design
        limit_to_temperature=False)

    dwd_station_df = load_dwd_precipitation(start_date, end_date)
    monthly_dwd_df = dwd_station_df.groupby(pandas.TimeGrouper("M")).sum()

    figure = pyplot.figure()
    figure.canvas.set_window_title(title)

    axis_2 = figure.add_subplot(111)
    axis_2.yaxis.tick_right()
    axis_2.yaxis.set_label_position("right")
    axis_2.set_ylabel("Niederschlag (mm/Tag)")
    axis_2.plot(dwd_station_df.index,
                dwd_station_df.precipitation,
                label="DWD Tageswerte",
                alpha=.8)
    axis_2.fill_between(dwd_station_df.index,
                        dwd_station_df.precipitation,
                        facecolors="b",
                        interpolate=True,
                        alpha=.8)
    _, max_precipitation = axis_2.get_ylim()
    axis_2.set_ylim((0, max_precipitation))

    axis_1 = figure.add_subplot(111, sharex=axis_2, frameon=False)
    for station_dict in station_dicts:
        logging.debug("prepare plotting " + station_dict["name"])
        station_df = station_dict['data_frame']
        station_df = insert_nans(station_df)  # discontinue line if gap is too big
        station_df = clean_data(station_df, monthly_dwd_df)
        axis_1.plot(station_df.index, station_df.precipitation, ".", alpha=.6)

    axis_1.set_xlabel('2016')
    axis_1.xaxis.set_major_locator(mdates.MonthLocator())
    axis_1.xaxis.set_major_formatter(mdates.DateFormatter('%m'))
    axis_1.set_ylabel('Niederschlag (mm/Stunde)')
    _, max_precipitation = axis_1.get_ylim()
    axis_1.set_ylim((0, max_precipitation))

    axis_1.margins(x=0)  # remove margins for both axes

    blue_patch = mpatches.Patch(color='blue', label='DWD Tageswerte', alpha=.8)
    grey_dot = mlines.Line2D([], [],
                             color='grey',
                             marker='.',
                             linestyle=" ",
                             label='private Wetterstation Stundenwerte')
    pyplot.legend(handles=[blue_patch, grey_dot])

    pyplot.show()
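The second overlaid subplot above (`add_subplot(111, sharex=axis_2, frameon=False)` plus right-hand ticks) is an older idiom for two y-scales over one x-axis; matplotlib's `twinx()` is the more common equivalent, sketched here with dummy data:

from matplotlib import pyplot

figure, left_axis = pyplot.subplots()
right_axis = left_axis.twinx()  # second y-scale sharing the same x-axis
left_axis.plot([1, 2, 3], [0.1, 0.4, 0.2], color="gray")
right_axis.plot([1, 2, 3], [10, 5, 20], color="blue")
pyplot.show()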
def gather_statistics(repository_parameter, start_date, end_date):
    logging.info("repository: %s" % repository_parameter.value)
    station_repository = StationRepository(
        *get_repository_parameters(repository_parameter))
    available_precipitation = {}
    available_wind = {}
    station_dicts = station_repository.load_all_stations(
        start_date=start_date,
        end_date=end_date,
        limit_to_temperature=False,
        # limit=10  # for testing purposes
    )
    logging.info("total: %i" % len(station_dicts))
    stations_with_precipitation = set()
    stations_with_wind = set()
    while station_dicts:
        station_dict = station_dicts.pop()  # free memory whenever you can
        precipitation = get_available_precipitation(station_dict)
        if len(precipitation):
            available_precipitation[station_dict["name"]] = precipitation
            stations_with_precipitation.add(station_dict["name"])
        wind = get_available_wind(station_dict)
        if len(wind):
            available_wind[station_dict["name"]] = wind
            stations_with_wind.add(station_dict["name"])
    df_precipitation = pandas.DataFrame(available_precipitation)
    df_wind = pandas.DataFrame(available_wind)
    result_file_precipitation = os.path.join(
        PROCESSED_DATA_DIR, "misc",
        "precipitation_per_month_%s.csv" % repository_parameter.value)
    df_precipitation.to_csv(result_file_precipitation)
    result_file_wind = os.path.join(
        PROCESSED_DATA_DIR, "misc",
        "wind_per_month_%s.csv" % repository_parameter.value)
    df_wind.to_csv(result_file_wind)

    station_dicts_wind = os.path.join(PROCESSED_DATA_DIR, "filtered_stations",
                                      "station_dicts_wind.csv")
    df_data = []
    for station_with_wind in stations_with_wind:
        meta_info = station_repository.get_meta_info(station_with_wind)
        df_data.append({
            "station": station_with_wind,
            "lat": meta_info.lat,
            "lon": meta_info.lon
        })
    df = pandas.DataFrame(df_data)
    df.set_index("station", inplace=True)
    df.to_csv(station_dicts_wind)

    station_dicts_precipitation = os.path.join(
        PROCESSED_DATA_DIR, "filtered_stations",
        "station_dicts_precipitation.csv")
    df_data = []
    for station_with_precipitation in stations_with_precipitation:
        meta_info = station_repository.get_meta_info(
            station_with_precipitation)
        df_data.append({
            "station": station_with_precipitation,
            "lat": meta_info.lat,
            "lon": meta_info.lon,
        })
    df = pandas.DataFrame(df_data)
    df.set_index("station", inplace=True)
    df.to_csv(station_dicts_precipitation)