def csv_ld_fill():
    aq_location, aq_dicts = ld_raw_fetch.load_aq_original()
    for aq_name in aq_location:
        aq_dict = aq_dicts[aq_name]
        start_dt_o, end_dt_o = datetime.strptime(list(aq_dict.keys())[0], format_string), \
                               datetime.strptime(list(aq_dict.keys())[-1], format_string)

        # First, insert an all-None row for every missing timestamp
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = format_ld_dt_string(dt_o)
            if dt_s not in aq_dict:
                aq_dict[dt_s] = [None] * 3

        # Treat negative readings as missing
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = format_ld_dt_string(dt_o)
            data = aq_dict[dt_s]
            for column in range(len(data)):
                if data[column] is not None and data[column] < 0:
                    data[column] = None

        start_dt_o += timedelta(hours=1)
        end_dt_o -= timedelta(hours=1)
        count = 0
        # Then fill a value when only a single hour is missing
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = format_ld_dt_string(dt_o)
            data = aq_dict[dt_s]
            previous = aq_dict[format_ld_dt_string(dt_o - timedelta(hours=1))]
            following = aq_dict[format_ld_dt_string(dt_o + timedelta(hours=1))]
            for column in range(len(data)):
                # Treat missing values and implausibly large readings as gaps
                if data[column] is None or \
                        (column == 1 and data[column] > 200) or \
                        (column == 2 and data[column] > 300):
                    if previous[column] is not None and following[column] is not None:
                        # Both neighbouring hours exist: use their average
                        data[column] = (previous[column] + following[column]) / 2
                        count += 1
                    else:
                        data[column] = None
        print("Filled data in ", aq_name, ": ", count, sep='')

        # Write into csv, rows in chronological order
        with open("../data_ld_m/aq/" + aq_name + ".csv", "w",
                  newline='') as file:
            writer = csv.writer(file, delimiter=',')
            for dt_s in sorted(aq_dict.keys(),
                               key=lambda s: datetime.strptime(s, format_string)):
                dt_s_m = datetime.strptime(
                    dt_s, format_string).strftime(format_string_m)
                writer.writerow([dt_s_m] + aq_dict[dt_s])
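

# `tools.per_delta` is not defined in these snippets. The loops above assume an
# hourly-step generator along these lines (a minimal sketch; whether the end bound
# is inclusive depends on the real helper):
def per_delta_sketch(start, end, delta):
    """Yield datetime objects from `start` to `end` in steps of `delta`."""
    current = start
    while current <= end:
        yield current
        current += delta
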
def csv_bj_fill():
    aq_location, aq_dicts = bj_raw_fetch.load_aq_original()
    for aq_name in aq_location:
        aq_dict = aq_dicts[aq_name]
        start_dt_o, end_dt_o = datetime.strptime(list(aq_dict.keys())[0], format_string_m), \
                               datetime.strptime(list(aq_dict.keys())[-1], format_string_m)

        # First, insert an all-None row for every missing timestamp
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = dt_o.strftime(format_string_m)
            if dt_s not in aq_dict:
                aq_dict[dt_s] = [None] * 6

        start_dt_o += timedelta(hours=1)
        end_dt_o -= timedelta(hours=1)
        count = 0
        # Then fill a value when only a single hour is missing
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = dt_o.strftime(format_string_m)
            data = aq_dict[dt_s]
            previous = aq_dict[(dt_o -
                                timedelta(hours=1)).strftime(format_string_m)]
            following = aq_dict[(dt_o +
                                 timedelta(hours=1)).strftime(format_string_m)]
            for column in range(len(data)):
                # Fill a missing value only when both neighbouring hours are present
                if data[column] is None and \
                        previous[column] is not None and following[column] is not None:
                    data[column] = (previous[column] + following[column]) / 2
                    count += 1
        print("Filled data in ", aq_name, ": ", count, sep='')

        # Write into csv, rows in chronological order
        with open("../data_m/aq/" + aq_name + ".csv", "w", newline='') as file:
            writer = csv.writer(file, delimiter=',')
            for dt_s in sorted(aq_dict.keys(),
                               key=lambda s: datetime.strptime(s, format_string_m)):
                writer.writerow([dt_s] + aq_dict[dt_s])
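
Both functions fill a gap only when it is a single missing hour flanked by two valid readings. A self-contained sketch of that rule on a plain list (the function name and example values are illustrative, not part of the original module):

def fill_single_gaps(values):
    """Replace a lone None with the mean of its two neighbours; longer gaps stay None."""
    filled = list(values)
    for i in range(1, len(filled) - 1):
        if filled[i] is None and filled[i - 1] is not None and filled[i + 1] is not None:
            filled[i] = (filled[i - 1] + filled[i + 1]) / 2
    return filled

# fill_single_gaps([1.0, None, 3.0, None, None, 6.0]) -> [1.0, 2.0, 3.0, None, None, 6.0]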
Example #3
def get_time_string(start_time_s, end_time_s, time_delta=timedelta(hours=1)):
    time_string_array = []
    start_time = datetime.strptime(start_time_s, format_string)
    end_time = datetime.strptime(end_time_s, format_string)
    for time in per_delta(start_time, end_time, time_delta):
        time_string_array.append(time.strftime(format_string))
    return time_string_array
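
Assuming `format_string` is an hourly timestamp pattern such as "%Y-%m-%d %H:%M:%S" (the constant itself is defined elsewhere in the module), a call looks roughly like the following; whether the end timestamp is included depends on how `per_delta` treats the boundary:

# get_time_string("2018-04-01 00:00:00", "2018-04-01 02:00:00")
# -> ['2018-04-01 00:00:00', '2018-04-01 01:00:00', '2018-04-01 02:00:00']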
Example #4
def export_data(read_start_string, read_end_string, export_start_string=None,
                export_end_string=None, use_fill=True):
    start_string, end_string = read_start_string, read_end_string
    global aq_location, grid_location, aq_dicts, grid_dicts
    aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all(start_string, end_string)
    if use_fill:
        aq_dicts = ld_raw_fetch.load_filled_dicts(start_string, end_string)

    if export_start_string is not None:
        start_string, end_string = export_start_string, export_end_string
    h5_file = h5py.File("../data_ld/tradition_export/traditional_ld_{}_{}.h5".format(start_string, end_string), "w")
    print("\nFetching data to export...")
    for aq_name in aq_location.keys():
        start_datetime, end_datetime = datetime.strptime(start_string, format_string_2), \
                                       datetime.strptime(end_string, format_string_2)

        last_valid_dt_object = None
        data_to_write = []
        for dt_object_day in per_delta(start_datetime, end_datetime, timedelta(days=1)):
            have_valid = False
            data_matrix = []

            for dt_object in per_delta(dt_object_day - timedelta(hours=23), dt_object_day, timedelta(hours=1)):
                try:
                    row = []
                    dt_string = dt_object.strftime(format_string)

                    # Time features: unix timestamp, weekday index, weekend flag (1 = Sat/Sun)
                    row += [dt_object.timestamp(),
                            dt_object.weekday(),
                            0 if dt_object.weekday() in range(5) else 1]
                    # Air-quality readings for this station, then the nearest grid's weather data
                    row += aq_dicts[aq_name][dt_string]
                    nearest_grid = get_nearest(aq_name)
                    row += grid_dicts[nearest_grid][dt_string]

                    # (disabled experiment: additionally append readings from the most
                    #  strongly correlated neighbouring station, weighted by an affect factor)

                    data_matrix.append(row)
                    have_valid = True

                except KeyError:
                    have_valid = False
                    break
            if have_valid:
                last_valid_dt_object = dt_object_day
                data_to_write = data_matrix
        if last_valid_dt_object is not None:
            print("{} last valid data - {}".format(aq_name, last_valid_dt_object.strftime(format_string_2)))
            h5_file.create_dataset(aq_name, data=np.asarray(data_to_write))
        else:
            print("{} has no valid data".format(aq_name))
    h5_file.flush()
    h5_file.close()
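

# Illustrative read-back of the file written by export_data above. The path and the
# per-station dataset layout come from the write calls; the function name is an
# assumption (h5py is already imported for the writer).
def read_traditional_ld_export(start_string, end_string):
    path = "../data_ld/tradition_export/traditional_ld_{}_{}.h5".format(
        start_string, end_string)
    with h5py.File(path, "r") as h5_file:
        # One dataset per station: the 24 hourly feature rows of its last valid day
        return {aq_name: h5_file[aq_name][()] for aq_name in h5_file.keys()}
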
def fill_api_data(city, data_type, start_str, end_str, fill_range=3):
    directory = "data_{}_api/{}/{}_{}".format(city, data_type, start_str, end_str)
    location = location_dict[city][data_type]
    data_dicts = dict()
    errors = []

    column_start = {"aq": 1, "meo": 2}[data_type]

    start_obj, end_obj = datetime.strptime(start_str, format_string[1]), \
                         datetime.strptime(end_str, format_string[1])

    modified_directory = "data_{}_api_m/{}/{}_{}".format(city, data_type, start_str, end_str)
    # Start from an empty output directory
    if os.path.exists(modified_directory):
        shutil.rmtree(modified_directory)
    os.makedirs(modified_directory)

    for location_name in location.keys():
        filled_count = 0

        data_dict = dict()
        with open("{}/{}.csv".format(directory, location_name), "r") as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            for row in reader:
                if data_type == "aq":
                    data_dict[row[0]] = list(
                        map(float_m, row[column_start:data_column_scope[data_type][city] + column_start]))
                elif data_type == "meo":
                    data_dict[row[0]] = list(
                        map(float_zero, row[column_start:data_column_scope[data_type][city] + column_start]))

            # Fill missing timestamps with None rows
            for dt_obj in tools.per_delta(start_obj, end_obj, timedelta(hours=1)):
                dt_str = dt_obj.strftime(format_string[0])
                if dt_str not in data_dict:
                    data_dict[dt_str] = [None] * data_column_scope[data_type][city]

            # Fill data if possible
            dt_obj = start_obj
            while dt_obj < end_obj:
                dt_obj += timedelta(hours=1)
                dt_str = dt_obj.strftime(format_string[0])
                current_data = data_dict[dt_str]
                for column in range(data_column_scope[data_type][city]):
                    try:
                        if current_data[column] is None:
                            # Found a missing value: measure the length of the gap
                            count = 1
                            while data_dict[(dt_obj + timedelta(hours=count)).strftime(
                                    format_string[0])][column] is None:
                                count += 1
                            if count > fill_range:
                                raise KeyError("Too much data is lost.")
                            start_value = data_dict[(dt_obj - timedelta(hours=1)).strftime(
                                format_string[0])][column]
                            if start_value is None:
                                raise KeyError("Data is empty in the first row.")
                            end_value = data_dict[(dt_obj + timedelta(hours=count)).strftime(
                                format_string[0])][column]
                            # Interpolate linearly across the gap
                            gradient = (end_value - start_value) / (count + 1)
                            for i in range(count):
                                data_dict[(dt_obj + timedelta(hours=i)).strftime(
                                    format_string[0])][column] = start_value + (i + 1) * gradient
                                filled_count += 1
                                filled_count += 1
                    except KeyError as e:
                        errors.append(e)
                        continue
        data_dicts[location_name] = data_dict

        sorted_data_matrix = sorted(data_dict.items(), key=operator.itemgetter(0))
        with open("{}/{}.csv".format(modified_directory, location_name), "w", newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            for dt_str, data in sorted_data_matrix:
                writer.writerow([dt_str] + data)
            csv_file.flush()
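
The gap-filling rule above is a linear interpolation across runs of up to fill_range consecutive missing hours, anchored on the last known value before the run and the first known value after it. A self-contained sketch of the same rule on a plain list (the function name and example values are illustrative, not from the original module):

def interpolate_gaps(values, fill_range=3):
    """Linearly fill runs of None no longer than fill_range; leave longer runs untouched."""
    filled = list(values)
    i = 0
    while i < len(filled):
        if filled[i] is None:
            j = i
            while j < len(filled) and filled[j] is None:
                j += 1  # j now points at the first known value after the run (or the end)
            run = j - i
            if (run <= fill_range and i > 0 and j < len(filled)
                    and filled[i - 1] is not None):
                gradient = (filled[j] - filled[i - 1]) / (run + 1)
                for k in range(run):
                    filled[i + k] = filled[i - 1] + (k + 1) * gradient
            i = j
        else:
            i += 1
    return filled

# interpolate_gaps([10.0, None, None, 16.0]) -> [10.0, 12.0, 14.0, 16.0]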
Example #6
        # (truncated snippet: the lines above this point belonged to a progress bar
        #  set-up call; a plausible reconstruction using the `progressbar` package is)
        bar = ProgressBar(widgets=[Bar('=', '[', ']'), ' ',
                                   Percentage()])

        valid_count = 0

        # Exporting data from start to end
        predict_matrix = []
        dt_int_array = []
        for dt_object_day in per_delta(start_datetime, end_datetime,
                                       timedelta(hours=24)):
            for dt_object in per_delta(dt_object_day,
                                       dt_object_day + timedelta(hours=2),
                                       timedelta(hours=1)):
                aggregate += 1
                bar.update(aggregate)
                dt_string = dt_object.strftime(format_string)

                # Fetch history and prediction data, check data validation in the same time
                predict = check_valid(aq_name, dt_object)
                if predict is None:
                    continue

                # Append this hour's data into per-day data
                predict_matrix.append(predict)
                dt_int_array.append(int(ti.mktime(dt_object.timetuple())))
        valid_count = 0
        near_grids, grid_coor_array = get_grids(aq_name, grid_edge_length)

        # Validate the near grid matrix algorithm
        # plt.figure()
        # plt.title(aq_name)
        # plt.plot(aq_location[aq_name][0], aq_location[aq_name][1], '.')
        # plt.plot(grid_coor_array[:, 0], grid_coor_array[:, 1], '.')
        # plt.show()

        grid_matrix = []
        history_matrix = []
        predict_matrix = []
        dt_int_array = []
        fake_forecast_matrix = []
        for dt_object in per_delta(start_datetime, end_datetime, timedelta(hours=1)):
            aggregate += 1
            bar.update(aggregate)
            dt_string = dt_object.strftime(format_string)

            # Fetch history and prediction data, check data validation in the same time
            aq_matrix, predict, near_grid_data, fake_forecast_data = check_valid(aq_name, dt_object, time_span)
            if aq_matrix is None:
                continue

            grid_matrix.append(near_grid_data)
            history_matrix.append(aq_matrix)
            predict_matrix.append(predict)
            dt_int_array.append(dt_object.timestamp())
            fake_forecast_matrix.append(fake_forecast_data)
            valid_count += 1
def export_data(city, read_start_string, read_end_string, export_start_string,
                export_end_string, use_fill):
    start_string, end_string = read_start_string, read_end_string
    global aq_location, grid_location, grid_dicts, aq_dicts, forecast_directory, export_directory
    forecast_directory = forecast_directory_dict[city]
    export_directory = export_directory_dict[city]
    if city == "bj":
        aq_location, grid_location, aq_dicts, grid_dicts = bj_raw_fetch.load_all(
            start_string, end_string)
        if use_fill:
            aq_dicts = bj_raw_fetch.load_filled_dicts(start_string, end_string)
    elif city == "ld":
        aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all(
            start_string, end_string)
        if use_fill:
            aq_dicts = ld_raw_fetch.load_filled_dicts(start_string, end_string)

    if export_start_string is None:
        start_string, end_string = read_start_string, read_end_string
    else:
        start_string, end_string = export_start_string, export_end_string
    start_datetime, end_datetime = datetime.strptime(start_string, format_string_2), \
                                   datetime.strptime(end_string, format_string_2)

    data_dir = export_directory.format(start_string, end_string)

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    print("\nExporting to {}".format(data_dir))

    for aq_name in aq_location.keys():
        # if not aq_name in ["KF1"]:
        #     continue

        valid_count = 0
        near_grids, grid_coor_array = get_grids(aq_name, grid_circ)

        # Exporting data from start to end
        last_valid_dt_object = None
        grid_matrix = history_matrix = dt_int_array = forecast_matrix = None
        for dt_object in per_delta(start_datetime, end_datetime,
                                   timedelta(hours=24)):
            # Fetch history and prediction data, check data validation in the same time
            aq_matrix, near_grid_data, forecast_data, predict = check_valid(
                aq_name, dt_object, near_grids)
            if aq_matrix is None:
                continue

            # Keep only the most recent valid day's data (each pass overwrites the last)
            grid_matrix = [near_grid_data]
            history_matrix = [aq_matrix]
            dt_int_array = [dt_object.timestamp()]
            forecast_matrix = [forecast_data]
            valid_count += 1

            last_valid_dt_object = dt_object

        if last_valid_dt_object is not None:
            h5_file = h5py.File("{}/{}.h5".format(data_dir, aq_name), "w")
            h5_file.create_dataset("grid", data=np.asarray(grid_matrix))
            h5_file.create_dataset("history", data=np.asarray(history_matrix))
            h5_file.create_dataset("timestep", data=np.asarray(dt_int_array))
            h5_file.create_dataset("weather_forecast",
                                   data=np.asarray(forecast_matrix))
            h5_file.flush()
            h5_file.close()
            print("{} - Have data, last valid {}".format(
                aq_name, last_valid_dt_object.strftime(format_string_2)))
        else:
            print("{} - No valid data".format(aq_name))
Example #9
def export_data(city, read_start_string, read_end_string, export_start_string,
                export_end_string, use_fill, use_history, export_train):
    start_string, end_string = read_start_string, read_end_string
    global ci
    ci = city
    global aq_location, grid_location, aq_dicts, grid_dicts, df
    if use_history:
        if city == "ld":
            aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all_history(
            )
            df = load_data.load_directory_data(
                load_data.history_data_directory[city]["aq"],
                load_data.data_header_dict[city]["aq"],
                drop=["no2"])
        elif city == "bj":
            aq_location, grid_location, aq_dicts, grid_dicts = bj_raw_fetch.load_all_history(
            )
            df = load_data.load_directory_data(
                load_data.history_data_directory[city]["aq"],
                load_data.data_header_dict[city]["aq"])

    else:
        if city == "ld":
            aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all(
                start_string, end_string)
            df = load_data.load_directory_data(
                [
                    load_data.filled_data_directory[city],
                    load_data.history_data_directory[city]["aq"]
                ],
                load_data.data_header_dict[city]["aq"],
                drop=["no2"])
        elif city == "bj":
            aq_location, grid_location, aq_dicts, grid_dicts = bj_raw_fetch.load_all(
                start_string, end_string)
            df = load_data.load_directory_data([
                load_data.filled_data_directory[city],
                load_data.history_data_directory[city]["aq"]
            ], load_data.data_header_dict[city]["aq"])

        if use_fill:
            if city == "ld":
                aq_dicts = ld_raw_fetch.load_filled_dicts(
                    start_string, end_string)
            elif city == "bj":
                aq_dicts = bj_raw_fetch.load_filled_dicts(
                    start_string, end_string)
    global export_predict
    export_predict = export_train

    if export_start_string is not None:
        start_string, end_string = export_start_string, export_end_string
    start_datetime, end_datetime = datetime.strptime(start_string, format_string_2), \
                                   datetime.strptime(end_string, format_string_2)
    diff = end_datetime - start_datetime
    days, seconds = diff.days, diff.seconds
    delta_time = int(days * 24 + seconds // 3600)
    if export_train:
        delta_time = int(delta_time / 24)

    directory = ""
    if export_train:
        if city == "ld":
            directory = "../data_ld/tradition_train/{}_{}".format(
                start_string, end_string)
        elif city == "bj":
            directory = "../data/tradition_train/{}_{}".format(
                start_string, end_string)
    else:
        if city == "ld":
            directory = "../data_ld/tradition_predict/{}_{}".format(
                start_string, end_string)
        elif city == "bj":
            directory = "../data/tradition_predict/{}_{}".format(
                start_string, end_string)
    if not os.path.exists(directory):
        os.makedirs(directory)

    print("\nExporting to {}".format(directory))
    # out_file = open("out{}_{}.txt".format(start_string, end_string), "w")
    for aq_name in aq_location.keys():
        # if aq_name not in ["KF1"]:
        #     continue
        timestamp_matrix, history_aq, history_meo, forecast, predict_aq, statistic = [], [], [], [], [], []
        if export_train:
            aggregate = 0
            valid = 0

            for dt_object in per_delta(start_datetime, end_datetime,
                                       timedelta(hours=24)):
                aggregate += 1
                if aggregate % 10 == 0:
                    print("\t{} exported {:.2f}%".format(
                        aq_name, 100 * aggregate / delta_time))

                history_aq_matrix, history_meo_matrix, forecast_matrix, predict_matrix, \
                weekday, weekend, timestamp, statistic_matrix = check_valid(aq_name, dt_object)
                if history_aq_matrix is None:
                    continue

                timestamp_matrix.append([timestamp, weekday, weekend])
                history_aq.append(history_aq_matrix)
                history_meo.append(history_meo_matrix)
                forecast.append(forecast_matrix)
                predict_aq.append(predict_matrix)
                statistic.append(statistic_matrix)
                valid += 1

            h5_file = h5py.File("{}/{}.h5".format(directory, aq_name), "w")
            h5_file.create_dataset("timestamp",
                                   data=np.array(timestamp_matrix))
            h5_file.create_dataset("history_aq", data=np.array(history_aq))
            h5_file.create_dataset("history_meo", data=np.array(history_meo))
            h5_file.create_dataset("forecast", data=np.array(forecast))
            h5_file.create_dataset("predict_aq", data=np.array(predict_aq))
            h5_file.create_dataset("statistic", data=np.array(statistic))
            h5_file.flush()
            h5_file.close()
            print("{} finished, valid {}".format(aq_name, valid))
            sleep(0.1)
        else:
            last_valid_dt = None
            for dt_object in per_delta(start_datetime, end_datetime,
                                       timedelta(hours=24)):
                history_aq_matrix, history_meo_matrix, forecast_matrix, predict_matrix, \
                weekday, weekend, timestamp, statistic_matrix = check_valid(aq_name, dt_object)
                if history_aq_matrix is None:
                    continue

                timestamp_matrix = [[timestamp, weekday, weekend]]
                history_aq = [history_aq_matrix]
                history_meo = [history_meo_matrix]
                forecast = [forecast_matrix]
                predict_aq = [predict_matrix]
                statistic = [statistic_matrix]
                last_valid_dt = dt_object
            if last_valid_dt is not None:
                h5_file = h5py.File("{}/{}.h5".format(directory, aq_name), "w")
                h5_file.create_dataset("timestamp",
                                       data=np.array(timestamp_matrix))
                h5_file.create_dataset("history_aq", data=np.array(history_aq))
                h5_file.create_dataset("history_meo",
                                       data=np.array(history_meo))
                h5_file.create_dataset("forecast", data=np.array(forecast))
                h5_file.create_dataset("predict_aq", data=np.array(predict_aq))
                h5_file.create_dataset("statistic", data=np.array(statistic))
                h5_file.flush()
                h5_file.close()
                print("{} last valid {}".format(
                    aq_name, last_valid_dt.strftime(format_string_2)))
            else:
                print("{} no valid data".format(aq_name))