Example #1
def main():
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********

    input_file = config[ClassificationParams.input_file]
    splits_file = config[ClassificationParams.split_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    run_name = config[TrainingParams.config_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    disp_images = config[ClassificationParams.show_imgs]
    generate_images = config[ClassificationParams.generate_images]
    metrics_user = config[ClassificationParams.metrics]
    filter_stations = config[LocalTrainingParams.stations]

    # Verifies that the model weights file and the input file were provided
    assert len(model_weights_file) > 0
    assert len(input_file) > 0

    print(F"Working with: {model_weights_file} \n and \n {input_file}")

    data = pd.read_csv(input_file, index_col=0, parse_dates=True)

    all_data_cols = data.columns
    date_columns = [
        x for x in all_data_cols if (x.find('week') != -1) or (
            x.find('hour') != -1) or (x.find('year') != -1)
    ]
    stations_columns = [
        x for x in all_data_cols
        if (x.find('h') == -1) and (x not in date_columns)
    ]
    meteo_columns = [
        x for x in all_data_cols if (x.find('h') != -1) and (
            x not in date_columns) and (x not in stations_columns)
    ]
    desired_columns = meteo_columns + filter_stations + date_columns

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data[desired_columns], date_hv], axis=1)
    print("Done!")

    # print("Filtering data to hours 9 to 20...")
    filtered_data = data.between_time("9:00", "20:00")
    # filtered_data = data
    datetimes_str = filtered_data.index.values
    # print("Done!")

    print(F'Normalizing and filtering data....')
    parameters_folder = join(dirname(output_folder), 'Training', 'Parameters')
    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns = \
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours, output_folder=parameters_folder,
                               run_name=run_name, read_from_file=True)

    # ********* Building the X (inputs) and Y (targets) dataframes ********
    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # ********* Filling nan values in the stations with the mean values of all the 'available' stations ********
    # for cur_station in stations_columns:
    #     X_df[cur_station] = X_df[cur_station].fillna(X_df['MEAN'])
    #     Y_df[cur_station] = Y_df[cur_station].fillna(data_norm_df_final.loc[datetimes_str[y_times_idx]]['MEAN'])

    # Uses only the meteorological and date columns as inputs; the station columns are the targets
    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    Y = Y_df.values

    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    model = select_1d_model(config)

    # *********** Reads the splits information ***********
    print('Reading splits info....')
    if splits_file != '':  # Reads the splits information from file
        split_info = pd.read_csv(splits_file, dtype=np.int16)
    else:
        # No splits file provided: all the examples are treated as training examples
        split_info = pd.DataFrame({'train_ids': np.arange(Y.shape[0])})
        split_info['validation_ids'] = np.nan
        split_info['test_id'] = np.nan

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # ************ Makes NN Prediction ********
    print('Making prediction ....')
    output_nn_all = model.predict(X, verbose=1)

    # ************ Plots some example predictions ********
    number_of_examples = 10
    if generate_images:
        create_folder(output_folder)
        create_folder(output_imgs_folder)
        img_viz = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=disp_images)

        Y[Y == -1] = np.nan  # So that the -1 (missing) values are not shown
        for c_example in range(number_of_examples):
            hours_to_plot = 24 * 3  # Plots three days of hourly data
            start_idx = np.random.randint(
                0, X.shape[0] - hours_to_plot - forecasted_hours)
            end_idx = start_idx + hours_to_plot
            for idx_station, cur_station in enumerate(filter_stations):
                img_viz.plot_1d_data_np(
                    datetimes_str[y_times_idx][start_idx:end_idx], [
                        Y[start_idx:end_idx, idx_station],
                        output_nn_all[start_idx:end_idx, idx_station]
                    ],
                    title=F'{cur_station}',
                    labels=['GT', 'NN'],
                    file_name_prefix=F'{cur_station}_{c_example}')

    # ************ Recovering original units********
    print('Recovering original units....')
    nn_df = pd.DataFrame(output_nn_all,
                         columns=stations_columns,
                         index=filtered_data.index[y_times_idx])
    nn_original_units = deNormalize(nn_df)
    Y_original = deNormalize(Y_df)

    # ************ Computing metrics********
    print('Computing metrics and saving predictions....')
    compute_metrics(Y_original, nn_original_units, metrics_user, split_info,
                    output_file_name, stations_columns)
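
The column partition above relies on substring matching only: names containing 'week', 'hour', or 'year' are date columns, names without an 'h' are station columns, and the remaining 'h' columns are meteorological. A minimal sketch of the same rules, using hypothetical column names (not the real merged CSV):

# Hypothetical column names, for illustration only
cols = ['MER', 'PED', 'T2_h0', 'RAINC_h1', 'week_sin', 'hour_cos', 'year']
date_columns = [c for c in cols if ('week' in c) or ('hour' in c) or ('year' in c)]
stations_columns = [c for c in cols if ('h' not in c) and (c not in date_columns)]
meteo_columns = [c for c in cols if ('h' in c) and (c not in date_columns)
                 and (c not in stations_columns)]
print(date_columns)      # ['week_sin', 'hour_cos', 'year']
print(stations_columns)  # ['MER', 'PED']
print(meteo_columns)     # ['T2_h0', 'RAINC_h1']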
Example #2
def compute_consecutive_days_difference():
    """
    Computes the difference between consecutive days on the hycom files.
    :param proc_id:
    :return:
    """
    config = get_preproc_config()
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    # Iterate current year
    for c_year in YEARS:
        # Iterate current month
        diff_per_field = {field: [] for field in fields}
        days_with_data = []
        for c_month in MONTHS:
            # Reading the data
            try:
                days_of_month, days_of_year = get_days_from_month(c_month)
                # Reading hycom files
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
            except Exception as e:
                print(F"Failed to find any file for date {c_year}-{c_month}: {e}")
                continue

            # This loop is written explicitly so it can be run in parallel
            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                print(
                    F"---------- Year {c_year} day: {c_day_of_year} --------------"
                )
                # Makes regular expression of the current desired file
                re_hycom = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom_prev = rF'archv.{c_year}_{(c_day_of_year-1):03d}\S*.a'
                try:
                    # Gets the file indices for the current and the previous day
                    hycom_file_idx = [
                        i for i, file in enumerate(hycom_files)
                        if re.search(re_hycom, file) is not None
                    ][0]
                    hycom_file_idx_prev = [
                        i for i, file in enumerate(hycom_files)
                        if re.search(re_hycom_prev, file) is not None
                    ][0]
                except Exception as e:
                    print(
                        F"ERROR: The file for date {c_year}-{c_month}-{c_day_of_month} (or the previous day) does not exist: {e}"
                    )
                    continue

                days_with_data.append(c_day_of_year)
                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)
                model_state_np_fields_prev = read_hycom_fields(
                    hycom_paths[hycom_file_idx_prev], fields, layers=layers)
                # Computes, per field, the absolute value of the (nan-aware)
                # mean difference between consecutive days
                for idx_field, c_field_name in enumerate(fields):
                    model_state_np_c_field = model_state_np_fields[
                        c_field_name]
                    model_state_np_c_field_prev = model_state_np_fields_prev[
                        c_field_name]
                    c_diff = np.abs(
                        np.nanmean(model_state_np_c_field_prev -
                                   model_state_np_c_field))
                    diff_per_field[c_field_name].append(c_diff)

        # Plots the differences between consecutive days. For all the fields together.
        img_viz.plot_1d_data_np(
            days_with_data, list(diff_per_field.values()),
            title='Difference between days',
            labels=fields,
            file_name_prefix='HYCOM_Diff_Between_Days',
            wide_ratio=4)
        # Plots the differences between consecutive days. Separated by fields
        for field in diff_per_field.keys():
            img_viz.plot_1d_data_np(
                days_with_data, [diff_per_field[field]],
                title=F'Difference between days {field}',
                labels=[field],
                file_name_prefix=F'HYCOM_Diff_Between_Days_{field}',
                wide_ratio=4)
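
Note that the per-field metric accumulated above is the absolute value of the nan-aware mean difference between two days (opposite-signed changes cancel), not the mean absolute difference. A minimal sketch with made-up arrays:

import numpy as np

# Two made-up 2x2 fields for consecutive days; nan marks missing cells
day_prev = np.array([[1.0, 2.0], [np.nan, 4.0]])
day_cur = np.array([[1.5, 1.5], [np.nan, 5.0]])
c_diff = np.abs(np.nanmean(day_prev - day_cur))
print(c_diff)  # 0.333...: the mean of (-0.5, 0.5, -1.0) is -1/3, then abs
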
def trainModel(config, cur_pollutant, cur_station):
    """Trying to separate things so that tf 'cleans' the memory """

    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]

    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    imgs_folder = join(output_folder, 'imgs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    viz_obj = EOAImageVisualizer(output_folder=imgs_folder, disp_images=False)

    print(
        F"============ Reading data for: {cur_pollutant} -- {cur_station} =========================="
    )
    db_file_name = join(input_folder, constants.merge_output_folder.value,
                        F"{cur_pollutant}_{cur_station}.csv")
    data = pd.read_csv(db_file_name, index_col=0)

    config[ModelParams.INPUT_SIZE] = len(data.columns)
    print(F'Data shape: {data.shape} Data axes {data.axes}')
    print("Done!")

    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes_str = data.index.values
    datetimes = np.array([
        datetime.strptime(x, constants.datetime_format.value)
        for x in datetimes_str
    ])

    scaler = preprocessing.MinMaxScaler()
    scaler = scaler.fit(data)
    data_norm_np = scaler.transform(data)
    data_norm_df = DataFrame(data_norm_np,
                             columns=data.columns,
                             index=data.index)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"\tBuilding X and Y ....")
    accepted_times_idx = []
    y_times_idx = []
    for i, c_datetime in enumerate(datetimes):
        forecasted_datetime = (c_datetime + timedelta(hours=forecasted_hours))
        if forecasted_datetime in datetimes:
            accepted_times_idx.append(i)
            y_times_idx.append(
                np.argwhere(forecasted_datetime == datetimes)[0][0])

    X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][cur_pollutant]
    X = X_df.values
    Y = Y_df.values

    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids
     ] = utilsNN.split_train_validation_and_test(tot_examples,
                                                 val_percentage=val_perc,
                                                 test_percentage=test_perc)

    print("Train examples (total:{}) :{}".format(len(train_ids),
                                                 rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids),
                                                       rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids),
                                                rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{cur_pollutant}_{cur_station}'

    # ******************* Selecting the model **********************
    model = select_1d_model(config)
    plot_model(model,
               to_file=join(output_folder, F'{model_name}.png'),
               show_shapes=True)

    print("Saving split information...")
    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    info_splits = DataFrame({F'Train({len(train_ids)})': train_ids})
    info_splits[F'Validation({len(val_ids)})'] = 0
    info_splits[F'Validation({len(val_ids)})'][0:len(val_ids)] = val_ids
    info_splits[F'Test({len(test_ids)})'] = 0
    info_splits[F'Test({len(test_ids)})'][0:len(test_ids)] = test_ids
    info_splits.to_csv(file_name_splits, index=None)

    print(F"Norm params: {scaler.get_params()}")
    file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)

    print("Getting callbacks ...")

    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(
        model_name=model_name,
        early_stopping_func=F'val_{eval_metrics[0].__name__}',
        weights_folder=weights_folder,
        logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should eventually be factored out, since it changes for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids]
    x_val = X[val_ids, :]
    y_val = Y[val_ids]
    x_test = X[test_ids, :]
    y_test = Y[test_ids]

    # Plots an intermediate sanity check: the current pollutant series against
    # the target it should forecast (forecasted_hours later), over a random window
    size = 24 * 60  # 60 days of hourly data (~two months)
    start = np.random.randint(0, len(data) - size)
    end = start + size
    x_plot = range(len(X_df.iloc[start:end].index.values))
    y_plot = X_df.iloc[start:end][cur_pollutant].values
    yy_plot = Y_df.iloc[start:end].values
    viz_obj.plot_1d_data_np(x_plot, [y_plot, yy_plot],
                            title=F"{cur_pollutant}_{cur_station}",
                            labels=['Current', 'Desired'],
                            wide_ratio=4,
                            file_name_prefix=F"{cur_pollutant}_{cur_station}")

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
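
The X/Y pairing in trainModel keeps only the timestamps that have a matching timestamp forecasted_hours later, so a gap in the series drops both the input row and its target. A minimal sketch of that alignment, with made-up hourly timestamps containing a gap:

import numpy as np
from datetime import datetime, timedelta

# Hypothetical hourly timestamps with a missing 03:00 entry
datetimes = np.array([datetime(2020, 1, 1, h) for h in (0, 1, 2, 4)])
forecasted_hours = 2

accepted_times_idx = []
y_times_idx = []
for i, c_datetime in enumerate(datetimes):
    forecasted_datetime = c_datetime + timedelta(hours=forecasted_hours)
    if forecasted_datetime in datetimes:
        accepted_times_idx.append(i)
        y_times_idx.append(np.argwhere(forecasted_datetime == datetimes)[0][0])

print(accepted_times_idx)  # [0, 2]: 00:00 -> 02:00 and 02:00 -> 04:00
print(y_times_idx)         # [2, 3]
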
def main():
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********

    input_file = config[ClassificationParams.input_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    pollutant = config[LocalTrainingParams.pollutant]

    # ********** Reading and preprocessing data *******
    _all_stations = [
        "ACO", "AJM", "AJU", "ARA", "ATI", "AZC", "BJU", "CAM", "CCA", "CES",
        "CFE", "CHO", "COR", "COY", "CUA", "CUI", "CUT", "DIC", "EAJ", "EDL",
        "FAC", "FAN", "GAM", "HAN", "HGM", "IBM", "IMP", "INN", "IZT", "LAA",
        "LAG", "LLA", "LOM", "LPR", "LVI", "MCM", "MER", "MGH", "MIN", "MON",
        "MPA", "NET", "NEZ", "PED", "PER", "PLA", "POT", "SAG", "SFE", "SHA",
        "SJA", "SNT", "SUR", "TAC", "TAH", "TAX", "TEC", "TLA", "TLI", "TPN",
        "UAX", "UIZ", "UNM", "VAL", "VIF", "XAL", "XCH"
    ]

    # Iterate over the stations
    models_folder = '/data/UNAM/Air_Pollution_Forecast/Data/Training/models'
    data_folder = '/data/UNAM/Air_Pollution_Forecast/Data/MergedDataCSV'
    for c_station in _all_stations:
        try:
            model_weights_file = [
                join(models_folder, x) for x in listdir(models_folder)
                if x.find(c_station) != -1
            ]
            input_file = [
                join(data_folder, x) for x in listdir(data_folder)
                if x.find(c_station) != -1
            ]
            # Selects the proper model file for the current station
            assert len(model_weights_file) > 0
            assert len(input_file) > 0

            print(F"Working with: {model_weights_file} and {input_file}")
            model_weights_file = model_weights_file[0]
            input_file = input_file[0]

            data = pd.read_csv(input_file, index_col=0)

            config[ModelParams.INPUT_SIZE] = len(data.columns)
            print(F'Data shape: {data.shape} Data axes {data.axes}')
            print("Done!")

            # Predicting for the next value after 24hrs (only one)
            print("Normalizing data....")
            datetimes_str = data.index.values
            datetimes = np.array([
                datetime.strptime(x, constants.datetime_format.value)
                for x in datetimes_str
            ])

            scaler = preprocessing.MinMaxScaler()
            scaler = scaler.fit(data)
            data_norm_np = scaler.transform(data)
            data_norm_df = DataFrame(data_norm_np,
                                     columns=data.columns,
                                     index=data.index)
            print(F'Done!')

            # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
            print(F"\tBuilding X and Y ....")
            accepted_times_idx = []
            y_times_idx = []
            for i, c_datetime in enumerate(datetimes):
                forecasted_datetime = (c_datetime +
                                       timedelta(hours=forecasted_hours))
                if forecasted_datetime in datetimes:
                    accepted_times_idx.append(i)
                    y_times_idx.append(
                        np.argwhere(forecasted_datetime == datetimes)[0][0])

            X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
            Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][pollutant]
            X = X_df.values
            Y = Y_df.values

            print(F'X shape: {X.shape} Y shape: {Y.shape}')

            # *********** Chooses the proper model ***********
            print('Reading model ....')
            model = select_1d_model(config)

            # *********** Reads the weights***********
            print('Reading weights ....')
            model.load_weights(model_weights_file)

            create_folder(output_folder)
            create_folder(output_imgs_folder)

            # *********** Reads the metrics to compute **********
            metrics_params = config[ClassificationParams.metrics]
            metrics_dict = {met.name: met.value for met in metrics_params}

            # *********** Makes the NN prediction *********
            t0 = time.time()
            output_nn_all = model.predict(X, verbose=1)

            # Plots the original vs the forecasted series for the first two months
            plot_this_many = 24 * 60  # 60 days of hourly data (~two months)
            x_plot = range(len(Y))
            y_plot = Y
            viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                         disp_images=False)
            viz_obj.plot_1d_data_np(
                x_plot[0:plot_this_many],
                [y_plot[0:plot_this_many], output_nn_all[0:plot_this_many, 0]],
                title=F"{c_station} {pollutant}",
                labels=['Original', 'Forecasted'],
                wide_ratio=4,
                file_name_prefix=F"{pollutant}_{c_station}")

            print(F'\t Done! Elapsed time {time.time() - t0:0.2f} sec')

        except Exception as e:
            print(
                F"---------------------------- Failed {c_station} error: {e} ----------------"
            )
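
The station loop above locates each station's weights file and merged CSV by a substring search over two folders, taking the first match. A minimal sketch of that lookup factored into a helper (find_station_file is a hypothetical name, not part of the original code):

from os import listdir
from os.path import join

def find_station_file(folder, station):
    """Returns the path of the first file in folder whose name contains the station code."""
    matches = [join(folder, x) for x in listdir(folder) if station in x]
    assert len(matches) > 0, F"No file found for station {station} in {folder}"
    return matches[0]

# Usage, e.g.: model_weights_file = find_station_file(models_folder, 'MER')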