def make_plot(veg_ys, precip_ys, output_dir):

        # set up
        lags = 9
        correlations = []

        # make a new df to ensure NaN veg values are explicit
        df_ = pd.DataFrame()
        df_['precip'] = precip_ys
        df_['offset50'] = veg_ys

        # create fig
        fig, axs = plt.subplots(3,
                                3,
                                sharex='col',
                                sharey='row',
                                figsize=(8, 8))

        # loop through offsets
        for lag in range(0, lags):

            # select the relevant Axes object
            ax = axs.flat[lag]

            # format this subplot
            ax.set_title(f'$t-{lag}$')
            ax.grid(False)

            # plot data
            lagged_data = df_['offset50'].shift(-lag)
            corr = precip_ys.corr(lagged_data)
            correlations.append(round(corr, 4))
            sns.regplot(x=precip_ys,
                        y=lagged_data,
                        label=f'$r={corr:.2f}$',
                        ax=ax)

            # format axis label
            if lag < 6:
                ax.set_xlabel('')
            if lag % 3 != 0:
                ax.set_ylabel('')

            ax.legend()

        plt.tight_layout()

        # save the plot
        output_filename = veg_ys.name + '-scatterplot-matrix.png'
        plt.savefig(os.path.join(output_dir, output_filename), dpi=DPI)
        plt.close(fig)

        # write out correlations as a function of lag
        correlations_dict = {veg_ys.name + '_lagged_correlation': correlations}
        write_to_json(os.path.join(output_dir, 'lagged_correlations.json'),
                      correlations_dict)
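
# Illustrative usage sketch (not part of the original code), assuming the
# module-level imports (os, pandas, seaborn, matplotlib) and the helpers that
# make_plot relies on (DPI, write_to_json) are available. The synthetic data
# and the demo function name are hypothetical.
def _example_lagged_scatter(output_dir='.', plot_fn=make_plot):
    # early-bind the make_plot defined directly above via a default argument,
    # since the name is reused by a later definition in this file
    import numpy as np
    import pandas as pd

    # short synthetic monthly dataset in which vegetation lags precipitation
    # by two time steps
    dates = pd.date_range('2015-01-01', periods=48, freq='MS')
    rng = np.random.default_rng(0)
    precip = pd.Series(rng.gamma(2.0, 30.0, size=48), index=dates, name='precip')
    veg = (0.01 * precip.shift(2).fillna(precip.mean())
           + rng.normal(0.0, 0.1, size=48)).rename('offset50')

    # writes 'offset50-scatterplot-matrix.png' and 'lagged_correlations.json'
    # into output_dir
    plot_fn(veg, precip, output_dir)
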
def preprocess_data(input_dir,
                    drop_outliers=True,
                    fill_missing=True,
                    resample=True,
                    smoothing=True,
                    detrend=True,
                    n_smooth=4,
                    period='MS'):
    """
    This function reads and processes data downloaded by GEE. Processing
    can be configured by the function arguments. Processed data is 
    written to csv.

    Parameters
    ----------
    input_dir : str
        Path to the directory created during a GEE download job.
    drop_outliers : bool, optional
        Remove outliers in sub-image time series.
    fill_missing : bool, optional
        Fill missing points in the time series.
    resample : bool, optional
        Resample the time series using linear interpolation.
    smoothing : bool, optional
        Smooth the time series using LOESS smoothing.
    detrend : bool, optional
        Remove seasonal component by subtracting previous year.
    n_smooth : int, optional
        Number of time points to use for the smoothing window size.
    period : str, optional 
        Pandas DateOffset string describing sampling frequency.

    Returns
    ----------
    output_dir : str
        Path to the directory containing the processed data.
    dfs : dict
        Dictionary of processed dataframes.
    """

    # put output plots in the results dir
    output_dir = os.path.join(input_dir, 'processed_data')

    # check input file exists
    json_summary_path = os.path.join(input_dir, 'results_summary.json')
    if not os.path.exists(json_summary_path):
        raise FileNotFoundError(
            f'Could not find file "{os.path.abspath(json_summary_path)}".')

    # make output subdir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # read all json files in the directory and produce a dataframe
    print(f'Reading results from "{os.path.abspath(json_summary_path)}"...')

    # read json file to dataframes
    dfs = read_json_to_dataframes(json_summary_path)

    # keep track of time points where data is missing (by default pandas
    # groupby operations, which are used heavily in this module, drop NaNs)
    missing = get_missing_time_points(dfs)
    missing_json = {k: list(v) for k, v in missing.items()}
    write_to_json(os.path.join(output_dir, 'missing_dates.json'), missing_json)

    print('\nPreprocessing data...')
    print('-' * 21)

    # remove outliers from the time series
    if drop_outliers:
        print('- Dropping vegetation outliers...')
        dfs = drop_veg_outliers(dfs, sigmas=3)

    # use the same month in different years to fill gaps
    if fill_missing:
        print('- Filling gaps in sub-image time series...')
        dfs = fill_veg_gaps(dfs, missing)

    # LOESS smoothing on sub-image time series
    if smoothing:
        print('- Smoothing vegetation time series...')
        dfs = smooth_veg_data(dfs, n=n_smooth)

    # store feature vectors before averaging over sub-images
    print('- Saving feature vectors...')
    store_feature_vectors(dfs, output_dir)

    # average over sub-images
    ts_df = make_time_series(dfs)

    # resample the averaged time series using linear interpolation
    if resample:
        print('- Resampling time series...')
        columns = [
            c for c in ts_df.columns if any(
                [s in c for s in ['offset50', 'precipitation', 'temperature']])
        ]
        ts_df = resample_dataframe(ts_df, columns, period=period)

    # save as csv
    ts_filename = os.path.join(output_dir, 'time_series.csv')
    print(f'- Saving time series to "{ts_filename}".')
    ts_df.to_csv(ts_filename, index=False)

    # additionally save resampled & detrended time series
    if detrend:
        print('- Detrending time series...')

        # remove seasonality from sub-image time series
        dfs_detrended = detrend_data(dfs, period=period)

        print(
            '- Smoothing vegetation time series after removing seasonality...')
        dfs_detrended_smooth = smooth_veg_data(dfs_detrended, n=12)

        # combine over sub-images
        ts_df_detrended_smooth = make_time_series(dfs_detrended_smooth)

        # save output
        ts_filename_detrended = os.path.join(output_dir,
                                             'time_series_detrended.csv')
        print(f'- Saving detrended time series to "{ts_filename_detrended}".')
        ts_df_detrended_smooth.to_csv(ts_filename_detrended, index=False)

    return output_dir, dfs  # for now return `dfs` for spatial plot compatibility
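
# Illustrative usage sketch (not part of the original code). The directory
# argument is a placeholder for a GEE download directory that contains
# 'results_summary.json'; the demo function name is hypothetical.
def _example_preprocess(gee_download_dir, preprocess_fn=preprocess_data):
    # early-bind the preprocess_data defined directly above via a default
    # argument, since the name is reused by a later definition in this file
    processed_dir, sub_image_dfs = preprocess_fn(gee_download_dir,
                                                 drop_outliers=True,
                                                 fill_missing=True,
                                                 resample=True,
                                                 smoothing=True,
                                                 detrend=True)
    print(f'Processed data written to "{processed_dir}"')
    return processed_dir, sub_image_dfs
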
def make_plot(df,
              veg_prefix,
              output_dir,
              veg_prefix_b=None,
              smoothing_option='smooth'):

        # handle the case where vegetation and precipitation have mismatched NaNs
        veg_df = df.dropna(subset=[veg_prefix + '_offset50_mean'])

        # get vegetation x values to datetime objects
        veg_xs = get_datetime_xs(veg_df)

        # get vegetation y values
        veg_means = veg_df[veg_prefix + '_offset50_mean']
        veg_std = veg_df[veg_prefix + '_offset50_std']

        # create a figure
        fig, ax = plt.subplots(figsize=(15, 4.5))
        plt.xlabel('Time', fontsize=14)

        # set up veg y axis
        color = 'tab:green'
        ax.set_ylabel(f'{veg_prefix} Offset50', color=color, fontsize=14)
        ax.tick_params(axis='y', labelcolor=color)
        ax.set_ylim([
            veg_means.min() - 1 * veg_std.max(),
            veg_means.max() + 3 * veg_std.max()
        ])

        # plot unsmoothed vegetation means
        ax.plot(veg_xs,
                veg_means,
                label='Unsmoothed',
                linewidth=1,
                color='dimgray',
                linestyle='dotted')

        # add smoothed time series if available
        if any([
                smoothing_option in c and veg_prefix in c
                for c in veg_df.columns
        ]):

            # get smoothed mean, std
            veg_means_smooth = veg_df[veg_prefix + '_offset50_' +
                                      smoothing_option + '_mean']
            veg_stds_smooth = veg_df[veg_prefix + '_offset50_' +
                                     smoothing_option + '_std']

            # plot smoothed vegetation means and std
            ax.plot(veg_xs,
                    veg_means_smooth,
                    marker='o',
                    markersize=7,
                    markeredgecolor=(0.9172, 0.9627, 0.9172),
                    markeredgewidth=2,
                    label='Smoothed',
                    linewidth=2,
                    color='green')

            ax.fill_between(veg_xs,
                            veg_means_smooth - veg_stds_smooth,
                            veg_means_smooth + veg_stds_smooth,
                            facecolor='green',
                            alpha=0.1,
                            label='Std Dev')

        # plot vegetation legend
        plt.legend(loc='upper left')

        # plot precipitation if available
        if 'total_precipitation' in df.columns:
            # handle the case where vegetation and precipitation have mismatched NaNs
            precip_df = df.dropna(subset=['total_precipitation'])
            precip_ys = precip_df.total_precipitation

            # get precipitation x values to datetime objects
            precip_xs = get_datetime_xs(precip_df)

            # duplicate axis for precipitation
            ax2 = ax.twinx()
            color = 'tab:blue'
            ax2.set_ylabel('Precipitation [mm]', color=color, fontsize=14)
            ax2.tick_params(axis='y', labelcolor=color)
            ax2.set_ylim([
                min(precip_ys) - 1 * np.array(precip_ys).std(),
                max(precip_ys) + 2 * np.array(precip_ys).std()
            ])

            # plot precipitation
            ax2.plot(precip_xs,
                     precip_ys,
                     linewidth=2,
                     color=color,
                     alpha=0.75)

            # add veg-precip correlation
            max_corr_smooth, max_corr = get_max_lagged_cor(
                os.path.dirname(output_dir), veg_prefix)
            textstr = f'$r_{{t-{max_corr_smooth[1]}}}={max_corr_smooth[0]:.2f}$ '
            textstr += f'($r_{{t-{max_corr[1]}}}={max_corr[0]:.2f}$ unsmoothed)'

            # old correlation just calculates the 0-lag correlation
            #raw_corr = veg_means.corr(precip_ys)
            #smoothed_corr = veg_means_smooth.corr(precip_ys)
            #textstr = f'$r={smoothed_corr:.2f}$ (${raw_corr:.2f}$ unsmoothed)'
            ax2.text(0.13,
                     0.95,
                     textstr,
                     transform=ax2.transAxes,
                     fontsize=14,
                     verticalalignment='top')

        # plot second vegetation time series if available
        if veg_prefix_b:

            # handle the case where the second vegetation series has its own NaNs
            veg_df_b = df.dropna(subset=[veg_prefix_b + '_offset50_mean'])

            # get vegetation x values to datetime objects
            veg_xs_b = get_datetime_xs(veg_df_b)

            # get vegetation y values
            veg_means_b = veg_df_b[veg_prefix_b + '_offset50_mean']
            #veg_std_b = veg_df[veg_prefix_b+'_offset50_std']
            veg_means_smooth_b = veg_df_b[veg_prefix_b +
                                          '_offset50_smooth_mean']
            veg_stds_smooth_b = veg_df_b[veg_prefix_b + '_offset50_smooth_std']

            # plot secondary time series
            ax3 = ax.twinx()
            ax3.spines["left"].set_position(("axes", -0.08))
            ax3.spines["left"].set_visible(True)
            color = 'tab:purple'
            ax3.set_ylabel(veg_prefix_b + ' Offset50',
                           color=color,
                           fontsize=14)
            ax3.tick_params(axis='y', labelcolor=color)
            ax3.yaxis.tick_left()
            ax3.yaxis.set_label_position('left')
            ax3.set_ylim([
                veg_means.min() - 1 * veg_std.max(),
                veg_means.max() + 3 * veg_std.max()
            ])

            # plot unsmoothed vegetation means
            ax.plot(veg_xs_b,
                    veg_means_b,
                    label='Unsmoothed',
                    linewidth=1,
                    color='indigo',
                    linestyle='dashed',
                    alpha=0.2)

            # plot smoothed vegetation means and std
            ax3.plot(veg_xs_b,
                     veg_means_smooth_b,
                     marker='o',
                     markersize=7,
                     markeredgecolor=(0.8172, 0.7627, 0.9172),
                     markeredgewidth=2,
                     label='Smoothed',
                     linewidth=2,
                     color=color)

            ax3.fill_between(veg_xs_b,
                             veg_means_smooth_b - veg_stds_smooth_b,
                             veg_means_smooth_b + veg_stds_smooth_b,
                             facecolor='tab:purple',
                             alpha=0.1,
                             label='Std Dev')

            # add veg-veg correlation
            vegveg_corr = veg_means.corr(veg_means_b)
            vegveg_corr_smooth = veg_means_smooth.corr(veg_means_smooth_b)
            textstr = f'$r_{{vv}}={vegveg_corr_smooth:.2f}$ (${vegveg_corr:.2f}$ unsmoothed)'
            ax2.text(0.55,
                     0.85,
                     textstr,
                     transform=ax2.transAxes,
                     fontsize=14,
                     verticalalignment='top')

            # update prefix for filename use
            veg_prefix = veg_prefix + '+' + veg_prefix_b

        # add autoregression info
        veg_means.index = veg_df.date
        unsmoothed_ar1, unsmoothed_ar1_se = get_AR1_parameter_estimate(
            veg_means)
        if any(['smooth' in c and veg_prefix in c for c in veg_df.columns]):
            veg_means_smooth.index = veg_df.date
            smoothed_ar1, smoothed_ar1_se = get_AR1_parameter_estimate(
                veg_means_smooth)
        else:
            smoothed_ar1, smoothed_ar1_se = np.nan, np.nan
        ar1_dict = {}
        ar1_dict['AR1'] = {
            'unsmoothed': {
                'param': unsmoothed_ar1,
                'se': unsmoothed_ar1_se
            },
            'smoothed': {
                'param': smoothed_ar1,
                'se': smoothed_ar1_se
            }
        }
        write_to_json(os.path.join(output_dir, veg_prefix + '_stats.json'),
                      ar1_dict)
        textstr = rf'AR$(1)={smoothed_ar1:.2f} \pm {smoothed_ar1_se:.2f}$ (${unsmoothed_ar1:.2f} \pm {unsmoothed_ar1_se:.2f}$ unsmoothed)'
        ax.text(0.55,
                0.95,
                textstr,
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='top')

        # add Kendall tau
        tau, p = get_kendell_tau(veg_means)
        if any(['smooth' in c and veg_prefix in c for c in veg_df.columns]):
            tau_smooth, p_smooth = get_kendell_tau(veg_means_smooth)
        else:
            tau_smooth, p_smooth = np.nan, np.nan

        kendall_tau_dict = {}
        kendall_tau_dict['Kendall_tau'] = {
            'unsmoothed': {
                'tau': tau,
                'p': p
            },
            'smoothed': {
                'tau': tau_smooth,
                'p': p_smooth
            }
        }
        write_to_json(os.path.join(output_dir, veg_prefix + '_stats.json'),
                      kendall_tau_dict)
        textstr = f'$\\tau,~p$-$\\mathrm{{value}}={tau_smooth:.2f}$, ${p_smooth:.2f}$ (${tau:.2f}$, ${p:.2f}$ unsmoothed)'
        ax.text(0.13,
                0.85,
                textstr,
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='top')

        # layout
        sns.set_style('white')
        fig.tight_layout()

        filename_suffix = '_' + smoothing_option

        # save the plot
        output_filename = veg_prefix + '-time-series' + filename_suffix + '.png'
        plt.savefig(os.path.join(output_dir, output_filename), dpi=DPI)
        plt.close(fig)
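
# Illustrative usage sketch (not part of the original code), assuming the
# module-level imports and the helpers used by make_plot (get_datetime_xs,
# get_max_lagged_cor, get_AR1_parameter_estimate, get_kendell_tau,
# write_to_json, DPI) are available. The processed-data path and the 'S2'
# vegetation prefix are placeholders; the csv must contain a 'date' column
# and '<prefix>_offset50_mean'/'_std' columns (plus smoothed variants).
def _example_time_series_plot(processed_dir, veg_prefix='S2'):
    import pandas as pd
    ts_df = pd.read_csv(os.path.join(processed_dir, 'time_series.csv'))
    make_plot(ts_df, veg_prefix, processed_dir)
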
def preprocess_data(
    input_json,
    output_basedir,
    drop_outliers=True,
    fill_missing=True,
    resample=True,
    smoothing=True,
    detrend=True,
    n_smooth=4,
    period="MS",
):
    """
    This function reads and processes data downloaded by GEE. Processing
    can be configured by the function arguments. Processed data is
    written to csv.

    Parameters
    ----------
    input_json : dict
       JSON data created during a GEE download job.
    output_basedir : str
        Directory where the time-series csv files will be written.
    drop_outliers : bool, optional
        Remove outliers in sub-image time series.
    fill_missing : bool, optional
        Fill missing points in the time series.
    resample : bool, optional
        Resample the time series using linear interpolation.
    smoothing : bool, optional
        Smooth the time series using LOESS smoothing.
    detrend : bool, optional
        Remove seasonal component by subtracting previous year.
    n_smooth : int, optional
        Number of time points to use for the smoothing window size.
    period : str, optional
        Pandas DateOffset string describing sampling frequency.

    Returns
    ----------
    output_dir : str
        Path to the directory containing the processed data.
    dfs : dict
        Dictionary of dataframes.
    """

    # put output plots in the results dir
    output_dir = os.path.join(output_basedir, "processed_data")

    # make output subdir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # read dict from json file to dataframes
    dfs = read_json_to_dataframes(input_json)

    # keep track of time points where data is missing (by default pandas
    # groupby operations, which are used heavily in this module, drop NaNs)
    missing = get_missing_time_points(dfs)
    missing_json = {k: list(v) for k, v in missing.items()}
    write_to_json(os.path.join(output_dir, "missing_dates.json"), missing_json)

    print("\nPreprocessing data...")
    print("-" * 21)

    # remove outliers from the time series
    if drop_outliers:
        print("- Dropping vegetation outliers...")
        dfs = drop_veg_outliers(dfs, sigmas=3)

    # use the same month in different years to fill gaps
    if fill_missing:
        print("- Fill gaps in sub-image time series...")
        dfs = fill_veg_gaps(dfs, missing)

    # LOESS smoothing on sub-image time series
    if smoothing:
        print("- Smoothing vegetation time series...")
        dfs = smooth_veg_data(dfs, n=n_smooth)

    # store feature vectors before averaging over sub-images
    print("- Saving feature vectors...")
    store_feature_vectors(dfs, output_dir)

    # average over sub-images
    ts_list = make_time_series(dfs)
    ts_df = ts_list[0]
    if len(ts_list) > 1:
        ts_historic = ts_list[1]
    else:
        ts_historic = pd.DataFrame()

    # resample the averaged time series using linear interpolation
    if resample:
        print("- Resampling time series...")
        columns = [
            c for c in ts_df.columns if any(
                [s in c for s in ["offset50", "precipitation", "temperature"]])
        ]
        ts_df = resample_dataframe(ts_df, columns, period=period)

    #  save as csv
    ts_filename = os.path.join(output_dir, "time_series.csv")
    print(f'- Saving time series to "{ts_filename}".')
    ts_df.to_csv(ts_filename, index=False)
    if not ts_historic.empty:
        ts_filename = os.path.join(output_dir, "time_series_historic.csv")
        print(f'- Saving time series to "{ts_filename}".')
        ts_historic.to_csv(ts_filename, index=False)

    # additionally save resampled & detrended time series
    # this detrending option (one-year seasonality subtraction) only works on
    # monthly data that spans at least two years
    if detrend and ts_df.shape[0] > 24 and period == 'MS':
        print("- Detrending time series...")

        # remove seasonality from sub-image time series
        dfs_detrended = detrend_data(dfs, period=period)

        print(
            "- Smoothing vegetation time series after removing seasonality...")
        dfs_detrended_smooth = smooth_veg_data(dfs_detrended, n=12)

        # combine over sub-images
        ts_df_detrended_smooth = make_time_series(dfs_detrended_smooth)[0]

        # save output
        ts_filename_detrended = os.path.join(output_dir,
                                             "time_series_detrended.csv")
        print(f'- Saving detrended time series to "{ts_filename_detrended}".')
        ts_df_detrended_smooth.to_csv(ts_filename_detrended, index=False)

    return output_dir, dfs  #  for now return `dfs` for spatial plot compatibility
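
# Illustrative usage sketch (not part of the original code). This variant of
# preprocess_data takes the GEE results summary as an already-loaded dict;
# the json path, output directory and demo function name are placeholders.
def _example_preprocess_from_json(json_path, output_basedir):
    import json
    with open(json_path) as f:
        results_summary = json.load(f)
    processed_dir, sub_image_dfs = preprocess_data(results_summary, output_basedir)
    return processed_dir, sub_image_dfs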