def areal_model_eval(location, number=None, EDA_average=False, minyear=1979,
                     maxyear=2020):
    """
    Return data to evaluate an areal model at a given location, area and
    time period.

    Features: time, latitude, longitude, slope of sub-gridscale orography
    (slor), angle of sub-gridscale orography (anor), orography (z), 2m
    dewpoint temperature (d2m), total column water vapour (tcwv) and
    Nino 3.4 index (N34). Target: log-transformed total precipitation (tp).

    Inputs
        location: basin name (str) or [latitude, longitude], list of floats
        number, optional: desired ensemble run, integer
        EDA_average, optional: average the low-resolution ensemble runs,
            boolean
        minyear, maxyear, optional: bounds of the evaluation time period

    Outputs
        xtr: evaluation feature vector, numpy array
        ytr: evaluation output vector, numpy array
    """
    # BUG FIX: the three data sources are mutually exclusive. Originally the
    # `else` paired only with the EDA_average check, so data selected via
    # `number` was silently overwritten by the deterministic download.
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    sliced_da = da.sel(time=slice(minyear, maxyear))

    if isinstance(location, str):
        mask_filepath = find_mask(location)
        masked_da = dd.apply_mask(sliced_da, mask_filepath)
        # BUG FIX: the original immediately overwrote this dataframe with
        # `da.to_dataframe()`, discarding both the mask and the time slice.
        multiindex_df = masked_da.to_dataframe()
        df = multiindex_df.dropna().reset_index()
    else:
        da_location = sliced_da.interp(
            coords={"lat": location[0], "lon": location[1]},
            method="nearest")
        multiindex_df = da_location.to_dataframe()
        df = multiindex_df.dropna().reset_index()

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "lat", "lon", "slor", "anor", "z",
             "d2m", "tcwv", "N34", "tp"]]  # format order

    xtr = df.drop(columns=["tp"]).values
    ytr = df["tp"].values
    return xtr, ytr
def tp_vs(mask_filepath, variable, mask=None, longname=""):
    """
    Scatter plot of total precipitation against a given variable for the
    Upper Indus Basin.

    Inputs
        mask_filepath: path to the masked data file
        variable: column name to plot total precipitation against, string
        mask, optional: mask applied to freshly updated CDS monthly data;
            when None, data is read from mask_filepath instead
        longname, optional: x-axis label, string

    Outputs
        None (shows a matplotlib figure)
    """
    if mask is None:
        df = dd.download_data(mask_filepath)
    else:
        cds_filepath = fd.update_cds_monthly_data()
        da = dd.apply_mask(cds_filepath, mask)
        df = da.to_dataframe().reset_index()

    df = df[["time", "tp", variable]]
    df_var = df.dropna()

    # Plot
    df_var.plot.scatter(x=variable, y="tp", alpha=0.2, c="b")
    plt.title("Upper Indus Basin")
    plt.ylabel("Total precipitation [m/day]")
    plt.xlabel(longname)
    plt.grid(True)
    plt.show()
def eof_correlation(eof_filepath, mask_filepath):
    """
    Correlate basin-mean precipitation with an EOF field and save the
    per-cell correlations and p-values to CSV.

    Inputs
        eof_filepath: path to a dataset containing an `EOF` variable
        mask_filepath: path to the masked precipitation data

    Outputs
        filepath of the saved CSV ("Data/EOF_corr_pval.csv")
    """
    print("processing precipitation")
    precip_da = dd.download_data(mask_filepath, xarray=True)
    basin_tp = precip_da.mean(dim=["latitude", "longitude"]).tp
    basin_tp = basin_tp.assign_coords(time=(basin_tp.time.astype("datetime64")))
    tp_df = basin_tp.to_dataframe()

    print("processing EOF")
    eof_ds = xr.open_dataset(eof_filepath).EOF
    eof = eof_ds.assign_coords(time=(eof_ds.time.astype("datetime64")))
    # One column per (latitude, longitude) cell, indexed by time.
    eof_wide = pd.pivot_table(eof.to_dataframe(), values="EOF",
                              index=["time"],
                              columns=["latitude", "longitude"]).reset_index()
    # Shift timestamps back 12 hours to line up with the precipitation index.
    eof_wide["time"] -= np.timedelta64(12, "h")

    print("combining")
    merged = pd.merge_ordered(tp_df, eof_wide, on="time").dropna()
    corr_df = merged.corrwith(merged["tp"]).to_frame(name="corr")
    corr_df["pvalue"] = pvalue(merged)

    filepath = "Data/EOF_corr_pval.csv"
    corr_df.to_csv(filepath)
    return filepath
def spatial_autocorr(variable, mask_filepath):  # TODO
    """
    Plots spatial autocorrelation of detrended precipitation against three
    reference points (Khyber, Gilgit, Ngari).

    Inputs
        variable: variable name (currently unused — tp is hard-coded below)
        mask_filepath: path to the masked data

    Outputs
        None (shows matplotlib figures)
    """
    df = dd.download_data(mask_filepath)

    # detrend: pivot to (cell x time), transpose so time runs down the rows,
    # remove the linear trend per cell, then correlate cells pairwise.
    # NOTE(review): `detrend` is assumed to return a DataFrame here (corr()
    # is called on its result) — confirm it is not scipy.signal.detrend,
    # which returns a plain ndarray.
    table = pd.pivot_table(df, values="tp",
                           index=["latitude", "longitude"],
                           columns=["time"])
    trans_table = table.T
    detrended_table = detrend(trans_table, axis=0)
    corr_table = detrended_table.corr()
    print(corr_table)

    # Correlation of every cell with three fixed reference locations.
    corr_khyber = corr_table.loc[(34.5, 73.0)]
    corr_gilgit = corr_table.loc[(36.0, 75.0)]
    corr_ngari = corr_table.loc[(33.5, 79.0)]
    corr_list = [corr_khyber, corr_gilgit, corr_ngari]

    for corr in corr_list:
        # Reshape the correlation series back onto a lat/lon grid.
        df_reset = corr.reset_index().droplevel(1, axis=1)
        df_pv = df_reset.pivot(index="latitude", columns="longitude")
        df_pv = df_pv.droplevel(0, axis=1)
        da = xr.DataArray(data=df_pv, name="Correlation")

        # Plot
        plt.figure()
        ax = plt.subplot(projection=ccrs.PlateCarree())
        ax.set_extent([71, 83, 30, 38])
        g = da.plot(
            x="longitude",
            y="latitude",
            add_colorbar=True,
            ax=ax,
            vmin=-1,
            vmax=1,
            cmap="coolwarm",
            cbar_kwargs={"pad": 0.10},
        )
        ax.gridlines(draw_labels=True)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        plt.show()
def averaged_timeseries(mask_filepath, variable="tp",
                        longname="Total precipitation [m/day]"):
    """
    Plot the basin-averaged timeseries of a variable for the Upper Indus
    Basin.

    Inputs
        mask_filepath: path to the masked data
        variable, optional: column to average and plot, string
        longname, optional: y-axis label, string

    Outputs
        None (shows a matplotlib figure)
    """
    df = dd.download_data(mask_filepath)

    # BUG FIX: copy before mutating — the original assigned into a slice of
    # `df` (chained assignment), which raises SettingWithCopyWarning and can
    # silently fail to update the column.
    df_var = df[["time", variable]].copy()
    df_var["time"] = df_var["time"].astype(np.datetime64)
    df_mean = df_var.groupby("time").mean()

    df_mean.plot()
    plt.title("Upper Indus Basin")
    plt.ylabel(longname)
    plt.xlabel("Year")
    plt.grid(True)
    plt.show()
def uib_sample_linreg():
    """ Plots sample timeseries for UIB clusters """
    # Open data
    mask_filepath = "Data/Masks/ERA5_Upper_Indus_mask.nc"
    tp = dd.download_data(mask_filepath, xarray=True)
    tp_da = tp.tp * 1000  # convert from m/day to mm/day

    ## Data: nearest-neighbour samples at one point per cluster
    sample_points = [
        {"lon": 75, "lat": 36},    # Gilgit
        {"lon": 81, "lat": 32},    # Ngari
        {"lon": 73, "lat": 34.5},  # Khyber
    ]
    timeseries = [tp_da.interp(coords=point, method="nearest")
                  for point in sample_points]
    linear_models = [lin_reg(series) for series in timeseries]

    linreg_plot(timeseries, linear_models)
def input_correlation_heatmap():
    """
    Plot a correlation heatmap of the model input variables for the Upper
    Indus Basin, including lagged copies of the climate indices.

    Relies on the module-level `mask_filepath` constant.

    Outputs
        None (shows a matplotlib figure)
    """
    df = dd.download_data(mask_filepath, all_var=True)

    # Create lagged copies of the climate indices.
    # NOTE(review): 393 rows is presumably one year's worth of
    # (grid cell, month) samples — confirm against the download shape.
    for index in ["N34", "NAO", "N4"]:
        df[index + "-1"] = df[index].shift(periods=393)

    df = df.drop(columns=["time"])
    df_clean = df.dropna()
    df_sorted = df_clean.sort_index(axis=1)
    corr = df_sorted.corr()

    sns.set(style="white")
    plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # BUG FIX: np.bool was removed in NumPy 1.24+; use the builtin bool.
    mask = np.triu(np.ones_like(
        corr, dtype=bool))  # generate a mask for the upper triangle
    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        center=0,
        vmin=-1,
        vmax=1,
        fmt="0.2f",
        square=True,
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 5},
        cbar_kws={"shrink": 0.5},
    )
    plt.title("Correlation plot for Upper Indus Basin")
    plt.show()
def areal_model(location, number=None, EDA_average=False, length=3000,
                seed=42):
    """
    Outputs test, validation and training data for total precipitation as a
    function of time, 2m dewpoint temperature, angle of sub-gridscale
    orography, orography, slope of sub-gridscale orography, total column
    water vapour and Nino 3.4 index for a given number of randomly sampled
    data points in a given basin.

    Inputs
        location: specify area to train model
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean
        length, optional: specify number of points to sample, integer
        seed, optional: specify seed, integer

    Outputs
        xtrain: training feature vector, numpy array
        xval: validation feature vector, numpy array
        xtest: testing feature vector, numpy array
        ytrain: training output vector, numpy array
        yval: validation output vector, numpy array
        ytest: testing output vector, numpy array
    """
    # BUG FIX: the three data sources are mutually exclusive. Originally the
    # `else` paired only with the EDA_average check, so data selected via
    # `number` was silently overwritten by the deterministic download.
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    # apply mask and randomly sample `length` points
    mask_filepath = find_mask(location)
    masked_da = dd.apply_mask(da, mask_filepath)
    multiindex_df = masked_da.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()
    df = sa.random_location_and_time_sampler(df_clean, length=length,
                                             seed=seed)

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "lat", "lon", "slor", "anor", "z",
             "d2m", "tcwv", "N34", "tp"]]  # format order

    # Remove last 10% of time for testing
    # NOTE(review): rows exactly at df["time"].max() * 0.9 fall in neither
    # split (> vs <) — confirm this is intentional.
    test_df = df[df["time"] > df["time"].max() * 0.9]
    xtest = test_df.drop(columns=["tp"]).values
    ytest = test_df["tp"].values

    # Remaining 90% split chronologically (no shuffle) into training and
    # validation data
    tr_df = df[df["time"] < df["time"].max() * 0.9]
    xtr = tr_df.drop(columns=["tp"]).values
    ytr = tr_df["tp"].values
    xtrain, xval, ytrain, yval = train_test_split(
        xtr, ytr, test_size=0.30, shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
def point_model(location, number=None, EDA_average=False):
    """
    Outputs test, validation and training data for total precipitation as a
    function of time, 2m dewpoint temperature, total column water vapour and
    Nino 3.4 index for a single point.

    Inputs
        location: basin name (str, a random point is sampled) or
            [latitude, longitude], list of floats
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean

    Outputs
        xtrain: training feature vector, numpy array
        xval: validation feature vector, numpy array
        xtest: testing feature vector, numpy array
        ytrain: training output vector, numpy array
        yval: validation output vector, numpy array
        ytest: testing output vector, numpy array
    """
    # BUG FIX: the three data sources are mutually exclusive. Originally the
    # `else` paired only with the EDA_average check, so data selected via
    # `number` was silently overwritten by the deterministic download.
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    # BUG FIX: `location is str` compared the value against the type object
    # and was always False for a basin-name string, making the random-sample
    # branch unreachable; isinstance was intended.
    if isinstance(location, str):
        multiindex_df = da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df_location = sa.random_location_sampler(df_clean)
        df = df_location.drop(columns=["lat", "lon", "slor", "anor", "z"])
    else:
        da_location = da.interp(
            coords={"lat": location[0], "lon": location[1]},
            method="nearest")
        multiindex_df = da_location.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df = df_clean.drop(columns=["lat", "lon", "slor", "anor", "z"])

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "d2m", "tcwv", "N34", "tp"]]  # format order

    # Keep first 70% of the time period for training
    train_df = df[df["time"] < df["time"].max() * 0.7]
    xtrain = train_df.drop(columns=["tp"]).values
    ytrain = train_df["tp"].values

    # Last 30% for evaluation
    eval_df = df[df["time"] > df["time"].max() * 0.7]
    x_eval = eval_df.drop(columns=["tp"]).values
    y_eval = eval_df["tp"].values

    # Split evaluation chronologically into validation and test data
    xval, xtest, yval, ytest = train_test_split(
        x_eval, y_eval, test_size=0.3333, shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
def cluster_correlation_heatmap():
    """
    Plot correlation heatmaps for the three UIB cluster regimes, including
    1-6 month lags of the CGTI index.

    Outputs
        None (shows one matplotlib figure per cluster)
    """
    masks = ["Khyber_mask.nc", "Gilgit_mask.nc", "Ngari_mask.nc"]
    # NOTE(review): the title order does not match the mask order (masks[0]
    # is Khyber but names[0] is "Gilgit regime") — confirm whether the
    # offset is intentional or the labels are swapped.
    names = ["Gilgit regime", "Ngari regime", "Khyber regime"]

    for i in range(3):
        cluster_df = dd.download_data(masks[i])

        # create 1-6 month lags of the CGTI index
        for lag in range(1, 7):
            cluster_df["CGTI-" + str(lag)] = (
                cluster_df["CGTI"].shift(periods=lag))

        # BUG FIX: the original called the DataFrame
        # (`cluster_df(columns=...)`, a TypeError); drop was intended.
        df = cluster_df.drop(columns=["expver", "time"])
        df_clean = df.dropna()
        df_sorted = df_clean.sort_index(axis=1)

        # Correlation matrix
        corr = df_sorted.corr()

        # Plot
        sns.set(style="white")
        plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio.
        # BUG FIX: np.bool was removed in NumPy 1.24+; use the builtin bool.
        mask = np.triu(np.ones_like(
            corr, dtype=bool))  # generate a mask for the upper triangle
        sns.heatmap(
            corr,
            mask=mask,
            cmap=cmap,
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.5},
        )
        plt.title(names[i] + "\n")
        plt.show()
import DataDownloader as dd import Maps as maps ## Filepaths mask_filepath = "Data/ERA5_Upper_Indus_mask.nc" dem_filepath = "Data/elev.0.25-deg.nc" ## Function inputs ### Digital Elevation Model data dem = xr.open_dataset(dem_filepath) dem_da = (dem.data).sum(dim="time") sliced_dem = dem_da.sel(lat=slice(38, 30), lon=slice(71.25, 82.75)) ### Precipitation data da = dd.download_data(mask_filepath, xarray=True) tp_da = da.tp ### Decade list decades = [1980, 1990, 2000, 2010] ### Cluster list N = np.arange(2, 11, 1) def seasonal_clusters(tp_da, sliced_dem, N, decades): """ K-means clustering of precipitation data as a function of seasons, decades and number of clusters. Returns spatial graphs, overlayed with the local topography contours. Inputs: