Example #1
def metconnect_id_loc(sites=None,
                      mc_server='SQL2012PROD03',
                      mc_db='MetConnect',
                      mc_site_table='RainFallPredictionSites',
                      mc_cols=['MetConnectID', 'SiteString', 'TidedaID'],
                      gis_server='SQL2012PROD05'):
    """
    Function to extract the MetConnect ID table with geometry locations.

    Parameters
    ----------
    sites : list of int or None
        The site numbers to extract from the table, or None for all.

    Returns
    -------
    GeoDataFrame
    """

    ### Input parameters
    #    hy_server = 'SQL2012PROD05'
    #    hy_db = 'Hydrotel'
    #    pts_table = 'Points'
    #    objs_table = 'Objects'
    #    sites_table = 'Sites'
    #
    #    pts_cols = ['Point', 'Object']
    #    objs_cols = ['Object', 'Site']
    #    sites_cols = ['Site', 'ExtSysId']

    loc_db = 'Bgauging'
    loc_table = 'RSITES'

    loc_cols = ['SiteNumber', 'NZTMX', 'NZTMY']

    ## Import tables
    mc1 = rd_sql(mc_server, mc_db, mc_site_table, mc_cols)
    mc2 = mc1[~mc1.SiteString.str.startswith('M')]
    mc2.columns = ['MetConnectID', 'site_name', 'ExtSysId']
    mc2 = mc2[(mc2.MetConnectID != 7) & mc2.ExtSysId.notnull()]
    mc2.loc[:, 'ExtSysId'] = mc2.loc[:, 'ExtSysId'].astype(int)

    ## Restrict to the requested site numbers if provided
    if sites is not None:
        mc2 = mc2[mc2.ExtSysId.isin(sites)]

    #    hy_pts = rd_sql(hy_server, hy_db, pts_table, pts_cols, 'Point', mc2.Point.tolist())
    #    hy_objs = rd_sql(hy_server, hy_db, objs_table, objs_cols, 'Object', hy_pts.Object.tolist())
    #    hy_sites = rd_sql(hy_server, hy_db, sites_table, sites_cols, 'Site', hy_objs.Site.tolist())
    #    hy_sites['ExtSysId'] = to_numeric(hy_sites['ExtSysId'])
    hy_loc = rd_sql(gis_server, loc_db, loc_table, loc_cols, 'SiteNumber',
                    mc2.ExtSysId.tolist())
    hy_loc.columns = ['ExtSysId', 'x', 'y']

    #    t1 = merge(mc2, hy_pts, on='Point')
    #    t2 = merge(t1, hy_objs, on='Object')
    #    t3 = merge(t2, hy_sites, on='Site')
    t4 = pd.merge(mc2, hy_loc, on='ExtSysId')

    hy_xy = xy_to_gpd('MetConnectID', 'x', 'y', t4)

    return hy_xy
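A minimal usage sketch (the site numbers and output path below are hypothetical; server and table names fall back to the function defaults):

# Extract all non-'M' rainfall-prediction sites as a GeoDataFrame of
# MetConnectID points, then repeat for two hypothetical site numbers.
mc_sites = metconnect_id_loc()
mc_subset = metconnect_id_loc(sites=[66401, 66402])
mc_sites.to_file('metconnect_sites.shp')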
Example #2
def sel_xy_nc(bound_shp, nc_path, x_col='longitude', y_col='latitude', time_col='time', nc_vars=None, buffer_dis=0,
              from_date=None, to_date=None, nc_crs=4326, out_crs=None, out_type='pandas'):
    """
    Function to select space and time data from a netcdf file using a polygon shapefile.
    """

    ### Process the boundary layer
    bound = gpd.read_file(bound_shp).buffer(buffer_dis).to_crs(convert_crs(nc_crs))
    x_min, y_min, x_max, y_max = bound.unary_union.bounds

    ### Read and extract data from netcdf files
    ds1 = xr.open_dataset(nc_path)
    time1 = pd.to_datetime(ds1[time_col].values)
    if isinstance(from_date, str):
        time1 = time1[time1 >= from_date]
    if isinstance(to_date, str):
        time1 = time1[time1 <= to_date]
    lat1 = ds1[y_col].values
    lon1 = ds1[x_col].values
    lat2 = lat1[(lat1 >= y_min) & (lat1 <= y_max)]
    lon2 = lon1[(lon1 >= x_min) & (lon1 <= x_max)]
    ds2 = ds1.loc[{x_col: lon2, time_col: time1.values, y_col: lat2}]

    #    coords1 = ds2.coords.keys()
    #    dims1 = ds2.dims.keys()

    ## Select mtypes
    if isinstance(nc_vars, str):
        ds3 = ds2[[nc_vars]]
    elif isinstance(nc_vars, (list, np.ndarray, pd.Series)):
        ds3 = ds2[nc_vars]
    elif nc_vars is None:
        ds3 = ds2
    else:
        raise TypeError('nc_vars must be None, a str, or a list of str')

    ### Convert to different crs if needed
    if out_crs is not None:
        df1 = ds3.to_dataframe().reset_index()
        xy1 = ds3[[x_col, y_col]].copy()
        xy2 = xy1.to_dataframe().reset_index()
        crs1 = convert_crs(out_crs)
        new_gpd1 = xy_to_gpd(xy2.index, x_col, y_col, xy2, nc_crs)
        new_gpd2 = new_gpd1.to_crs(crs1)
        site_loc2 = xy2.copy()
        site_loc2['x_new'] = new_gpd2.geometry.apply(lambda j: j.x)
        site_loc2['y_new'] = new_gpd2.geometry.apply(lambda j: j.y)

        df2 = pd.merge(df1, site_loc2[[x_col, y_col, 'x_new', 'y_new']], on=[x_col, y_col], how='left')
        df3 = df2.drop([x_col, y_col], axis=1).rename(columns={'x_new': x_col, 'y_new': y_col})
        ds1.close()
        return df3
    elif out_type == 'pandas':
        df1 = ds3.to_dataframe().reset_index()
        ds1.close()
        return df1
    elif out_type == 'xarray':
        return ds3
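A usage sketch with placeholder file names; 'rain' is assumed to be a variable in the netcdf:

# Clip a daily rainfall netcdf to a catchment polygon for one year and
# return a pandas DataFrame reprojected to NZTM (EPSG:2193).
rain_df = sel_xy_nc('catchment.shp', 'rain_daily.nc', nc_vars='rain',
                    from_date='2016-01-01', to_date='2016-12-31',
                    out_crs=2193)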
Example #3
def rd_nc(poly_shp,
          nc_path,
          poly_epsg=4326,
          poly_id='Station_ID',
          x_col='longitude',
          y_col='latitude',
          data_col='rain',
          as_ts=True,
          export=True,
          export_path='nc_data.csv'):
    """
    Function to read in netCDF files, select locations based on a polygon, and export the results.
    """

    ### Read in all data
    poly = gpd.read_file(poly_shp)[[poly_id,
                                    'geometry']].to_crs(epsg=poly_epsg)
    nc = xr.open_dataset(nc_path)

    ### Filter nc data
    df1 = nc.to_dataframe().drop('time_bnds', axis=1).reset_index()
    df1 = df1[df1.nb2 == 0].drop('nb2', axis=1)

    ### convert x and y to geopandas
    df1_xy = df1[[y_col, x_col]].drop_duplicates()
    df1_xy['id'] = range(len(df1_xy))
    pts = xy_to_gpd('id', x_col, y_col, df1_xy, poly_epsg)

    ### Mask the points from the polygon
    join1, poly2 = pts_poly_join(pts, poly, poly_id)
    join2 = join1[['id', poly_id]]

    ### Select the associated data
    sel_xy = pd.merge(df1_xy, join2, on='id').drop('id', axis=1)
    df2 = pd.merge(df1, sel_xy, on=[y_col, x_col])

    ### Convert to time series
    if as_ts:
        df3 = df2[[poly_id, 'time',
                   data_col]].groupby([poly_id, 'time']).first().reset_index()
        df4 = df3.pivot(index='time', columns=poly_id,
                        values=data_col).round(2)
    else:
        df4 = df2

    if export:
        df4.to_csv(export_path)

    return df4
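A usage sketch with placeholder paths; the netcdf is assumed to contain the 'time_bnds' and 'nb2' bounds variables that the function drops, plus a 'rain' data variable:

# Build a time-by-station table of rainfall without writing a CSV.
rain_ts = rd_nc('stations.shp', 'rain.nc', export=False)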
Example #4
def rd_niwa_rcp(base_path, mtypes, poly,
                vcsn_sites_csv=r'\\fileservices02\ManagedShares\Data\VirtualClimate\GIS\niwa_vcsn_wgs84.csv',
                id_col='Network', x_col='deg_x', y_col='deg_y', output_fun=None, export_path='output'):
    """
    Function to read in the NIWA RCP netcdf files and output the data in a specified format.
    """

    mtype_name = {'precip': 'TotalPrecipCorr', 'T_max': 'MaxTempCorr', 'T_min': 'MinTempCorr', 'P_atmos': 'MSLP',
                  'PET': 'PE', 'RH_mean': 'RelHum', 'R_s': 'SurfRad', 'U_z': 'WindSpeed'}

    ### Import and reorganize data
    vcsn_sites = pd.read_csv(vcsn_sites_csv)[[id_col, x_col, y_col]]

    sites_gpd = xy_to_gpd(id_col, x_col, y_col, vcsn_sites, 4326)
    poly1 = gpd.read_file(poly)

    sites_gpd2 = sites_gpd.to_crs(poly1.crs)

    mtypes1 = [mtype_name[i] for i in mtypes]

    ### Select sites
    sites_gpd3 = sel_sites_poly(sites_gpd2, poly1)[id_col]
    site_loc1 = vcsn_sites[vcsn_sites[id_col].isin(sites_gpd3)]
    site_loc1.columns = ['id', 'x', 'y']

    ### Read and extract data from netcdf files

    for root, dirs, files in os.walk(base_path):
        files2 = [i for i in files if i.endswith('.nc')]
        files3 = [j for j in files2 if any(j.startswith(i) for i in mtypes1)]
        file_paths1 = [os.path.join(root, i) for i in files3]
        if len(file_paths1) > 0:
            ds = rd_niwa_rcp_dir(file_paths1, site_loc1, mtypes)
            if callable(output_fun):
                new_base_path = root.replace(base_path, export_path)
                base_file_name = file_paths1[0].split('VCSN_')[1]
                if not os.path.exists(new_base_path):
                    os.makedirs(new_base_path)
                output_fun(ds, new_base_path, base_file_name)
                print(base_file_name)
            else:
                raise ValueError('Must have an output function.')
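Because output_fun receives the extracted dataset together with an output directory and a base file name, a minimal example can simply write each dataset back out as netcdf. The paths below are placeholders, and the sketch assumes rd_niwa_rcp_dir returns an xarray Dataset:

import os

def save_nc(ds, base_path, base_file_name):
    # Write the extracted dataset into the mirrored output directory.
    ds.to_netcdf(os.path.join(base_path, base_file_name))

rd_niwa_rcp('rcp_base_dir', ['precip', 'T_max'], 'zone.shp',
            output_fun=save_nc, export_path='rcp_output')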
Example #5
def poly_interp_agg(precip,
                    precip_crs,
                    poly,
                    data_col,
                    time_col,
                    x_col,
                    y_col,
                    interp_buffer_dis=10000,
                    poly_buffer_dis=0,
                    grid_res=None,
                    interp_fun='cubic',
                    agg_ts_fun=None,
                    period=None,
                    digits=2,
                    agg_xy=False,
                    nfiles='many',
                    output_path=None):
    """
    Function to select the precip sites within a buffered polygon, interpolate/resample the data at a specified resolution, and output the results.

    Parameters
    ----------
    precip: DataFrame
        Dataframe of time, x, y, and precip.
    precip_crs: int
        The crs of the x and y coordinates of the precip dataframe.
    poly: GeoDataFrame or str
        str path of a shapefile polygon or a polygon GeoDataFrame.
    interp_buffer_dis: int
        Buffer distance of the polygon selection when performing the interpolation.
    poly_buffer_dis: int
        Buffer distance of the polygon selection when outputting the results.
    grid_res: int
        The resulting grid resolution in meters (or the unit of the final projection).
    interp_fun: str
        The scipy griddata interpolation function to be applied (see https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.interpolate.griddata.html).
    agg_ts_fun: str or None
        The pandas time series resampling function to resample the data in time (either 'mean' or 'sum'). If None, then no time resampling.
    period: str or None
        The pandas time series code to resample the data in time (i.e. '2H' for two hours).
    digits: int
        The number of digits to round to.
    agg_xy: bool
        Should all of the interpolated points within the polygon area be aggregated (mean) to a single time series?
    nfiles: str
        If output_path is a geotiff, then 'one' or 'many' geotiffs to be created.
    output_path: str or None
        Full path string where the output should be stored. The file extension should be one of '.tif' for geotiff, '.nc' for netcdf, or '.csv' for csv.

    Returns
    -------
    DataFrame
    """

    ### Convert x and y of precip to geodataframe
    sites0 = precip[[x_col, y_col]].drop_duplicates().reset_index(drop=True)
    sites = xy_to_gpd(sites0.index,
                      sites0[x_col],
                      sites0[y_col],
                      crs=precip_crs)
    sites.columns = ['site', 'geometry']

    ### Select the locations within the polygon
    if isinstance(poly, (gpd.GeoDataFrame, gpd.GeoSeries)):
        poly1 = poly.copy()
    elif isinstance(poly, str):
        poly1 = gpd.read_file(poly)
    sites1 = sites.to_crs(poly1.crs)
    sites_sel = sel_sites_poly(sites1, poly1, interp_buffer_dis)
    sites2 = sites0.loc[sites_sel['site']]

    ### Determine the grid resolution if not set
    if not isinstance(grid_res, (int, float)):
        bounds = poly1.unary_union.bounds
        x_range = bounds[2] - bounds[0]
        y_range = bounds[3] - bounds[1]
        min1 = min([x_range, y_range])
        grid_res = int(np.ceil(min1 / 20))

    ### Select the precip data from the sites
    precip2 = pd.merge(precip, sites2, on=[x_col, y_col]).dropna()

    ### Interpolate grid
    poly_crs = ['+' + str(i) + '=' + str(poly1.crs[i]) for i in poly1.crs]
    poly_crs1 = ' '.join(poly_crs)
    new_precip = grid_interp_ts(precip2,
                                time_col,
                                x_col,
                                y_col,
                                data_col,
                                grid_res,
                                sites.crs,
                                poly_crs1,
                                interp_fun=interp_fun,
                                agg_ts_fun=agg_ts_fun,
                                period=period,
                                digits=digits)

    ### Create new sites list
    time = new_precip[time_col].sort_values().unique()
    sites_new_df = new_precip.loc[new_precip[time_col] == time[0],
                                  [x_col, y_col, data_col]]
    sites_new = xy_to_gpd(sites_new_df.index.values, x_col, y_col,
                          sites_new_df, poly_crs1)
    sites_new.columns = ['site', 'geometry']
    new_precip['site'] = np.tile(sites_new_df.index.values, len(time))

    ### Select sites from polygon
    sites_sel2 = sel_sites_poly(sites_new, poly1, poly_buffer_dis)
    new_precip2 = new_precip.loc[new_precip.site.isin(sites_sel2.site),
                                 [time_col, x_col, y_col, data_col]]

    ### Agg to polygon if required
    if agg_xy:
        new_precip3 = new_precip2.groupby(time_col)[data_col].mean().round(
            digits)
        time_col = None
    else:
        new_precip3 = new_precip2.set_index([time_col, x_col, y_col])[data_col]

    ### Save results
    if isinstance(output_path, str):
        path1 = os.path.splitext(output_path)[0]
        if '.csv' in output_path:
            new_precip3.to_csv(path1 + '.csv', header=True)

        if '.tif' in output_path:
            df = new_precip3.reset_index()
            save_geotiff(df=df,
                         data_col=data_col,
                         crs=poly_crs1,
                         x_col=x_col,
                         y_col=y_col,
                         time_col=time_col,
                         nfiles=nfiles,
                         export_path=path1 + '.tif')

        if '.nc' in output_path:
            ds1 = new_precip3.to_xarray().to_dataset()
            ds1.attrs['spatial_ref'] = poly_crs1
            ds1.to_netcdf(path1 + '.nc')

    return new_precip3
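A usage sketch, assuming precip_df is a long-format DataFrame with 'time', 'x', 'y', and 'precip' columns in NZTM (EPSG:2193) and 'zone.shp' is a placeholder polygon path:

# Interpolate onto a 1 km grid, resample to daily sums, and also save a netcdf.
daily_grid = poly_interp_agg(precip_df, 2193, 'zone.shp', 'precip', 'time',
                             'x', 'y', grid_res=1000, agg_ts_fun='sum',
                             period='D', output_path='precip_daily.nc')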
Example #6
def input_processing(precip_et, crs, irr1, paw1, bound_shp, rain_name, pet_name,
                     grid_res, buffer_dis, interp_fun, agg_ts_fun, time_agg,
                     irr_eff_dict, irr_trig_dict, min_irr_area_ratio=0.01,
                     irr_mons=[10, 11, 12, 1, 2, 3, 4], precip_correction=1.1):
    """
    Function to process the input data for the LSRM. Returns a DataFrame of the LSRM variables and a GeoDataFrame of the site polygons.
    """
    np.seterr(invalid='ignore')

    ## Load and resample precip and et
    bound = gpd.read_file(bound_shp)

    new_rain = poly_interp_agg(precip_et, crs, bound_shp, rain_name, 'time', 'x', 'y',
                               buffer_dis, grid_res, grid_res, interp_fun=interp_fun,
                               agg_ts_fun=agg_ts_fun, period=time_agg) * precip_correction
    new_rain.name = 'precip'

    new_et = poly_interp_agg(precip_et, crs, bound_shp, pet_name, 'time', 'x', 'y',
                             buffer_dis, grid_res, grid_res, interp_fun=interp_fun,
                             agg_ts_fun=agg_ts_fun, period=time_agg)
    new_et.name = 'pet'

    new_rain_et = pd.concat([new_rain, new_et], axis=1)

    ## convert new point locations to geopandas
    time1 = new_rain_et.index.levels[0][0]
    grid1 = new_rain_et.loc[time1].reset_index()[['x', 'y']]
    grid2 = xy_to_gpd(grid1.index, 'x', 'y', grid1, bound.crs)
    grid2.columns = ['site', 'geometry']

    all_times = new_rain_et.index.levels[0]
    new_rain_et.loc[:, 'site'] = np.tile(grid1.index, len(all_times))

    ## Convert points to polygons
    sites_poly = points_grid_to_poly(grid2, 'site')

    ## process polygon data
    # Select polygons within the boundary

    sites_poly_union = sites_poly.unary_union
    irr2 = irr1[irr1.intersects(sites_poly_union)]
    irr3 = irr2[irr2.irr_type.notnull()]
    paw2 = paw1[paw1.intersects(sites_poly_union)]
    paw3 = paw2[paw2.paw.notnull()]

    # Overlay intersection
    sites_poly1 = spatial_overlays(sites_poly, bound, how='intersection')[['site', 'geometry']]
    sites_poly2 = sites_poly1.dissolve('site')
    sites_poly2.crs = sites_poly.crs
    sites_poly_area = sites_poly2.area.round(2)
    sites_poly3 = sites_poly2.reset_index()

    irr4 = spatial_overlays(irr3, sites_poly3, how='intersection')
    paw4 = spatial_overlays(paw3, sites_poly3, how='intersection')

    irr4['area'] = irr4.geometry.area.round()
    irr5 = irr4[irr4.area >= 1].drop(['idx1', 'idx2'], axis=1).copy()

    paw4['area'] = paw4.geometry.area.round()
    paw5 = paw4.loc[(paw4.area >= 1)].drop(['idx1', 'idx2'], axis=1).copy()
    paw5.loc[paw5.paw <= 0, 'paw'] = 1

    # Add in missing PAW values - Change later to something more useful if needed
    mis_sites_index = ~sites_poly3.site.isin(paw5.site)
    sites_poly3['area'] = sites_poly3.area.round()

    paw6 = pd.concat([paw5, sites_poly3[mis_sites_index]])
    paw6.loc[paw6.paw.isnull(), 'paw'] = 1

    # Aggregate by site weighted by area to estimate a volume
    paw_area1 = paw6[['paw', 'site', 'area']].copy()
    paw_area1.loc[:, 'paw_vol'] = paw_area1['paw'] * paw_area1['area']
    paw7 = ((paw_area1.groupby('site')['paw_vol'].sum() / paw_area1.groupby('site')['area'].sum()) * sites_poly_area * 0.001).round(2)

    site_irr_area = irr5.groupby('site')['area'].sum()
    irr_eff1 = irr5.replace({'irr_type': irr_eff_dict})
    irr_eff1.loc[:, 'irr_eff'] = irr_eff1['irr_type'] * irr_eff1['area']
    irr_eff2 = (irr_eff1.groupby('site')['irr_eff'].sum() / site_irr_area).round(3)

    irr_trig1 = irr5.replace({'irr_type': irr_trig_dict})
    irr_trig1.loc[:, 'irr_trig'] = irr_trig1['irr_type'] * irr_trig1['area']
    irr_trig2 = (irr_trig1.groupby('site')['irr_trig'].sum() / site_irr_area).round(3)

    irr_area_ratio1 = (site_irr_area/sites_poly_area).round(3)

    poly_data1 = pd.concat([paw7, sites_poly_area, irr_eff2, irr_trig2, irr_area_ratio1], axis=1)
    poly_data1.columns = ['paw', 'site_area', 'irr_eff', 'irr_trig', 'irr_area_ratio']
    poly_data1.loc[poly_data1['irr_area_ratio'] < min_irr_area_ratio, ['irr_eff', 'irr_trig', 'irr_area_ratio']] = np.nan

    ## Combine time series with polygon data
    new_rain_et1 = new_rain_et[new_rain_et['site'].isin(sites_poly2.index)]

    input1 = pd.merge(new_rain_et1.reset_index(), poly_data1.reset_index(), on='site', how='left')

    ## Convert precip and et to volumes
    input1.loc[:, ['precip', 'pet']] = (input1.loc[:, ['precip', 'pet']].mul(input1.loc[:, 'site_area'], axis=0) * 0.001).round(2)

    ## Remove irrigation parameters during non-irrigation times
    input1.loc[~input1.time.dt.month.isin(irr_mons), ['irr_eff', 'irr_trig']] = np.nan

    ## Run checks on the input data

#    print('Running checks on the prepared input data')

    null_time = input1.loc[input1.time.isnull(), 'time']
    null_x = input1.loc[input1.x.isnull(), 'x']
    null_y = input1.loc[input1.y.isnull(), 'y']
    null_pet = input1.loc[input1['pet'].isnull(), 'pet']
    null_rain = input1.loc[input1['precip'].isnull(), 'precip']
    null_paw = input1.loc[input1.paw.isnull(), 'paw']
    not_null_irr_eff = input1.loc[input1.irr_eff.notnull(), 'irr_eff']

    if not null_time.empty:
        raise ValueError('Null values in the time variable')
    if not null_x.empty:
        raise ValueError('Null values in the x variable')
    if not null_y.empty:
        raise ValueError('Null values in the y variable')
    if not null_pet.empty:
        raise ValueError('Null values in the pet variable')
    if not null_rain.empty:
        raise ValueError('Null values in the rain variable')
    if not null_paw.empty:
        raise ValueError('Null values in the paw variable')
    if not_null_irr_eff.empty:
        raise ValueError('No values for irrigation variables')

    if input1['time'].dtype.name != 'datetime64[ns]':
        raise ValueError('time variable must be a datetime64[ns] dtype')
    if input1['x'].dtype != float:
        raise ValueError('x variable must be a float dtype')
    if input1['y'].dtype != float:
        raise ValueError('y variable must be a float dtype')
    if input1['pet'].dtype != float:
        raise ValueError('pet variable must be a float dtype')
    if input1['precip'].dtype != float:
        raise ValueError('precip variable must be a float dtype')
    if input1['paw'].dtype != float:
        raise ValueError('paw variable must be a float dtype')
    if input1['irr_eff'].dtype != float:
        raise ValueError('irr_eff variable must be a float dtype')
    if input1['irr_trig'].dtype != float:
        raise ValueError('irr_trig variable must be a float dtype')
    if input1['irr_area_ratio'].dtype != float:
        raise ValueError('irr_area_ratio variable must be a float dtype')

    ## Return results
    return input1, sites_poly2
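A usage sketch; irr1 and paw1 are assumed GeoDataFrames of irrigation-type and profile-available-water polygons, and the dictionaries (hypothetical values) map irr_type categories to efficiencies and trigger levels:

irr_eff = {'Spray': 0.8, 'Drip': 0.9}    # hypothetical irrigation efficiencies
irr_trig = {'Spray': 0.5, 'Drip': 0.7}   # hypothetical trigger levels
lsrm_input, site_polys = input_processing(
    precip_et, 2193, irr1, paw1, 'bound.shp', 'rain', 'pe',
    grid_res=1000, buffer_dis=10000, interp_fun='cubic',
    agg_ts_fun='sum', time_agg='D', irr_eff_dict=irr_eff,
    irr_trig_dict=irr_trig)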
Example #7
def rd_hdf(self, h5_path):
    """
    Function to read an HDF5 file (.h5) that was exported from a hydro class.
    """
    ### Read in base tsdata and attributes
    ## Read in tsdata
    tsdata = pd.read_hdf(h5_path, 'tsdata')
    if 'qual_codes' in tsdata.columns:
        qual_codes = 'qual_codes'
    else:
        qual_codes = None

    ## Read in mfreq
    mfreq = pd.read_hdf(h5_path, 'mfreq').to_dict()

    ## Read in units
    units = pd.read_hdf(h5_path, 'units').to_dict()

    ### Make new Hydro class
    new1 = self.add_tsdata(tsdata.reset_index(),
                           dformat='long',
                           hydro_id='hydro_id',
                           freq_type=mfreq,
                           times='time',
                           sites='site',
                           values='value',
                           units=units,
                           qual_codes=qual_codes)

    ### Read in site attributes
    try:
        site_attr = pd.read_hdf(h5_path, 'site_attr')
        setattr(new1, 'site_attr', site_attr)
    except Exception:
        print('No site attributes.')

    ### Read in geo points
    try:
        geo_point1 = pd.read_hdf(h5_path, 'geo_point')
        geo_point_crs = pd.to_numeric(pd.read_hdf(h5_path, 'geo_point_crs'),
                                      'ignore').to_dict()
        geo_point = xy_to_gpd('site', 'x', 'y', geo_point1,
                              geo_point_crs).set_index('site')
        new1.add_geo_point(geo_point, check=False)
    except Exception:
        print('No geo points.')

    ### Read in geo catch
    try:
        geo_catch1 = pd.read_hdf(h5_path, 'geo_catch')
        geo1 = [loads(x) for x in geo_catch1.wkt.values]
        geo_catch_crs = pd.to_numeric(pd.read_hdf(h5_path, 'geo_catch_crs'),
                                      'ignore').to_dict()
        gdf_catch = gpd.GeoDataFrame(geo_catch1.drop('wkt', axis=1),
                                     geometry=geo1,
                                     crs=geo_catch_crs).set_index('site')
        new1.add_geo_catch(gdf_catch, check=False)  # assumed catch counterpart to add_geo_point
    except Exception:
        print('No geo catch.')

    return new1
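A usage sketch; the class name hydro is an assumption here, standing in for whatever hydro class produced the export, and the path is a placeholder:

# Rebuild a hydro object from a previously exported HDF5 file.
h1 = hydro().rd_hdf('flow_export.h5')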
Example #8
def rd_niwa_vcsn(mtypes, sites,
                 nc_path=r'\\fileservices02\ManagedShares\Data\VirtualClimate\vcsn_precip_et_2016-06-06.nc',
                 vcsn_sites_csv=r'\\fileservices02\ManagedShares\Data\VirtualClimate\GIS\niwa_vcsn_wgs84.csv',
                 id_col='Network', x_col='deg_x', y_col='deg_y', buffer_dis=0, include_sites=False, from_date=None,
                 to_date=None, out_crs=None, netcdf_out=None):
    """
    Function to read in the NIWA vcsn netcdf file and output the data as a dataframe.

    mtypes -- A string or list of the measurement types (either 'precip', or 'PET').\n
    sites -- Either a list of vcsn site names or a polygon of the area of interest.\n
    nc_path -- The path to the vcsn nc file.\n
    vcsn_sites_csv -- The csv file that relates the site name to coordinates.\n
    id_col -- The site name column in vcsn_sites_csv.\n
    x_col - The x column name in vcsn_sites_csv.\n
    y_col -- The y column name in vcsn_sites_csv.\n
    include_sites -- Should the site names be added to the output?\n
    out_crs -- The crs epsg number for the output coordinates if different than the default WGS85 (e.g. 2193 for NZTM).
    """

    mtype_name = {'precip': 'rain', 'PET': 'pe'}

    ### Import and reorganize data
    vcsn_sites = pd.read_csv(vcsn_sites_csv)[[id_col, x_col, y_col]]

    if isinstance(sites, str):
        if sites.endswith('.shp'):
            sites_gpd = xy_to_gpd(id_col, x_col, y_col, vcsn_sites, 4326)
            poly1 = gpd.read_file(sites)

            sites_gpd2 = sites_gpd.to_crs(poly1.crs)

            ### Select sites
            sites2 = sel_sites_poly(sites_gpd2, poly1, buffer_dis)[id_col]
        else:
            raise ValueError('If sites is a str, it must be a path to a shapefile (.shp)')
    elif isinstance(sites, (list, pd.Series, np.ndarray)):
        sites2 = sites
    else:
        raise TypeError('sites must be a list of site names or a shapefile path')

    ### Select locations
    site_loc1 = vcsn_sites[vcsn_sites[id_col].isin(sites2)]
    site_loc1.columns = ['id', 'x', 'y']

    ### Select mtypes
    if isinstance(mtypes, str):
        mtypes1 = [mtype_name[mtypes]]
    else:
        mtypes1 = [mtype_name[i] for i in mtypes]

    if include_sites:
        mtypes1.extend(['site'])

    ### Read and extract data from netcdf files
    ds1 = xr.open_dataset(nc_path)
    time1 = pd.to_datetime(ds1.time.values)
    if isinstance(from_date, str):
        time1 = time1[time1 >= from_date]
    if isinstance(to_date, str):
        time1 = time1[time1 <= to_date]
    lat1 = ds1.latitude.values
    lon1 = ds1.longitude.values
    lat2 = lat1[np.isin(lat1, site_loc1.y.unique())]
    lon2 = lon1[np.isin(lon1, site_loc1.x.unique())]
    ds2 = ds1.loc[{'longitude': lon2, 'time': time1.values, 'latitude': lat2}]
    ds3 = ds2[mtypes1]

    ### Convert to DataFrame
    df1 = ds3.to_dataframe().reset_index()
    df1.rename(columns={'latitude': 'y', 'longitude': 'x'}, inplace=True)
    df1 = df1.dropna()

    ### Convert to different crs if needed
    if out_crs is not None:
        crs1 = convert_crs(out_crs)
        new_gpd1 = xy_to_gpd('id', 'x', 'y', site_loc1, 4326)
        new_gpd2 = new_gpd1.to_crs(crs1)
        site_loc2 = site_loc1.copy()
        site_loc2['x_new'] = new_gpd2.geometry.apply(lambda j: j.x)
        site_loc2['y_new'] = new_gpd2.geometry.apply(lambda j: j.y)

        df2 = pd.merge(df1, site_loc2[['x', 'y', 'x_new', 'y_new']], on=['x', 'y'])
        df3 = df2.drop(['x', 'y'], axis=1).rename(columns={'x_new': 'x', 'y_new': 'y'})
        col_order = ['y', 'x', 'time']
        col_order.extend(mtypes1)
        df4 = df3[col_order]
    else:
        df4 = df1

    ### Export to netcdf if requested, before closing the datasets
    if isinstance(netcdf_out, str):
        ds3.to_netcdf(netcdf_out)

    ds1.close()
    ds3.close()

    return df4
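A usage sketch with a placeholder shapefile path:

# Daily VCSN precip for every grid point inside a catchment, output in NZTM.
vcsn_precip = rd_niwa_vcsn('precip', 'catchment.shp',
                           from_date='2015-07-01', to_date='2016-06-30',
                           out_crs=2193)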