Example #1
def reject_erroneous_data(r, v, t, y, z, d, fz):
    """
    :param r: reference designator
    :param v: data parameter name
    :param t: time array
    :param y: pressure array
    :param z: data values
    :param d: deployment number
    :param fz: fill values defined in the data file
    :return: time, pressure, data, and deployment arrays with fill values, NaNs, extreme values (|1e7|), and values outside global ranges removed
    """

    # reject fill values
    fv_ind = z != fz
    y_nofv = y[fv_ind]
    t_nofv = t[fv_ind]
    z_nofv = z[fv_ind]
    d_nofv = d[fv_ind]
    print(len(z) - len(z_nofv), ' fill values')

    # reject NaNs
    nan_ind = ~np.isnan(z_nofv)
    t_nofv_nonan = t_nofv[nan_ind]
    y_nofv_nonan = y_nofv[nan_ind]
    z_nofv_nonan = z_nofv[nan_ind]
    d_nofv_nonan = d_nofv[nan_ind]
    print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

    # reject extreme values
    ev_ind = cf.reject_extreme_values(z_nofv_nonan)
    t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
    y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
    z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
    d_nofv_nonan_noev = d_nofv_nonan[ev_ind]
    print(
        len(z_nofv_nonan) - len(z_nofv_nonan_noev), ' Extreme Values', '|1e7|')

    # reject values outside global ranges:
    global_min, global_max = cf.get_global_ranges(r, v)
    if isinstance(global_min,
                  (int, float)) and isinstance(global_max, (int, float)):
        gr_ind = cf.reject_global_ranges(z_nofv_nonan_noev, global_min,
                                         global_max)
        dtime = t_nofv_nonan_noev[gr_ind]
        zpressure = y_nofv_nonan_noev[gr_ind]
        ndata = z_nofv_nonan_noev[gr_ind]
        ndeploy = d_nofv_nonan_noev[gr_ind]
    else:
        gr_ind = []
        dtime = t_nofv_nonan_noev
        zpressure = y_nofv_nonan_noev
        ndata = z_nofv_nonan_noev
        ndeploy = d_nofv_nonan_noev

    print('{} global ranges [{} - {}]'.format(
        len(z_nofv_nonan_noev) - len(ndata), global_min, global_max))

    return dtime, zpressure, ndata, ndeploy
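
A minimal, self-contained sketch of the same filtering chain (fill values, then NaNs, then extreme values, then global ranges) applied to synthetic arrays. The fill value, the |1e7| cutoff, and the global range below are illustrative stand-ins; in the function above they come from the data file and the shared cf helpers.

import numpy as np

# Synthetic example inputs: time, pressure, data values, deployment number
fill_value = -9999.0
t = np.arange('2018-01', '2018-02', dtype='datetime64[D]')
y = np.linspace(0, 30, len(t))                      # "pressure"
z = np.random.default_rng(0).normal(10, 2, len(t))  # data values
d = np.ones(len(t), dtype=int)
z[[3, 7]] = fill_value                              # inject fill values
z[5] = np.nan                                       # inject a NaN
z[10] = 5e8                                         # inject an extreme value

# same chain as reject_erroneous_data, written with inline boolean masks
mask = z != fill_value                              # reject fill values
t, y, z, d = t[mask], y[mask], z[mask], d[mask]
mask = ~np.isnan(z)                                 # reject NaNs
t, y, z, d = t[mask], y[mask], z[mask], d[mask]
mask = np.abs(z) < 1e7                              # reject extreme values (|1e7| cutoff)
t, y, z, d = t[mask], y[mask], z[mask], d[mask]
global_min, global_max = 0.0, 50.0                  # illustrative global range
mask = (z >= global_min) & (z <= global_max)        # reject values outside global ranges
t, y, z, d = t[mask], y[mask], z[mask], d[mask]
print(len(z), 'points remain after filtering')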
Example #2
def plot_timeseries(x, y, y_name, stdev=None):
    """
    Create a simple timeseries plot
    :param x: array containing data for x-axis (e.g. time)
    :param y: .nc data array for plotting on the y-axis, including data values, coordinates, and variable attributes
    :param stdev: desired standard deviation to exclude from plotting
    """

    if type(y) is not np.ndarray:
        yval = y.values
    else:
        yval = y

    if type(x) is not np.ndarray:
        x = x.values

    if stdev is None:
        xD = x
        yD = yval
        leg_text = ()
    else:
        ind = cf.reject_extreme_values(yval)
        ydata = yval[ind]
        xdata = x[ind]

        if len(xdata) > 0:
            ind2 = cf.reject_outliers(ydata, stdev)
            yD = ydata[ind2]
            xD = xdata[ind2]
            outliers = str(len(y) - len(yD))
            leg_text = ('removed {} outliers (SD={})'.format(outliers,
                                                             stdev), )
        else:
            xD = []
            yD = []
            leg_text = ()

    fig, ax = plt.subplots()
    plt.grid()
    if len(xD) > 0:
        plt.plot(xD, yD, '.', markersize=2)

        y_units = get_units(y)

        ax.set_ylabel((y_name + " (" + y_units + ")"), fontsize=9)
        format_date_axis(ax, fig)
        y_axis_disable_offset(ax)
        ax.legend(leg_text, loc='best', fontsize=6)

    return fig, ax
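
The outlier handling above depends on two helpers from the shared cf module. Their real implementations are not shown here; the sketch below only captures the assumed behavior (keep |values| below 1e7, matching the "Extreme Values |1e7|" note in Example #1, and keep points within N standard deviations of the mean, matching the "removed N outliers (SD=...)" legend text) as simple boolean-mask functions.

import numpy as np

def reject_extreme_values(values, cutoff=1e7):
    # assumed behavior of cf.reject_extreme_values: keep |values| below the cutoff
    return np.abs(values) < cutoff

def reject_outliers(values, n_stdev):
    # assumed behavior of cf.reject_outliers: keep points within n_stdev
    # standard deviations of the mean (NaNs ignored in the statistics)
    mu = np.nanmean(values)
    sd = np.nanstd(values)
    return np.abs(values - mu) <= n_stdev * sd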
Example #3
def plot_timeseries_panel(ds, x, vars, colors, stdev=None):
    """
    Create a timeseries plot with horizontal panels of each science parameter
    :param ds: dataset (e.g. .nc file opened with xarray) containing data for plotting
    :param x: array containing data for x-axis (e.g. time)
    :param vars: list of science variables to plot
    :param colors: list of colors to be used for plotting
    :param stdev: number of standard deviations to use for excluding outliers (None = no outlier rejection)
    """
    fig, ax = plt.subplots(len(vars), sharex=True)
    if len(vars) == 1:  # a single panel returns one Axes object, not an array
        ax = [ax]

    for i in range(len(vars)):
        y = ds[vars[i]]

        if stdev is None:
            yD = y.values
            xD = x
            leg_text = ()
        else:
            ind = cf.reject_extreme_values(y.values)
            ydata = y[ind]
            xdata = x[ind]

            ind2 = cf.reject_outliers(ydata.values, stdev)
            yD = ydata[ind2].values
            xD = xdata[ind2]
            outliers = str(len(y) - len(yD))
            leg_text = ('{}: rm {} outliers'.format(vars[i], outliers), )

        y_units = get_units(y)
        c = colors[i]
        ax[i].plot(xD, yD, '.', markersize=2, color=c)
        ax[i].set_ylabel(('(' + y_units + ')'), fontsize=5)
        ax[i].tick_params(axis='y', labelsize=6)
        ax[i].legend(leg_text, loc='best', fontsize=4)
        y_axis_disable_offset(ax[i])
        if i == len(vars) - 1:  # if the last variable has been plotted
            format_date_axis(ax[i], fig)

    return fig, ax
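
plot_timeseries_panel assumes an xarray Dataset whose science variables carry a units attribute (read by get_units) and share a time coordinate. Below is a minimal sketch of that kind of input built from synthetic data; the variable names and units are invented for the example, and the commented-out call assumes the surrounding plotting module and its helpers are importable.

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2018-01-01', periods=240, freq='h')
rng = np.random.default_rng(1)
ds = xr.Dataset(
    {
        # hypothetical science variables with units attributes
        'seawater_temperature': ('time', rng.normal(8, 1, len(times)), {'units': 'deg_C'}),
        'practical_salinity': ('time', rng.normal(34, 0.2, len(times)), {'units': '1'}),
    },
    coords={'time': times},
)
# fig, ax = plot_timeseries_panel(ds, ds['time'].values,
#                                 ['seawater_temperature', 'practical_salinity'],
#                                 ['r', 'b'], stdev=3)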
Example #4
def plot_timeseries_all(x, y, y_name, y_units, stdev=None):
    """
    Create a simple timeseries plot
    :param x: array containing data for x-axis (e.g. time)
    :param y: array containing data for y-axis
    :param stdev: desired standard deviation to exclude from plotting
    """
    if stdev is None:
        xD = x
        yD = y
        leg_text = ()
    else:
        ind = cf.reject_extreme_values(y)
        ydata = y[ind]
        xdata = x[ind]

        ind2 = cf.reject_outliers(ydata, stdev)
        yD = ydata[ind2]
        xD = xdata[ind2]

        # ind2 = cf.reject_outliers(y, stdev)
        # yD = y[ind2]
        # xD = x[ind2]
        outliers = str(len(y) - len(yD))
        leg_text = ('removed {} outliers (SD={})'.format(outliers, stdev), )

    fig, ax = plt.subplots()
    plt.grid()
    plt.plot(xD, yD, '.', markersize=2)

    #plt.ylim([-10, 50])

    ax.set_ylabel((y_name + " (" + y_units + ")"), fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax
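
format_date_axis and y_axis_disable_offset are helpers defined elsewhere in the plotting module. The stand-ins below are plausible sketches of what they do (readable date ticks on the x-axis; no offset notation on the y-axis), written only with standard matplotlib calls; they are assumptions, not the actual implementations.

import matplotlib.dates as mdates
from matplotlib.ticker import ScalarFormatter

def format_date_axis(ax, fig):
    # plausible stand-in: automatic date ticks with rotated, readable labels
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    fig.autofmt_xdate()

def y_axis_disable_offset(ax):
    # plausible stand-in: turn off matplotlib's offset (scientific) notation
    ax.yaxis.set_major_formatter(ScalarFormatter(useOffset=False))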
Example #5
def main(sDir, url_list, preferred_only):
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        # filter datasets
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)

        if preferred_only != 'yes':
            # preferred stream info is still needed below for the deployment end times
            ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # # filter datasets
        # datasets = []
        # for u in url_list:
        #     print(u)
        #     splitter = u.split('/')[-2].split('-')
        #     rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
        #     if rd_check == r:
        #         udatasets = cf.get_nc_urls([u])
        #         datasets.append(udatasets)
        # datasets = list(itertools.chain(*datasets))
        # main_sensor = r.split('-')[-1]
        # fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        # fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        methodstream = []
        for f in fdatasets:
            methodstream.append('-'.join((f.split('/')[-2].split('-')[-2],
                                          f.split('/')[-2].split('-')[-1])))

        ms_dict = save_dir_path(ms_list)
        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]
            check_ms = ms.split('-')[1]
            if 'recovered' in check_ms:
                check_ms = check_ms.split('_recovered')[0]

            if ms_dict['ms_count'][ms_dict['ms_unique'] == ms.split('-')[0]] == 1:
                save_dir = os.path.join(sDir, array, subsite, r,
                                        'timeseries_yearly_plot',
                                        ms.split('-')[0])
            else:
                save_dir = os.path.join(sDir, array, subsite, r,
                                        'timeseries_yearly_plot',
                                        ms.split('-')[0], check_ms)
            cf.create_dir(save_dir)

            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)
            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print(fd)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    try:
                        ds[var]
                        print(var)
                        deployment_num = np.unique(ds['deployment'].values)[0]
                        sh['deployments'] = np.append(sh['deployments'],
                                                      deployment_num)
                        if ds[var].units == sh['db_units']:
                            if ds[var]._FillValue not in sh['fv']:
                                sh['fv'].append(ds[var]._FillValue)
                            if ds[var].units not in sh['units']:
                                sh['units'].append(ds[var].units)
                            tD = ds['time'].values
                            varD = ds[var].values
                            sh['t'] = np.append(sh['t'], tD)
                            sh['values'] = np.append(sh['values'], varD)
                    except KeyError:
                        print('KeyError: ', var)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        deployments_num = vinfo['deployments']
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        x = vinfo['t']
                        y = vinfo['values']

                        # reject NaNs
                        nan_ind = ~np.isnan(y)
                        x_nonan = x[nan_ind]
                        y_nonan = y[nan_ind]

                        # reject fill values
                        fv_ind = y_nonan != vinfo['fv'][0]
                        x_nonan_nofv = x_nonan[fv_ind]
                        y_nonan_nofv = y_nonan[fv_ind]

                        # reject extreme values
                        Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                        y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                        x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                        # reject values outside global ranges:
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        print('global ranges: ', global_min, global_max)
                        if global_min is not None and global_max is not None:
                            gr_ind = cf.reject_global_ranges(
                                y_nonan_nofv_nE, global_min, global_max)
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                        else:
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                        # check array length
                        if len(y_nonan_nofv_nE_nogr) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                            # group data by year
                            groups, g_data = gt.group_by_time_range(
                                x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr,
                                'A')

                            # create bins
                            # groups_min = min(groups.describe()['DO']['min'])
                            # lower_bound = int(round(groups_min))
                            # groups_max = max(groups.describe()['DO']['max'])
                            # if groups_max < 1:
                            #     upper_bound = 1
                            #     step_bound = 1
                            # else:
                            #     upper_bound = int(round(groups_max + (groups_max / 50)))
                            #     step_bound = int(round((groups_max - groups_min) / 10))
                            #
                            # if step_bound == 0:
                            #     step_bound += 1
                            #
                            # if (upper_bound - lower_bound) == step_bound:
                            #     lower_bound -= 1
                            #     upper_bound += 1
                            # if (upper_bound - lower_bound) < step_bound:
                            #     print('<')
                            #     step_bound = int(round(step_bound / 10))
                            # print(lower_bound, upper_bound, step_bound)
                            # bin_range = list(range(lower_bound, upper_bound, step_bound))
                            # print(bin_range)

                            # preparing color palette
                            colors = color_names[:len(groups)]

                            # colors = [color['color'] for color in
                            #           list(pyplot.rcParams['axes.prop_cycle'][:len(groups)])]

                            fig0, ax0 = pyplot.subplots(nrows=2, ncols=1)

                            # subplot for  histogram and basic statistics table
                            ax0[1].axis('off')
                            ax0[1].axis('tight')
                            the_table = ax0[1].table(
                                cellText=groups.describe().round(2).values,
                                rowLabels=groups.describe().index.year,
                                rowColours=colors,
                                colLabels=groups.describe().columns.levels[1],
                                loc='center')
                            the_table.set_fontsize(5)

                            # subplot for data
                            fig, ax = pyplot.subplots(nrows=len(groups),
                                                      ncols=1,
                                                      sharey=True)
                            if len(groups) == 1:
                                ax = [ax]
                            t = 1
                            for ny in range(len(groups)):
                                # prepare data for plotting
                                y_data = g_data[ny + (t + 1)].dropna(axis=0)
                                x_time = g_data[ny + t].dropna(axis=0)
                                t += 1
                                if len(y_data) != 0 and len(x_time) != 0:
                                    n_year = x_time[0].year

                                    col_name = str(n_year)

                                    serie_n = pd.DataFrame(columns=[col_name],
                                                           index=x_time)
                                    serie_n[col_name] = list(y_data[:])

                                    # plot histogram
                                    # serie_n.plot.hist(ax=ax0[0], bins=bin_range,
                                    #                   histtype='bar', color=colors[ny], stacked=True)

                                    if len(serie_n) != 1:
                                        serie_n.plot.kde(ax=ax0[0],
                                                         color=colors[ny])
                                        ax0[0].legend(fontsize=8,
                                                      bbox_to_anchor=(0., 1.12, 1., .102),
                                                      loc=3,
                                                      ncol=len(groups),
                                                      mode="expand",
                                                      borderaxespad=0.)

                                        # ax0[0].set_xticks(bin_range)
                                        ax0[0].set_xlabel('Observation Ranges',
                                                          fontsize=8)
                                        ax0[0].set_ylabel('Density', fontsize=8)  # alternatively 'Number of Observations'
                                        ax0[0].set_title(
                                            ms.split('-')[0] + ' (' + sv +
                                            ', ' + sv_units + ')' +
                                            '  Kernel Density Estimates',
                                            fontsize=8)

                                        # plot data
                                        serie_n.plot(ax=ax[ny],
                                                     linestyle='None',
                                                     marker='.',
                                                     markersize=0.5,
                                                     color=colors[ny])
                                        ax[ny].legend().set_visible(False)

                                        # plot Mean and Standard deviation
                                        ma = serie_n.rolling('86400s').mean()
                                        mstd = serie_n.rolling('86400s').std()

                                        ax[ny].plot(ma.index,
                                                    ma[col_name].values,
                                                    'k',
                                                    linewidth=0.15)
                                        ax[ny].fill_between(
                                            mstd.index,
                                            ma[col_name].values -
                                            2 * mstd[col_name].values,
                                            ma[col_name].values +
                                            2 * mstd[col_name].values,
                                            color='b',
                                            alpha=0.2)

                                        # prepare the time axis parameters
                                        datemin = datetime.date(n_year, 1, 1)
                                        datemax = datetime.date(n_year, 12, 31)
                                        ax[ny].set_xlim(datemin, datemax)
                                        xlocator = mdates.MonthLocator()  # every month
                                        myFmt = mdates.DateFormatter('%m')
                                        ax[ny].xaxis.set_minor_locator(xlocator)
                                        ax[ny].xaxis.set_major_formatter(myFmt)

                                        # prepare the y axis parameters
                                        # ax[ny].set_yticks(bin_range)
                                        ylocator = MaxNLocator(prune='both', nbins=3)
                                        ax[ny].yaxis.set_major_locator(ylocator)

                                        # format figure
                                        ax[ny].tick_params(axis='both',
                                                           color='r',
                                                           labelsize=7,
                                                           labelcolor='m')

                                        if ny < len(groups) - 1:
                                            ax[ny].tick_params(
                                                which='both',
                                                pad=0.1,
                                                length=1,
                                                labelbottom=False)
                                            ax[ny].set_xlabel(' ')
                                        else:
                                            ax[ny].tick_params(which='both',
                                                               color='r',
                                                               labelsize=7,
                                                               labelcolor='m',
                                                               pad=0.1,
                                                               length=1,
                                                               rotation=0)
                                            ax[ny].set_xlabel('Months',
                                                              rotation=0,
                                                              fontsize=8,
                                                              color='b')

                                        ax[ny].set_ylabel(n_year,
                                                          rotation=0,
                                                          fontsize=8,
                                                          color='b',
                                                          labelpad=20)
                                        ax[ny].yaxis.set_label_position(
                                            "right")

                                        if ny == 0:
                                            if global_min and global_max:
                                                ax[ny].set_title(
                                                    sv + ' (' + sv_units + ') -- Global Range: [' +
                                                    str(int(global_min)) + ', ' + str(int(global_max)) + '] \n'
                                                    'Plotted: Data, Mean and 2STD (Method: One day rolling window calculations) \n',
                                                    fontsize=8)
                                            else:
                                                ax[ny].set_title(
                                                    sv + ' (' + sv_units + ') -- Global Range: [] \n'
                                                    'Plotted: Data, Mean and 2STD (Method: One day rolling window calculations) \n',
                                                    fontsize=8)

                                        # plot global ranges
                                        # ax[ny].axhline(y=global_min, color='r', linestyle='--', linewidth=.6)
                                        # ax[ny].axhline(y=global_max, color='r', linestyle='--', linewidth=.6)

                                        # mark deployment end times on figure
                                        ymin, ymax = ax[ny].get_ylim()
                                        #dep = 1
                                        for etimes in range(len(end_times)):
                                            if end_times[etimes].year == n_year:
                                                ax[ny].axvline(x=end_times[etimes],
                                                               color='b',
                                                               linestyle='--',
                                                               linewidth=.6)
                                                ax[ny].text(end_times[etimes],
                                                            ymin,
                                                            'End' + str(deployments_num[etimes]),
                                                            fontsize=6,
                                                            style='italic',
                                                            bbox=dict(boxstyle='round',
                                                                      ec=(0., 0.5, 0.5),
                                                                      fc=(1., 1., 1.)))
                                        #    dep += 1

                                        # ax[ny].set_ylim(5, 12)

                                    # save figure to a file
                                    sfile = '_'.join(('all', sname))
                                    save_file = os.path.join(save_dir, sfile)
                                    fig.savefig(str(save_file), dpi=150)

                                    sfile = '_'.join(('Statistics', sname))
                                    save_file = os.path.join(save_dir, sfile)
                                    fig0.savefig(str(save_file), dpi=150)

                                    pyplot.close()
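
The per-year panels above plot the data together with a one-day rolling mean and a ±2 standard deviation envelope. A stripped-down, runnable version of just that piece on synthetic data:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

times = pd.date_range('2018-01-01', periods=24 * 60, freq='h')
values = np.sin(np.linspace(0, 20, len(times))) + np.random.default_rng(2).normal(0, 0.3, len(times))
serie_n = pd.DataFrame({'2018': values}, index=times)

ma = serie_n.rolling('86400s').mean()    # one-day rolling mean
mstd = serie_n.rolling('86400s').std()   # one-day rolling standard deviation

fig, ax = plt.subplots()
serie_n.plot(ax=ax, linestyle='None', marker='.', markersize=0.5, color='g')
ax.plot(ma.index, ma['2018'].values, 'k', linewidth=0.5)
ax.fill_between(mstd.index,
                ma['2018'].values - 2 * mstd['2018'].values,
                ma['2018'].values + 2 * mstd['2018'].values,
                color='b', alpha=0.2)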
Example #6
def plot_xsection(subsite,
                  x,
                  y,
                  z,
                  clabel,
                  ylabel,
                  t_eng=None,
                  m_water_depth=None,
                  inpercentile=None,
                  stdev=None):
    """
    Create a cross-section plot for mobile instruments
    :param subsite: subsite part of reference designator to plot
    :param x:  array containing data for x-axis (e.g. time)
    :param y: .nc data array containing data for plotting on the y-axis (e.g. pressure)
    :param z: .nc data array containing data for plotting variable of interest (e.g. density)
    :param clabel: label for the colorbar
    :param ylabel: label for the y-axis
    :param t_eng: .nc data array containing engineering timestamps (to plot water depth)
    :param m_water_depth: .nc data array containing water depth data from the engineering data stream
    :param inpercentile: percentile of data to exclude from plot
    :param stdev: number of standard deviations to use for excluding outliers (None = no outlier rejection)
    """
    if type(z) is not np.ndarray:
        z = z.values

    if type(y) is not np.ndarray:
        y = y.values

    if type(x) is not np.ndarray:
        x = x.values

    # when plotting gliders, remove zeros (glider fill values) and negative numbers
    if 'MOAS' in subsite:
        z[z <= 0.0] = np.nan
        zeros = str(len(z) - np.count_nonzero(~np.isnan(z)))

    if stdev is None:
        xD = x
        yD = y
        zD = z
    else:
        ind = cf.reject_extreme_values(z)
        xdata = x[ind]
        ydata = y[ind]
        zdata = z[ind]

        ind2 = cf.reject_outliers(zdata, stdev)
        xD = xdata[ind2]
        yD = ydata[ind2]
        zD = zdata[ind2]
        outliers = str(len(zdata) - len(zD))

    try:
        zeros
    except NameError:
        zeros = None

    try:
        outliers
    except NameError:
        outliers = None

    fig, ax = plt.subplots()
    plt.margins(y=.08, x=.02)
    try:
        xc = ax.scatter(xD, yD, c=zD, s=2, edgecolor='None')
        #plt.ylim([0, 100])
        ax.invert_yaxis()

        # add bathymetry for coastal gliders
        if t_eng is not None and m_water_depth is not None:
            if len(t_eng) > 1:
                ax.fill_between(t_eng,
                                m_water_depth,
                                np.max(m_water_depth) + 2,
                                facecolor='k',
                                alpha=0.4)

        # add color bar
        #ticks = np.linspace(np.nanmin(zD), np.nanmax(zD), 5).tolist()
        bar = fig.colorbar(xc, ax=ax, label=clabel, extend='both')
        bar.formatter.set_useOffset(False)
        bar.ax.tick_params(labelsize=8)

        if inpercentile is not None:
            upper_lim = np.percentile(zD, 100 - inpercentile)
            # upper_mid = np.percentile(zD, 100 - 15*inpercentile)
            # lower_mid = np.percentile(zD, 100 - 10*inpercentile)
            lower_lim = np.percentile(zD, inpercentile)
            bar.set_clim(lower_lim, upper_lim)
            bar.set_ticks([lower_lim, upper_lim],
                          update_ticks=True)  #lower_mid, upper_mid,

        ax.set_ylabel(ylabel, fontsize=9)
        format_date_axis(ax, fig)

        if zeros is None and type(outliers) is str:
            leg = ('rm: {} outliers (SD={})'.format(outliers, stdev), )
            ax.legend(leg, loc=1, fontsize=6)
        if type(zeros) is str and outliers is None:
            leg = ('rm: {} values <=0.0'.format(zeros), )
            ax.legend(leg, loc=1, fontsize=6)
        if type(zeros) is str and type(outliers) is str:
            leg = ('rm: {} values <=0.0, rm: {} outliers (SD={})'.format(
                zeros, outliers, stdev), )
            ax.legend(leg, loc=1, fontsize=6)
    except ValueError:
        print("plot can't be generated")
        fig = None
        ax = None
        bar = None

    return fig, ax, bar
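
The percentile-based color limits are the core of plot_xsection. The standalone sketch below reproduces that idea on synthetic data; note that recent matplotlib releases removed Colorbar.set_clim, so here the limits are set on the scatter mappable instead (the call style above targets an older matplotlib).

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(3)
x = np.arange(500)
y = rng.uniform(0, 100, 500)          # e.g. pressure
z = rng.normal(25, 3, 500)            # synthetic variable of interest
z[::50] = 80                          # a few artificial spikes

fig, ax = plt.subplots()
sc = ax.scatter(x, y, c=z, s=2, edgecolor='None')
ax.invert_yaxis()
bar = fig.colorbar(sc, ax=ax, label='variable (units)', extend='both')

inpercentile = 5
lower_lim = np.percentile(z, inpercentile)
upper_lim = np.percentile(z, 100 - inpercentile)
sc.set_clim(lower_lim, upper_lim)     # clip the color scale to the 5th-95th percentile
bar.set_ticks([lower_lim, upper_lim])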
Example #7
def plot_timeseries_compare(t0, t1, var0, var1, m0, m1, long_name, stdev=None):
    """
    Create a timeseries plot containing two datasets
    :param t0: data array of time for dataset 0
    :param t1: data array of time for dataset 1
    :param var0: .nc data array for plotting on the y-axis for dataset 0, including data values and variable attributes
    :param var1: .nc data array for plotting on the y-axis for dataset 1, including data values and variable attributes
    :param stdev: desired standard deviation to exclude from plotting
    """
    if stdev is None:
        t0_data = t0.values
        var0_data = var0.values
        leg_text = ('{}'.format(m0), )
        t1_data = t1.values
        var1_data = var1.values
        leg_text += ('{}'.format(m1), )
    else:
        ind0 = cf.reject_extreme_values(var0.values)
        t0i = t0[ind0]
        var0i = var0[ind0]

        ind02 = cf.reject_outliers(var0i.values, stdev)
        t0_data = t0i[ind02].values
        var0_data = var0i[ind02].values
        #var0_data[var0_data <= 0.0] = np.nan  # get rid of zeros and negative numbers
        outliers0 = str((len(var0) - len(var0_data)) +
                        (len(t0_data) -
                         np.count_nonzero(~np.isnan(var0_data))))
        leg_text = ('{}: removed {} outliers (SD={})'.format(
            m0, outliers0, stdev), )

        ind1 = cf.reject_extreme_values(var1.values)
        t1i = t1[ind1]
        var1i = var1[ind1]

        ind12 = cf.reject_outliers(var1i.values, stdev)
        t1_data = t1i[ind12].values
        var1_data = var1i[ind12].values
        #var1_data[var1_data <= 0.0] = np.nan  # get rid of zeros and negative numbers
        outliers1 = str((len(var1) - len(var1_data)) +
                        (len(t1_data) -
                         np.count_nonzero(~np.isnan(var1_data))))
        leg_text += ('{}: removed {} outliers (SD={})'.format(
            m1, outliers1, stdev), )

    y_units = get_units(var0)

    fig, ax = plt.subplots()
    plt.grid()
    #plt.ylim([2000, 2500])

    ax.plot(t0_data,
            var0_data,
            'o',
            markerfacecolor='none',
            markeredgecolor='r',
            markersize=5,
            lw=.75)
    #ax.plot(t1_data, var1_data, 'x', markeredgecolor='b', markersize=5, lw=.75)
    ax.plot(t1_data, var1_data, '.', markeredgecolor='b', markersize=2)
    ax.set_ylabel((long_name + " (" + y_units + ")"), fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax
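
A stripped-down, standalone version of the overlay produced by plot_timeseries_compare (hollow red circles for one delivery method over small blue dots for the other), using synthetic arrays in place of the two .nc data arrays and made-up method labels:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

t = pd.date_range('2018-01-01', periods=200, freq='6h')
rng = np.random.default_rng(4)
var0 = 10 + rng.normal(0, 0.5, len(t))   # e.g. telemetered values (synthetic)
var1 = 10 + rng.normal(0, 0.5, len(t))   # e.g. recovered_host values (synthetic)

fig, ax = plt.subplots()
plt.grid()
ax.plot(t, var0, 'o', markerfacecolor='none', markeredgecolor='r', markersize=5, lw=.75)
ax.plot(t, var1, '.', markeredgecolor='b', markersize=2)
ax.set_ylabel('long_name (units)', fontsize=9)
ax.legend(('telemetered', 'recovered_host'), loc='best', fontsize=6)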
Example #8
def main(sDir, url_list, start_time, end_time):
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = []
        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)
        for index, row in ps_df.iterrows():
            for ii in range(n_streams):
                try:
                    rms = '-'.join((r, row[ii]))
                except TypeError:
                    continue
                for dd in datasets:
                    spl = dd.split('/')[-2].split('-')
                    catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                    fdeploy = dd.split('/')[-1].split('_')[0]
                    if rms == catalog_rms and fdeploy == row['deployment']:
                        fdatasets.append(dd)

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        # get science variable long names from the Data Review Database
        #stream_sci_vars = cd.sci_var_long_names(r)
        if 'SPKIR' in r or 'PRESF' in r:  # only get the main science variable for SPKIR and PRESF
            stream_vars = cd.sci_var_long_names(r)
        else:
            stream_vars = var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict = cd.sci_var_long_names_check(stream_vars)

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        et = []
        sci_vars_dict, __, __ = cd.append_science_data(ps_df, n_streams, r, fdatasets_sel, sci_vars_dict, et, start_time, end_time)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        dend_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            dend_times.append(pd.to_datetime(deploy_info['stop_date']))

        subsite = r.split('-')[0]
        array = subsite[0:2]
        save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_plots_preferred_all')
        cf.create_dir(save_dir)

        print('\nPlotting data')
        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print(sv)
                if 'SPKIR' in r:
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) == 1:
                        fill_value = fv_lst[0]
                    else:
                        print(fv_lst)
                        print('No unique fill value for {}'.format(sv))

                    sv_units = np.unique(vinfo['units']).tolist()

                    t = vinfo['t']
                    if len(t) > 1:
                        data = vinfo['values']
                        [dd_data, g_min, g_max] = index_dataset_2d(r, 'spkir_abj_cspp_downwelling_vector', data, fill_value)
                        t0 = pd.to_datetime(min(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        deploy_final = vinfo['deployments']
                        deploy = list(np.unique(deploy_final))
                        deployments = [int(dd) for dd in deploy]

                        sname = '-'.join((r, sv))
                        fig, ax = pf.plot_spkir(t, dd_data, sv, sv_units[0])
                        ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1 + '\n'
                                      + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max)), fontsize=8)
                        for etimes in dend_times:
                            ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                        pf.save_fig(save_dir, sname)

                        # plot each wavelength
                        wavelengths = ['412nm', '443nm', '490nm', '510nm', '555nm', '620nm', '683nm']
                        for wvi in range(len(dd_data)):
                            fig, ax = pf.plot_spkir_wv(t, dd_data[wvi], sv, sv_units[0], wvi)
                            ax.set_title(
                                (r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1 + '\n'
                                 + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max)), fontsize=8)
                            for etimes in dend_times:
                                ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                            snamewvi = '-'.join((sname, wavelengths[wvi]))
                            pf.save_fig(save_dir, snamewvi)

                elif 'presf_abc_wave_burst' in m:
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) == 1:
                        fill_value = fv_lst[0]
                    else:
                        print(fv_lst)
                        print('No unique fill value for {}'.format(sv))

                    sv_units = np.unique(vinfo['units']).tolist()

                    t = vinfo['t']
                    if len(t) > 1:
                        data = vinfo['values']
                        [dd_data, g_min, g_max] = index_dataset_2d(r, 'presf_wave_burst_pressure', data, fill_value)
                        t0 = pd.to_datetime(min(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        deploy_final = vinfo['deployments']
                        deploy = list(np.unique(deploy_final))
                        deployments = [int(dd) for dd in deploy]

                        sname = '-'.join((r, sv))
                        fig, ax = pf.plot_presf_2d(t, dd_data, sv, sv_units[0])
                        ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1 + '\n'
                                      + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max)), fontsize=8)
                        for etimes in dend_times:
                            ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                        pf.save_fig(save_dir, sname)

                else:
                    if type(vinfo['values']) != dict:  # if the variable is not a 2D array
                        if 'Spectra' not in sv:
                            if len(vinfo['t']) < 1:
                                print('no variable data to plot')
                            else:
                                sv_units = vinfo['units'][0]
                                sv_name = vinfo['var_name']
                                t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                                t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                                x = vinfo['t']
                                y = vinfo['values']

                                # reject NaNs and values of 0.0
                                nan_ind = (~np.isnan(y)) & (y != 0.0)
                                x_nonan = x[nan_ind]
                                y_nonan = y[nan_ind]

                                # reject fill values
                                fv_ind = y_nonan != vinfo['fv'][0]
                                x_nonan_nofv = x_nonan[fv_ind]
                                y_nonan_nofv = y_nonan[fv_ind]

                                # reject extreme values
                                Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                                y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                                x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                                # reject values outside global ranges:
                                global_min, global_max = cf.get_global_ranges(r, sv_name)
                                if any(e is None for e in [global_min, global_max]):
                                    y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                                    x_nonan_nofv_nE_nogr = x_nonan_nofv_nE
                                else:
                                    gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                                    y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                                    x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]

                                if len(y_nonan_nofv) > 0:
                                    if m == 'common_stream_placeholder':
                                        sname = '-'.join((r, sv))
                                    else:
                                        sname = '-'.join((r, m, sv))

                                    plt_deploy = [int(x) for x in list(np.unique(vinfo['deployments']))]

                                    # plot hourly averages for cabled and FDCHP data
                                    if 'streamed' in sci_vars_dict[list(sci_vars_dict.keys())[0]]['ms'][0] or 'FDCHP' in r:
                                        sname = '-'.join((sname, 'hourlyavg'))
                                        df = pd.DataFrame({'dfx': x_nonan_nofv_nE_nogr, 'dfy': y_nonan_nofv_nE_nogr})
                                        dfr = df.resample('H', on='dfx').mean()

                                        # Plot all data
                                        fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv, sv_units, stdev=None)
                                        ax.set_title((r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 + ' - ' + t1),
                                                     fontsize=8)

                                        # if plotting a specific time range, plot deployment lines only for those deployments
                                        if type(start_time) == dt.datetime:
                                            for e in list(np.unique(vinfo['deployments'])):
                                                etime = dend_times[int(e) - 1]
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        else:
                                            for etime in dend_times:
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        pf.save_fig(save_dir, sname)
                                    else:
                                        # Plot all data
                                        fig, ax = pf.plot_timeseries_all(x_nonan_nofv, y_nonan_nofv, sv, sv_units, stdev=None)
                                        ax.set_title((r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 + ' - ' + t1),
                                                     fontsize=8)

                                        # if plotting a specific time range, plot deployment lines only for those deployments
                                        if type(start_time) == dt.datetime:
                                            # for e in list(np.unique(vinfo['deployments'])):
                                            #     etime = dend_times[int(e) - 1]
                                            #     ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                            etime = dend_times[int(list(np.unique(vinfo['deployments']))[0]) - 1]
                                            ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        else:
                                            for etime in dend_times:
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        # if not any(e is None for e in [global_min, global_max]):
                                        #     ax.axhline(y=global_min, color='r', linestyle='--', linewidth=.6)
                                        #     ax.axhline(y=global_max, color='r', linestyle='--', linewidth=.6)
                                        # else:
                                        #     maxpoint = x[np.argmax(y_nonan_nofv)], max(y_nonan_nofv)
                                        #     ax.annotate('No Global Ranges', size=8,
                                        #                 xy=maxpoint, xytext=(5, 5), textcoords='offset points')
                                        pf.save_fig(save_dir, sname)

                                        # Plot data with outliers removed
                                        fig, ax = pf.plot_timeseries_all(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, sv, sv_units,
                                                                         stdev=5)
                                        ax.set_title((r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 + ' - ' + t1),
                                                     fontsize=8)

                                        # if plotting a specific time range, plot deployment lines only for those deployments
                                        if type(start_time) == dt.datetime:
                                            # for e in list(np.unique(vinfo['deployments'])):
                                            #     etime = dend_times[int(e) - 1]
                                            #     ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                            etime = dend_times[int(list(np.unique(vinfo['deployments']))[0]) - 1]
                                            ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        else:
                                            for etime in dend_times:
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        # if not any(e is None for e in [global_min, global_max]):
                                        #     ax.axhline(y=global_min, color='r', linestyle='--', linewidth=.6)
                                        #     ax.axhline(y=global_max, color='r', linestyle='--', linewidth=.6)
                                        # else:
                                        #     maxpoint = x[np.argmax(y_nonan_nofv_nE_nogr)], max(y_nonan_nofv_nE_nogr)
                                        #     ax.annotate('No Global Ranges', size=8,
                                        #                 xy=maxpoint, xytext=(5, 5), textcoords='offset points')

                                        sfile = '_'.join((sname, 'rmoutliers'))
                                        pf.save_fig(save_dir, sfile)
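
The cabled/FDCHP branch above averages the cleaned data into hourly bins before plotting. A minimal sketch of that resampling step with synthetic inputs (newer pandas versions prefer the lowercase 'h' alias over 'H'):

import numpy as np
import pandas as pd

# synthetic cleaned time stamps and values, standing in for
# x_nonan_nofv_nE_nogr and y_nonan_nofv_nE_nogr above
x_clean = pd.date_range('2018-01-01', periods=600, freq='5min')
y_clean = np.random.default_rng(5).normal(15, 1, len(x_clean))

df = pd.DataFrame({'dfx': x_clean, 'dfy': y_clean})
dfr = df.resample('H', on='dfx').mean()  # hourly averages; the index becomes the hour
print(dfr.head())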
def main(sDir, ncdir):
    rd_list = [ncdir.split('/')[-2]]

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # filter datasets
        fdatasets = []
        for root, dirs, files in os.walk(ncdir):
            for f in files:
                if f.endswith('.nc'):
                    fdatasets.append(f)
        # for u in url_list:
        #     splitter = u.split('/')[-2].split('-')
        #     rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
        #     if rd_check == r:
        #         udatasets = cf.get_nc_urls([u])
        #         datasets.append(udatasets)
        # datasets = list(itertools.chain(*datasets))
        # main_sensor = r.split('-')[-1]
        # fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        methodstream = []
        for f in fdatasets:
            strm = '_'.join((f.split('-')[-2].split('_')[0], f.split('-')[-2].split('_')[1]))
            methodstream.append('-'.join((f.split('-')[-3], strm)))

        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]
            save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_plots_all')
            cf.create_dir(save_dir)

            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)
            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(os.path.join(ncdir, fd), mask_and_scale=False)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        tD = ds['time'].values
                        varD = ds[var].values
                        sh['t'] = np.append(sh['t'], tD)
                        sh['values'] = np.append(sh['values'], varD)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        x = vinfo['t']
                        y = vinfo['values']

                        # reject NaNs
                        nan_ind = ~np.isnan(y)
                        x_nonan = x[nan_ind]
                        y_nonan = y[nan_ind]

                        # reject fill values
                        fv_ind = y_nonan != vinfo['fv'][0]
                        x_nonan_nofv = x_nonan[fv_ind]
                        y_nonan_nofv = y_nonan[fv_ind]

                        # reject extreme values
                        Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                        y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                        x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                        # reject values outside global ranges:
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        if global_min is not None and global_max is not None:
                            gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                        else:
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                        title = ' '.join((r, ms.split('-')[0]))

                        if len(y_nonan_nofv) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                            # Plot all data
                            fig, ax = pf.plot_timeseries_all(x_nonan_nofv, y_nonan_nofv, sv, sv_units, stdev=None)
                            ax.set_title((title + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                         fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes,  color='b', linestyle='--', linewidth=.6)

                            # if global_min is not None and global_max is not None:
                            #     ax.axhline(y=global_min, color='r', linestyle='--', linewidth=.6)
                            #     ax.axhline(y=global_max, color='r', linestyle='--', linewidth=.6)

                            pf.save_fig(save_dir, sname)

                            # Plot data with extreme values, data outside global ranges and outliers removed
                            fig, ax = pf.plot_timeseries_all(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, sv, sv_units, stdev=5)
                            ax.set_title((title + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                         fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes,  color='b', linestyle='--', linewidth=.6)

                            # if global_min is not None and global_max is not None:
                            #     ax.axhline(y=global_min, color='r', linestyle='--', linewidth=.6)
                            #     ax.axhline(y=global_max, color='r', linestyle='--', linewidth=.6)

                            sfile = '_'.join((sname, 'rmoutliers'))
                            pf.save_fig(save_dir, sfile)
Example #10
0
def main(sDir, url_list):
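    """
    For each reference designator in url_list, group each science variable by year and month,
    plot monthly panels with a 1-day rolling mean/std, and save yearly figures plus a
    KDE/statistics summary figure to sDir.
    """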
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # filter datasets
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        methodstream = []
        for f in fdatasets:
            methodstream.append('-'.join((f.split('/')[-2].split('-')[-2], f.split('/')[-2].split('-')[-1])))

        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]

            check_ms = ms.split('-')[1]
            if 'recovered' in check_ms:
                check_ms = check_ms.split('_recovered')[0]

            save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_monthly_plot',
                                    check_ms, ms.split('-')[0])
            cf.create_dir(save_dir)

            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)
            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        tD = ds['time'].values
                        varD = ds[var].values
                        sh['t'] = np.append(sh['t'], tD)
                        sh['values'] = np.append(sh['values'], varD)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        x = vinfo['t']
                        y = vinfo['values']

                        # reject NaNs
                        nan_ind = ~np.isnan(y)
                        x_nonan = x[nan_ind]
                        y_nonan = y[nan_ind]

                        # reject fill values
                        fv_ind = y_nonan != vinfo['fv'][0]
                        x_nonan_nofv = x_nonan[fv_ind]
                        y_nonan_nofv = y_nonan[fv_ind]

                        # reject extreme values
                        Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                        y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                        x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                        # reject values outside global ranges (skip when no global ranges are defined):
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        if global_min is not None and global_max is not None:
                            gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                        else:
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                        title = ' '.join((r, ms.split('-')[0]))

                        if len(y_nonan_nofv) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                            # 1st group by year
                            ygroups, gy_data = gt.group_by_timerange(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, 'A')

                            tn = 1
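                            # gy_data appears to hold interleaved columns, one (time, value) pair per
                            # yearly group; the n + tn / n + (tn + 1) indices below step through those pairs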
                            for n in range(len(ygroups)):
                                x_time = gy_data[n+tn].dropna(axis=0)
                                y_data = gy_data[n+(tn+1)].dropna(axis=0)
                                y_data = y_data.astype(float)
                                # 2nd group by month
                                mgroups, gm_data = gt.group_by_timerange(x_time.values, y_data.values, 'M')

                                x_year = x_time[0].year
                                print(x_year)
                                # create bins for the histogram x-range: use the global ranges when they
                                # are defined, otherwise fall back to the monthly group statistics
                                # (the fallback assumes the grouping helper labels the value column 'DO')
                                if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                                    lower_bound = int(round(global_min))
                                    upper_bound = int(round(global_max + (global_max / 50)))
                                    step_bound = int(round((global_max - global_min) / 10))
                                else:
                                    mgroups_min = min(mgroups.describe()['DO']['min'])
                                    mgroups_max = max(mgroups.describe()['DO']['max'])
                                    lower_bound = int(round(mgroups_min))
                                    upper_bound = int(round(mgroups_max + (mgroups_max / 50)))
                                    step_bound = int(round((mgroups_max - mgroups_min) / 10))

                                if step_bound == 0:
                                    step_bound += 1

                                if (upper_bound - lower_bound) == step_bound:
                                    lower_bound -= 1
                                    upper_bound += 1
                                if (upper_bound - lower_bound) < step_bound:
                                    step_bound = int(round(step_bound / 10))

                                bin_range = list(range(lower_bound, upper_bound, step_bound))
                                print(bin_range)
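                                # bin_range is currently only referenced by the commented-out histogram and
                                # set_xticks calls below; the KDE panels do not need explicit bins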

                                # create color palette

                                colors = color_names[:len(mgroups)]
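                                # color_names is assumed to be a module-level list of matplotlib color names
                                # defined elsewhere in this script (one color per monthly group)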
                                print('1--- ', len(colors))
                                print(colors)


                                fig0, ax0 = pyplot.subplots(nrows=2, ncols=1)

                                # # subplot for  histogram and basic statistics table
                                ax0[0].axis('off')
                                ax0[0].axis('tight')

                                the_table = ax0[0].table(cellText=mgroups.describe().round(2).values,
                                                         rowLabels=mgroups.describe().index.month,
                                                         rowColours=colors,
                                                         colLabels=mgroups.describe().columns.levels[1], loc='center')
                                the_table.set_fontsize(5)

                                fig, ax = pyplot.subplots(nrows=12, ncols=1, sharey=True)

                                for kk in list(range(0, 12)):
                                    ax[kk].tick_params(axis='both', which='both', color='r', labelsize=7,
                                                       labelcolor='m', rotation=0, pad=0.1, length=1)
                                    month_name = calendar.month_abbr[kk + 1]
                                    ax[kk].set_ylabel(month_name, rotation=0, fontsize=8, color='b', labelpad=20)
                                    if kk == 0:
                                        if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                                            gr_text = ' Global Range: [' + str(int(global_min)) + ',' + str(int(global_max)) + ']'
                                        else:
                                            gr_text = ' Global Range: not defined'
                                        ax[kk].set_title(str(x_year) + '\n ' + sv + " (" + sv_units + ")" + gr_text +
                                                         '\n End of deployments are marked with a vertical line \n ' +
                                                         'Plotted: Data, Mean and STD (Method: 1 day' +
                                                         ' rolling window calculations)',
                                                         fontsize=8)

                                    if kk < 11:
                                        ax[kk].tick_params(labelbottom=False)
                                    if kk == 11:
                                        ax[kk].set_xlabel('Days', rotation=0, fontsize=8, color='b')

                                tm = 1
                                for mt in range(len(mgroups)):
                                    x_time = gm_data[mt+tm].dropna(axis=0)
                                    y_data = gm_data[mt+(tm+1)].dropna(axis=0)

                                    if len(x_time) == 0:
                                        # ax[plt_index].tick_params(which='both', labelbottom=False, labelleft=False,
                                        #                    pad=0.1, length=1)
                                        tm += 1  # keep the gm_data column index aligned when a month has no data
                                        continue

                                    x_month = x_time[0].month
                                    col_name = str(x_month)

                                    series_m = pd.DataFrame(columns=[col_name], index=x_time)
                                    series_m[col_name] = list(y_data[:])


                                    # serie_n.plot.hist(ax=ax0[0], bins=bin_range,
                                    #                   histtype='bar', color=colors[ny], stacked=True)
                                    series_m.plot.kde(ax=ax0[0], color=colors[mt])
                                    ax0[0].legend(fontsize=8, bbox_to_anchor=(0., 1.12, 1., .102), loc=3,
                                                  ncol=len(mgroups), mode="expand", borderaxespad=0.)

                                    # ax0[0].set_xticks(bin_range)
                                    ax0[0].set_xlabel('Observation Ranges' + ' (' + sv + ', ' + sv_units + ')', fontsize=8)
                                    ax0[0].set_ylabel('Density', fontsize=8)  # 'Number of Observations'
                                    ax0[0].set_title('Kernel Density Estimates', fontsize=8)
                                    ax0[0].tick_params(which='both', labelsize=7, pad=0.1, length=1, rotation=0)

                                    plt_index = x_month - 1

                                    # Plot data
                                    series_m.plot(ax=ax[plt_index], linestyle='None', marker='.', markersize=1)
                                    ax[plt_index].legend().set_visible(False)

                                    ma = series_m.rolling('86400s').mean()
                                    mstd = series_m.rolling('86400s').std()
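                                    # 1-day ('86400s') rolling mean and std; the shaded band below is mean +/- 3*std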

                                    ax[plt_index].plot(ma.index, ma[col_name].values, 'b')
                                    ax[plt_index].fill_between(mstd.index, ma[col_name].values-3*mstd[col_name].values,
                                                               ma[col_name].values+3*mstd[col_name].values,
                                                               color='b', alpha=0.2)

                                    # prepare the time axis parameters
                                    mm, nod = monthrange(x_year, x_month)
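                                    # calendar.monthrange returns (weekday of the 1st, number of days in the month)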
                                    datemin = datetime.date(x_year, x_month, 1)
                                    datemax = datetime.date(x_year, x_month, nod)
                                    ax[plt_index].set_xlim(datemin, datemax)
                                    xlocator = mdates.DayLocator()  # every day
                                    myFmt = mdates.DateFormatter('%d')
                                    ax[plt_index].xaxis.set_major_locator(xlocator)
                                    ax[plt_index].xaxis.set_major_formatter(myFmt)
                                    ax[plt_index].xaxis.set_minor_locator(pyplot.NullLocator())
                                    ax[plt_index].xaxis.set_minor_formatter(pyplot.NullFormatter())

                                    # data_min = min(ma.DO_n.dropna(axis=0) - 5 * mstd.DO_n.dropna(axis=0))
                                    # 0data_max = max(ma.DO_n.dropna(axis=0) + 5 * mstd.DO_n.dropna(axis=0))
                                    # ax[plt_index].set_ylim([data_min, data_max])

                                    ylocator = MaxNLocator(prune='both', nbins=3)
                                    ax[plt_index].yaxis.set_major_locator(ylocator)


                                    if x_month != 12:
                                        ax[plt_index].tick_params(which='both', labelbottom=False, pad=0.1, length=1)
                                        ax[plt_index].set_xlabel(' ')
                                    else:
                                        ax[plt_index].tick_params(which='both', color='r', labelsize=7, labelcolor='m',
                                                           pad=0.1, length=1, rotation=0)
                                        ax[plt_index].set_xlabel('Days', rotation=0, fontsize=8, color='b')

                                    dep = 1
                                    for etimes in end_times:
                                        ax[plt_index].axvline(x=etimes, color='b', linestyle='--', linewidth=.8)
                                        if ma[col_name].values.any():
                                            ax[plt_index].text(etimes, max(ma[col_name].dropna(axis=0)), 'End' + str(dep),
                                                        fontsize=6, style='italic',
                                                        bbox=dict(boxstyle='round',
                                                                  ec=(0., 0.5, 0.5),
                                                                  fc=(1., 1., 1.),
                                                                  ))
                                        else:
                                            ax[plt_index].text(etimes, min(series_m[col_name]), 'End' + str(dep),
                                                        fontsize=6, style='italic',
                                                        bbox=dict(boxstyle='round',
                                                                  ec=(0., 0.5, 0.5),
                                                                  fc=(1., 1., 1.),
                                                                  ))
                                        dep += 1
                                    tm += 1
                                tn += 1


                                # pyplot.show()
                                sfile = '_'.join((str(x_year), sname))
                                save_file = os.path.join(save_dir, sfile)
                                fig.savefig(str(save_file), dpi=150)

                                sfile = '_'.join(('Statistics', str(x_year), sname))
                                save_file = os.path.join(save_dir, sfile)
                                fig0.savefig(str(save_file), dpi=150)
Example #11
0
def main(url_list, sDir, plot_type):
    """""
    URL : path to instrument data by methods
    sDir : path to the directory on your machine to save files
    plot_type: folder name for a plot type

    """ ""
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)
    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)

        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)
        # separate the data files by methods
        for ms in ms_list:  # np.unique(methodstream)
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type,
                                    ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)
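            # sci_vars_dict[ms]['vars'][<name>] accumulates 't', 'values', 'pressure', 'fv' and
            # 'units' entries across all files of this method-stream (appended in the loop below)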

            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                            else:
                                pressure = 'int_ctd_pressure'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                            y = ds[pressure].values
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)

                        sh['pressure'] = np.append(sh['pressure'], y)

            # use a single pressure unit/long_name for the axis label; warn if they vary across files
            if len(y_unit) != 1:
                print('pressure unit varies! using the first one found')
            y_unit = y_unit[0]

            if len(y_name) != 1:
                print('pressure long name varies! using the first one found')
            y_name = y_name[0]

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print('\nWorking on variable: {}'.format(sv))
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue
                    else:
                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        x = vinfo['values']
                        y = vinfo['pressure']

                    # Check if the array is all NaNs
                    if sum(np.isnan(x)) == len(x):
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    elif len(x[x != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    else:
                        # reject fill values
                        fv_ind = x != fv
                        y_nofv = y[fv_ind]
                        t_nofv = t[fv_ind]
                        c_nofv = cm.rainbow(np.linspace(0, 1, len(t[fv_ind])))
                        x_nofv = x[fv_ind]
                        print(len(x) - len(x_nofv), ' fill values')

                        # reject NaNs
                        nan_ind = ~np.isnan(x_nofv)
                        t_nofv_nonan = t_nofv[nan_ind]
                        c_nofv_nonan = c_nofv[nan_ind]
                        y_nofv_nonan = y_nofv[nan_ind]
                        x_nofv_nonan = x_nofv[nan_ind]
                        print(len(x_nofv) - len(x_nofv_nonan), ' NaNs')

                        # reject extreme values
                        ev_ind = cf.reject_extreme_values(x_nofv_nonan)
                        t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                        c_nofv_nonan_noev = c_nofv_nonan[ev_ind]
                        y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                        x_nofv_nonan_noev = x_nofv_nonan[ev_ind]
                        print(len(x_nofv_nonan) - len(x_nofv_nonan_noev), ' Extreme Values', '|1e7|')

                        # reject values outside global ranges:
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        # platform not in qc-table (parad_k_par)
                        # global_min = 0
                        # global_max = 2500
                        print('global ranges for : {}-{}  {} - {}'.format(
                            r, sv, global_min, global_max))
                        if isinstance(global_min, (int, float)) and isinstance(
                                global_max, (int, float)):
                            gr_ind = cf.reject_global_ranges(
                                x_nofv_nonan_noev, global_min, global_max)
                            t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                            y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                            x_nofv_nonan_noev_nogr = x_nofv_nonan_noev[gr_ind]
                        else:
                            t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                            y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                            x_nofv_nonan_noev_nogr = x_nofv_nonan_noev

                    if len(x_nofv_nonan_noev) > 0:
                        if m == 'common_stream_placeholder':
                            sname = '-'.join((r, sv))
                        else:
                            sname = '-'.join((r, m, sv))

                    if sv != 'pressure':
                        columns = ['tsec', 'dbar', str(sv)]
                        bin_size = 10
                        min_r = int(round(min(y_nofv_nonan_noev) - bin_size))
                        max_r = int(round(max(y_nofv_nonan_noev) + bin_size))
                        ranges = list(range(min_r, max_r, bin_size))
                        groups, d_groups = gt.group_by_depth_range(
                            t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                            x_nofv_nonan_noev_nogr, columns, ranges)

                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []
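                    # d_groups holds (time, pressure, value) triples per depth bin (stepped via ii + tm,
                    # tm += 2); collect the sample count, mean pressure, and the variable's mean/min/max
                    # and mean +/- 3*std per bin for the profile overlay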
                    tm = 1
                    for ii in range(len(groups)):
                        nan_ind = d_groups[ii + tm].notnull()
                        xtime = d_groups[ii + tm][nan_ind]
                        colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                        ypres = d_groups[ii + tm + 1][nan_ind]
                        nval = d_groups[ii + tm + 2][nan_ind]
                        tm += 2

                        l_arr.append(len(
                            nval))  # count of data to filter out small groups
                        y_avg.append(ypres.mean())
                        n_avg.append(nval.mean())
                        n_min.append(nval.min())
                        n_max.append(nval.max())
                        n_std = 3
                        n0_std.append(nval.mean() + n_std * nval.std())
                        n1_std.append(nval.mean() - n_std * nval.std())

                    # Plot all data
                    ylabel = y_name + " (" + y_unit + ")"
                    xlabel = sv + " (" + sv_units + ")"
                    clabel = 'Time'

                    fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr,
                                               y_nofv_nonan_noev_nogr,
                                               t_nofv_nonan_noev_nogr,
                                               ylabel,
                                               xlabel,
                                               clabel,
                                               end_times,
                                               deployments,
                                               stdev=None)

                    title_text = ' '.join((r, ms.split('-')[-1])) + '\n' \
                                 + t0 + ' - ' + t1 + '\n' + str(bin_size) +\
                                 ' m average and ' + str(n_std) + ' std shown'

                    ax.set_title(title_text, fontsize=9)
                    ax.plot(n_avg, y_avg, '-k')

                    ax.fill_betweenx(y_avg,
                                     n0_std,
                                     n1_std,
                                     color='m',
                                     alpha=0.2)
                    pf.save_fig(save_dir, sname)

                    # Plot data with outliers removed

                    fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr,
                                               y_nofv_nonan_noev_nogr,
                                               t_nofv_nonan_noev_nogr,
                                               ylabel,
                                               xlabel,
                                               clabel,
                                               end_times,
                                               deployments,
                                               stdev=5)
                    ax.set_title(' '.join((r, ms.split('-')[-1])) + '\n' \
                                 + t0 + ' - ' + t1, fontsize=9)
                    sfile = '_'.join((sname, 'rmoutliers'))
                    pf.save_fig(save_dir, sfile)
Example #12
0
def main(sDir, url_list):
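    """
    For each reference designator in url_list, group each science variable by year, month and
    day, and plot a 7x5 calendar-style grid of daily panels (data with a 1-hour rolling
    mean/std) saved to sDir.
    """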
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # filter datasets
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        methodstream = []
        for f in fdatasets:
            methodstream.append('-'.join((f.split('/')[-2].split('-')[-2],
                                          f.split('/')[-2].split('-')[-1])))

        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]
            save_dir = os.path.join(sDir, array, subsite, r,
                                    'timeseries_daily_plots',
                                    ms.split('-')[0])
            cf.create_dir(save_dir)

            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)
            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        tD = ds['time'].values
                        varD = ds[var].values
                        sh['t'] = np.append(sh['t'], tD)
                        sh['values'] = np.append(sh['values'], varD)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        t0 = pd.to_datetime(min(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        x = vinfo['t']
                        y = vinfo['values']

                        # reject NaNs
                        nan_ind = ~np.isnan(y)
                        x_nonan = x[nan_ind]
                        y_nonan = y[nan_ind]

                        # reject fill values
                        fv_ind = y_nonan != vinfo['fv'][0]
                        x_nonan_nofv = x_nonan[fv_ind]
                        y_nonan_nofv = y_nonan[fv_ind]

                        # reject extreme values
                        Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                        y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                        x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                        # reject values outside global ranges (skip when no global ranges are defined):
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        if global_min is not None and global_max is not None:
                            gr_ind = cf.reject_global_ranges(
                                y_nonan_nofv_nE, global_min, global_max)
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                        else:
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                        if len(y_nonan_nofv) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                            # 1st group by year
                            ygroups, gy_data = gt.group_by_timerange(
                                x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr,
                                'A')

                            tn = 1
                            for n in range(len(ygroups)):
                                x_time = gy_data[n + tn].dropna(axis=0)
                                y_data = gy_data[n + (tn + 1)].dropna(axis=0)

                                # 2nd group by month
                                mgroups, gm_data = gt.group_by_timerange(
                                    x_time.values, y_data.values, 'M')

                                if len(x_time) == 0:
                                    tn += 1  # keep the gy_data column index aligned when a year group is empty
                                    continue

                                td = 1
                                for jj in range(len(mgroups)):
                                    x_time = gm_data[jj + td].dropna(axis=0)
                                    y_data = gm_data[jj +
                                                     (td + 1)].dropna(axis=0)

                                    if len(x_time) == 0:
                                        td += 1  # keep the gm_data column index aligned when a month has no data
                                        continue

                                    # 3rd group by day
                                    dgroups, gd_data = gt.group_by_timerange(
                                        x_time.values, y_data.values, 'D')

                                    x_year = x_time[0].year
                                    x_month = x_time[0].month
                                    month_name = calendar.month_abbr[x_month]
                                    print(x_year, x_month)

                                    sfile = '_'.join(
                                        (str(x_year), str(x_month), sname))

                                    # prepare plot layout

                                    fig, ax = pyplot.subplots(nrows=7,
                                                              ncols=5,
                                                              sharey=True)
                                    title_in = month_name + '-' + str(x_year) + \
                                                  ' calendar days \n Parameter: ' + \
                                                  sv + " (" + sv_units + ")"

                                    ax[0][2].text(0.5,
                                                  1.5,
                                                  title_in,
                                                  horizontalalignment='center',
                                                  fontsize=8,
                                                  transform=ax[0][2].transAxes)
                                    num_i = 0
                                    day_i = {}
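                                    # map day-of-month to a [row, col] cell of the 7x5 subplot grid;
                                    # the four unused cells in the bottom row are deleted below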
                                    for kk in list(range(0, 7)):
                                        for ff in list(range(0, 5)):
                                            num_i += 1
                                            day_i[num_i] = [kk, ff]
                                            ax[kk][ff].tick_params(
                                                axis='both',
                                                which='both',
                                                color='r',
                                                labelsize=7,
                                                labelcolor='m',
                                                rotation=0)

                                            ax[kk][ff].text(
                                                0.1,
                                                0.75,
                                                str(num_i),
                                                horizontalalignment='center',
                                                fontsize=7,
                                                transform=ax[kk][ff].transAxes,
                                                bbox=dict(
                                                    boxstyle="round",
                                                    ec=(0., 0.5, 0.5),
                                                    fc=(1., 1., 1.),
                                                ))

                                            if kk != 6:
                                                ax[kk][ff].tick_params(
                                                    labelbottom=False)
                                            if ff != 0:
                                                ax[kk][ff].tick_params(
                                                    labelright=False)

                                            if kk == 6 and ff == 0:
                                                ax[kk][ff].set_xlabel(
                                                    'Hours',
                                                    rotation=0,
                                                    fontsize=8,
                                                    color='b')

                                            if kk == 6 and ff in list(
                                                    range(1, 5)):
                                                fig.delaxes(ax[kk][ff])

                                    tm = 1
                                    for mt in range(len(dgroups)):
                                        x_time = gd_data[mt +
                                                         tm].dropna(axis=0)
                                        y_DO = gd_data[mt +
                                                       (tm + 1)].dropna(axis=0)

                                        series_m = pd.DataFrame(
                                            columns=['DO_n'], index=x_time)
                                        series_m['DO_n'] = list(y_DO[:])

                                        if len(x_time) == 0:
                                            tm += 1  # keep the gd_data column index aligned when a day has no data
                                            continue

                                        x_day = x_time[0].day

                                        print(x_time[0].year, x_time[0].month,
                                              x_day)

                                        i0 = day_i[x_day][0]
                                        i1 = day_i[x_day][1]

                                        # Plot data
                                        series_m.plot(ax=ax[i0][i1],
                                                      linestyle='None',
                                                      marker='.',
                                                      markersize=1)
                                        ax[i0][i1].legend().set_visible(False)

                                        ma = series_m.rolling('3600s').mean()
                                        mstd = series_m.rolling('3600s').std()
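                                        # 1-hour ('3600s') rolling mean/std; shaded band is mean +/- 3*std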

                                        ax[i0][i1].plot(ma.index,
                                                        ma.DO_n,
                                                        'b',
                                                        linewidth=0.25)
                                        ax[i0][i1].fill_between(
                                            mstd.index,
                                            ma.DO_n - 3 * mstd.DO_n,
                                            ma.DO_n + 3 * mstd.DO_n,
                                            color='b',
                                            alpha=0.2)

                                        # prepare the time axis parameters
                                        datemin = datetime.datetime(
                                            x_year, x_month, x_day, 0)
                                        datemax = datetime.datetime(
                                            x_year, x_month, x_day, 23)

                                        ax[i0][i1].set_xlim(datemin, datemax)
                                        xLocator = mdates.HourLocator(
                                            interval=4)  # every hour
                                        myFmt = mdates.DateFormatter('%H')
                                        ax[i0][i1].xaxis.set_minor_locator(
                                            xLocator)
                                        ax[i0][i1].xaxis.set_minor_formatter(
                                            myFmt)
                                        ax[i0][i1].xaxis.set_major_locator(
                                            pyplot.NullLocator())
                                        ax[i0][i1].xaxis.set_major_formatter(
                                            pyplot.NullFormatter())
                                        yLocator = MaxNLocator(prune='both',
                                                               nbins=3)
                                        ax[i0][i1].yaxis.set_major_locator(
                                            yLocator)

                                        if x_day != 31:
                                            ax[i0][i1].tick_params(
                                                labelbottom=False)
                                            ax[i0][i1].set_xlabel(' ')
                                        else:
                                            ax[i0][i1].tick_params(
                                                which='both',
                                                color='r',
                                                labelsize=7,
                                                labelcolor='m',
                                                length=0.1,
                                                pad=0.1)
                                            ax[i0][i1].set_xlabel('Hours',
                                                                  rotation=0,
                                                                  fontsize=8,
                                                                  color='b')

                                        ymin, ymax = ax[i0][i1].get_ylim()
                                        dep = 1
                                        for etimes in end_times:
                                            ax[i0][i1].axvline(x=etimes,
                                                               color='b',
                                                               linestyle='--',
                                                               linewidth=.6)
                                            ax[i0][i1].text(
                                                etimes,
                                                ymin + 50,
                                                str(dep),
                                                fontsize=6,
                                                style='italic',
                                                bbox=dict(
                                                    boxstyle="round",
                                                    ec=(0., 0.5, 0.5),
                                                    fc=(1., 1., 1.),
                                                ))

                                            dep += 1
                                        tm += 1
                                    td += 1
                                    pf.save_fig(save_dir, sfile)
                                tn += 1
Example #13
0
def main(url_list, sDir, plot_type):
    """""
    URL : path to instrument data by methods
    sDir : path to the directory on your machine to save files
    plot_type: folder name for a plot type
    
    """ ""
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)
    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(
                dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)

        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)
        # separate data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type,
                                    ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                            else:
                                pressure = 'int_ctd_pressure'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                            y = ds[pressure].values

                        sh['pressure'] = np.append(sh['pressure'], y)

                        try:
                            ds[pressure].units
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                        except AttributeError:
                            print('pressure attributes missing units')
                            if 'pressure unit missing' not in y_unit:
                                y_unit.append('pressure unit missing')

                        try:
                            ds[pressure].long_name
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)
                        except AttributeError:
                            print('pressure attributes missing long_name')
                            if 'pressure long name missing' not in y_name:
                                y_name.append('pressure long name missing')

            # create a csv file with diagnostic results:

                if len(y_unit) != 1:
                    print('pressure unit varies')
                    if 'dbar' in y_unit:
                        y_unit = 'dbar'
                    print(y_unit)
                else:
                    y_unit = y_unit[0]

                if len(y_name) != 1:
                    print('pressure long name varies')
                    if 'Seawater Pressure' in y_name:
                        y_name = 'Seawater Pressure'
                    print(y_name)
                else:
                    y_name = y_name[0]

                # create a folder to save variables statistics
                mDir = '/Users/leila/Documents/NSFEduSupport/github/data-review-tools/data_review/final_stats'
                save_dir_stat = os.path.join(mDir, array, subsite)
                cf.create_dir(save_dir_stat)
                stat_df = pd.DataFrame()
                for m, n in sci_vars_dict.items():
                    for sv, vinfo in n['vars'].items():
                        print(sv)
                        if len(vinfo['t']) < 1:
                            print('no variable data to plot')
                            continue
                        else:
                            sv_units = vinfo['units'][0]
                            fv = vinfo['fv'][0]
                            t0 = pd.to_datetime(min(
                                vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t1 = pd.to_datetime(max(
                                vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t = vinfo['t']
                            z = vinfo['values']
                            y = vinfo['pressure']

                            title = ' '.join((r, ms))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')
                            continue

                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                            continue

                        else:
                            # reject fill values
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            print(len(z) - len(z_nofv), ' fill values')

                            # reject NaNs
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                            # reject extreme values
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(
                                len(z_nofv_nonan) - len(z_nofv_nonan_noev),
                                ' Extreme Values', '|1e7|')

                            # reject values outside global ranges:
                            global_min, global_max = cf.get_global_ranges(
                                r, sv)
                            # platform not in qc-table (parad_k_par)
                            # global_min = 0
                            # global_max = 2500
                            print('global ranges for : {}-{}  {} - {}'.format(
                                r, sv, global_min, global_max))
                            if isinstance(global_min,
                                          (int, float)) and isinstance(
                                              global_max, (int, float)):
                                gr_ind = cf.reject_global_ranges(
                                    z_nofv_nonan_noev, global_min, global_max)
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[
                                    gr_ind]
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[
                                    gr_ind]
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev[
                                    gr_ind]
                            else:
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev
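                            # each rejection step above builds a boolean index and applies it to the
                            # time, pressure, and data arrays together, so the three stay aligned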

                        if len(z_nofv_nonan_noev) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                        # append the units to the plot file name (the depth-range grouping below is commented out)
                        sname = '_'.join((sname, sv_units))

                        # if sv != 'pressure':
                        #     columns = ['tsec', 'dbar', str(sv)]
                        #
                        #     # select depth bin size for the data group function
                        #     bin_size = 10
                        #     min_r = int(round(min(y_nofv_nonan_noev) - bin_size))
                        #     max_r = int(round(max(y_nofv_nonan_noev) + bin_size))
                        #     ranges = list(range(min_r, max_r, bin_size))
                        #     groups, d_groups = gt.group_by_depth_range(t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                        #                                                z_nofv_nonan_noev_nogr, columns, ranges)
                        #

                        # if (ms.split('-')[0]) == (ps_df[0].values[0].split('-')[0]):
                        #     if 'pressure' not in sv:
                        #         print('final_stats_{}-{}-{}-{}'.format(r,
                        #                                                ms.split('-')[0],
                        #                                                ps_df[0].values[0].split('-')[0],
                        #                                                sv))
                        #         stat_data = groups.describe()[sv]
                        #         stat_data.insert(loc=0, column='parameter', value=sv, allow_duplicates=False)
                        #         stat_df = stat_df.append(stat_data)

                        # if sv == 'optical_backscatter':
                        #     less_ind = z_nofv_nonan_noev < 0.0004
                        #     print(sv, ' < 0.0004', len(less_ind))
                        #     more_ind = z_nofv_nonan_noev > 0.01
                        #     print(sv, ' > 0.01', len(more_ind))

                        # Plot all data
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = y_name + " (" + y_unit + ")"

                        fig, ax = pf.plot_xsection(subsite,
                                                   t_nofv_nonan_noev,
                                                   y_nofv_nonan_noev,
                                                   z_nofv_nonan_noev,
                                                   clabel,
                                                   ylabel,
                                                   stdev=None)

                        ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                     fontsize=9)

                        pf.save_fig(save_dir, sname)

                        # Plot data with outliers removed
                        fig, ax = pf.plot_xsection(subsite,
                                                   t_nofv_nonan_noev_nogr,
                                                   y_nofv_nonan_noev_nogr,
                                                   z_nofv_nonan_noev_nogr,
                                                   clabel,
                                                   ylabel,
                                                   stdev=5)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                     fontsize=9)
                        sfile = '_'.join((sname, 'rmoutliers'))
                        pf.save_fig(save_dir, sfile)

                        # plot data with excluded time range removed
                        dr = pd.read_csv(
                            'https://datareview.marine.rutgers.edu/notes/export'
                        )
                        drn = dr.loc[dr.type == 'exclusion']
                        if len(drn) != 0:
                            subsite_node = '-'.join((subsite, r.split('-')[1]))
                            drne = drn.loc[drn.reference_designator.isin(
                                [subsite, subsite_node, r])]
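                            # exclusion annotations can be flagged at the subsite, node, or full
                            # reference-designator level, so all three are matched here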

                            t_ex = t_nofv_nonan_noev_nogr
                            y_ex = y_nofv_nonan_noev_nogr
                            z_ex = z_nofv_nonan_noev_nogr
                            for i, row in drne.iterrows():
                                sdate = cf.format_dates(row.start_date)
                                edate = cf.format_dates(row.end_date)
                                ts = np.datetime64(sdate)
                                te = np.datetime64(edate)
                                ind = np.where((t_ex < ts) | (t_ex > te), True,
                                               False)
                                if len(ind) != 0:
                                    t_ex = t_ex[ind]
                                    z_ex = z_ex[ind]
                                    y_ex = y_ex[ind]
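                            # after the loop, t_ex/y_ex/z_ex hold only samples outside every
                            # annotated exclusion window, with time, pressure, and data trimmed in step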

                            fig, ax = pf.plot_xsection(subsite,
                                                       t_ex,
                                                       y_ex,
                                                       z_ex,
                                                       clabel,
                                                       ylabel,
                                                       stdev=None)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                         fontsize=9)

                            sfile = '_'.join((sname, 'rmsuspectdata'))
                            pf.save_fig(save_dir, sfile)
Example #14
0
def main(url_list, sDir, plot_type, deployment_num, start_time, end_time):
    """""
    URL : path to instrument data by methods
    sDir : path to the directory on your machine to save files
    plot_type: folder name for a plot type

    """ ""
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)
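    # each url is assumed to end in a folder named like
    # '<prefix>-<subsite>-<node>-<xx>-<sensor>-<method>-<stream>' (illustrative format), from which
    # the reference designator (rd) and the method-stream string (ms) are rebuilt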
    ''' 
    separate different instruments
    '''
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))
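        # deployment labels are assumed to end in a four-digit number (e.g. '...0001'),
        # which is parsed here to look up each deployment's stop_date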

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)

        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)
        '''
        separate the data files by methods
        '''
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars
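            # stream_sci_vars_dict is keyed by 'method-stream'; each entry holds a 'vars' dict
            # mapping science parameter names to their database units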

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print(fd)

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'
                            .format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                    fd)

                if deployment_num is not None:
                    if int(deployment.split('0')[-1]) != deployment_num:
                        print('skipping {}'.format(deployment))
                        continue
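                # note: deployment folder names are assumed to look like 'deployment0001';
                # parsing with int(deployment[-4:]) would also handle deployments >= 10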

                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        plot_type,
                                        ms.split('-')[0], deployment)
                cf.create_dir(save_dir)

                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                            else:
                                pressure = 'int_ctd_pressure'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                            y = ds[pressure].values
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)

                        sh['pressure'] = np.append(sh['pressure'], y)

                if len(y_unit) != 1:
                    print('pressure unit varies')
                else:
                    y_unit = y_unit[0]

                if len(y_name) != 1:
                    print('pressure long name varies')
                else:
                    y_name = y_name[0]

                for m, n in sci_vars_dict.items():
                    for sv, vinfo in n['vars'].items():
                        print(sv)
                        if len(vinfo['t']) < 1:
                            print('no variable data to plot')
                        else:
                            sv_units = vinfo['units'][0]
                            fv = vinfo['fv'][0]
                            t0 = pd.to_datetime(min(
                                vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t1 = pd.to_datetime(max(
                                vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t = vinfo['t']
                            z = vinfo['values']
                            y = vinfo['pressure']

                            title = ' '.join((r, ms.split('-')[1]))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')

                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')

                        else:
                            # reject fill values
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            print(len(z) - len(z_nofv), ' fill values')

                            # reject NaNs
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                            # reject extreme values
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            colors = cm.rainbow(
                                np.linspace(0, 1, len(t_nofv_nonan_noev)))
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(
                                len(z_nofv_nonan) - len(z_nofv_nonan_noev),
                                ' Extreme Values', '|1e7|')

                        if len(y_nofv_nonan_noev) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))
                        # Plot all data
                        ylabel = y_name + " (" + y_unit + ")"
                        xlabel = sv + " (" + sv_units + ")"
                        clabel = sv + " (" + sv_units + ")"

                        fig, ax = pf.plot_profiles(z_nofv_nonan_noev,
                                                   y_nofv_nonan_noev,
                                                   colors,
                                                   xlabel,
                                                   ylabel,
                                                   stdev=None)
                        ax.set_title((
                            title + '\n' + str(deployment_num) + ': ' + t0 +
                            ' - ' + t1 + '\n' +
                            'used bin = 2 dbar to calculate an average profile (black line) and 3-STD envelope (shaded area)'
                        ),
                                     fontsize=9)

                        # group by depth range
                        columns = ['time', 'pressure', str(sv)]
                        # ranges = [0, 50, 100, 200, 400, 600]
                        ranges = list(
                            range(int(round(min(y_nofv_nonan_noev))),
                                  int(round(max(y_nofv_nonan_noev))), 1))
                        groups, d_groups = gt.group_by_depth_range(
                            t_nofv_nonan_noev, y_nofv_nonan_noev,
                            z_nofv_nonan_noev, columns, ranges)

                        # describe_file = '_'.join((sname, 'statistics.csv'))
                        # # groups.describe().to_csv(save_dir + '/' + describe_file)
                        ind = groups.describe()[sv]['mean'].notnull()
                        groups.describe()[sv][ind].to_csv(
                            '{}/{}_statistics.csv'.format(save_dir, sname),
                            index=True)

                        tm = 1
                        fig, ax = pyplot.subplots(nrows=2, ncols=1)
                        pyplot.margins(y=.08, x=.02)
                        pyplot.grid()
                        y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []
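                        # d_groups is assumed to hold three columns (time, pressure, value) per
                        # depth bin, starting at column 1; tm steps the index so each iteration
                        # of the loop below pulls one bin's columns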

                        for ii in range(len(groups)):

                            nan_ind = d_groups[ii + tm].notnull()
                            xtime = d_groups[ii + tm][nan_ind]
                            colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                            ypres = d_groups[ii + tm + 1][nan_ind]
                            nval = d_groups[ii + tm + 2][nan_ind]
                            tm += 2

                            # fig, ax = pf.plot_xsection(subsite, xtime, ypres, nval, clabel, ylabel, stdev=None)
                            # ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)

                            # pf.plot_profiles(nval, ypres, colors, ylabel, clabel, stdev=None)
                            # ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)

                            ind2 = cf.reject_outliers(nval, 5)
                            xD = nval[ind2]
                            yD = ypres[ind2]
                            nZ = colors[ind2]
                            outliers = str(len(nval) - len(xD))
                            leg_text = (
                                'removed {} outliers (SD=5)'.format(outliers), )

                            ax.scatter(xD, yD, c=nZ, s=2, edgecolor='None')
                            ax.invert_yaxis()
                            ax.set_xlabel(clabel, fontsize=9)
                            ax.set_ylabel(ylabel, fontsize=9)
                            ax.legend(leg_text, loc='best', fontsize=6)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                         fontsize=9)

                            l_arr.append(
                                len(nval)
                            )  # count of data points per bin (used to filter out small groups)
                            y_avg.append(ypres.mean())
                            n_avg.append(nval.mean())
                            n_min.append(nval.min())
                            n_max.append(nval.max())
                            n0_std.append(nval.mean() + 3 * nval.std())
                            n1_std.append(nval.mean() - 3 * nval.std())

                        ax.plot(n_avg, y_avg, '-k')
                        # ax.plot(n_min, y_avg, '-b')
                        # ax.plot(n_max, y_avg, '-b')
                        ax.fill_betweenx(y_avg,
                                         n0_std,
                                         n1_std,
                                         color='m',
                                         alpha=0.2)
                        sfile = '_'.join((sname, 'statistics'))
                        pf.save_fig(save_dir, sfile)
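
# A minimal usage sketch for main(), assuming illustrative values: the save directory and the
# catalog url below are hypothetical placeholders, not real data paths.
if __name__ == '__main__':
    sDir = '/path/to/save/files'  # hypothetical output directory
    url_list = [  # hypothetical catalog url following the folder-name pattern parsed in main()
        'https://opendap.example.org/thredds/catalog/ooi/user-CE09OSPM-WFP01-03-CTDPFK000-recovered_wfp-ctdpf_ckl_wfp_instrument_recovered/catalog.html'
    ]
    plot_type = 'profile_plots'
    deployment_num = 1  # plot only deployment 1; None plots all deployments
    start_time = None   # or a datetime to subset the data
    end_time = None
    main(url_list, sDir, plot_type, deployment_num, start_time, end_time)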