Example no. 1
def to_points_loop_wg10(loc_id,
                        points,
                        fname,
                        start_year,
                        end_year,
                        djf=False):

    from dask.diagnostics import ProgressBar
    import datetime as dt
    import gc
    import numpy as np
    import pandas as pd
    import xarray as xr
    ProgressBar().register()

    dates = []
    if djf:
        for y in np.arange(start_year, end_year + 1):
            for m in [1, 2, 12]:
                dates.append(dt.datetime(y, m, 1, 0, 0, 0))
    else:
        for y in np.arange(start_year, end_year + 1):
            for m in np.arange(1, 13):
                dates.append(dt.datetime(y, m, 1, 0, 0, 0))

    df = pd.DataFrame()

    lsm = xr.open_dataset(
        "/g/data/ma05/BARRA_AD/v1/static/lnd_mask-fc-slv-PT0H-BARRA_AD-v1.nc")

    #Read netcdf data
    for t in np.arange(len(dates)):
        print(dates[t])
        year = dt.datetime.strftime(dates[t], "%Y")
        month = dt.datetime.strftime(dates[t], "%m")
        f = xr.open_mfdataset("/g/data/ma05/BARRA_AD/v1/forecast/spec/max_max_wndgust10m/"+\
         year+"/"+month+"/*.sub.nc", concat_dim="time")

        #Setup lsm
        lat = f.coords.get("latitude").values
        lon = f.coords.get("longitude").values
        x, y = np.meshgrid(lon, lat)
        x[lsm.lnd_mask == 0] = np.nan
        y[lsm.lnd_mask == 0] = np.nan

        dist_lon = []
        dist_lat = []
        for i in np.arange(len(loc_id)):

            dist = np.sqrt(np.square(x-points[i][0]) + \
             np.square(y-points[i][1]))
            temp_lat, temp_lon = np.unravel_index(np.nanargmin(dist),
                                                  dist.shape)
            dist_lon.append(temp_lon)
            dist_lat.append(temp_lat)

        temp_df = f["max_max_wndgust10m"].isel(latitude = xr.DataArray(dist_lat, dims="points"), \
                                      longitude = xr.DataArray(dist_lon, dims="points")).persist().to_dataframe()
        temp_df = temp_df.reset_index()

        for p in np.arange(len(loc_id)):
            temp_df.loc[temp_df.points == p, "loc_id"] = loc_id[p]

        temp_df = temp_df.drop(["points",\
         "forecast_period", "forecast_reference_time"],axis=1)
        df = pd.concat([df, temp_df])
        f.close()
        gc.collect()

    df.sort_values([
        "loc_id", "time"
    ]).to_pickle("/g/data/eg3/ab4502/ExtremeWind/points/" + fname + ".pkl")
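# A minimal invocation sketch for to_points_loop_wg10; the station names and
# (lon, lat) pairs below are hypothetical, and the /g/data BARRA_AD archive
# must be readable for the call to succeed.
loc_id = ["Adelaide AP", "Woomera"]
points = [(138.52, -34.95), (136.82, -31.16)]
to_points_loop_wg10(loc_id, points, "barra_ad_wg10_2005_2015",
                    start_year=2005, end_year=2015, djf=False)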
Example no. 2
import argparse
import datetime
import glob
import math
import numpy as np
import os
import pandas as pd
import rasterio
import seaborn as sns
import xarray as xr

from matplotlib import pyplot as plt

from dask.diagnostics import ProgressBar
ProgressBar().register()

from paths_usa import *

parser = argparse.ArgumentParser(description='Insert optionally GWA')
parser.add_argument('-GWA')
args = parser.parse_args()
if args.GWA is None:
    GWA = "3"
else:
    GWA = args.GWA

if GWA == "2":
    results_pathg = results_path + '/results_GWA2'
else:
    results_pathg = results_path
Example no. 3
def estimate_shifts_old(mn_list,
                        temp_list,
                        z_thres=None,
                        rm_background=False,
                        method='first',
                        concat_dim='session'):
    temps = []
    for imn, mn_path in enumerate(mn_list):
        print("loading template: {:2d}/{:2d}".format(imn, len(mn_list)))
        try:
            with xr.open_dataset(mn_path,
                                 chunks=dict(width='auto',
                                             height='auto'))['org'] as cur_va:
                if temp_list[imn] == 'first':
                    cur_temp = cur_va.isel(frame=0).load().copy()
                elif temp_list[imn] == 'last':
                    cur_temp = cur_va.isel(frame=-1).load().copy()
                elif temp_list[imn] == 'mean':
                    cur_temp = (cur_va.mean('frame'))
                    with ProgressBar():
                        cur_temp = cur_temp.compute()
                else:
                    print("unrecognized template")
                    continue
                if rm_background:
                    cur_temp = remove_background(cur_temp, 'uniform', wnd=51)
                temps.append(cur_temp)
        except KeyError:
            print("no video found for path {}".format(mn_path))
    if concat_dim:
        temps = xr.concat(temps, dim=concat_dim).rename('temps')
        window = ~temps.isnull().sum(concat_dim).astype(bool)
        temps = temps.where(window, drop=True)
    shifts = []
    corrs = []
    for itemp, temp_dst in temps.rolling(**{concat_dim: 1}):
        print("processing: {}".format(itemp.values))
        if method == 'first':
            temp_src = temps.isel(**{concat_dim: 0})
        elif method == 'last':
            temp_src = temps.isel(**{concat_dim: -1})
        # common = (temp_src.isnull() + temp_dst.isnull())
        # temp_src = temp_src.reindex_like(common)
        # temp_dst = temp_dst.reindex_like(common)
        temp_src, temp_dst = temp_src.squeeze(), temp_dst.squeeze()
        src_fft = np.fft.fft2(temp_src)
        dst_fft = np.fft.fft2(temp_dst)
        cur_res = shift_fft(src_fft, dst_fft)
        cur_sh = cur_res[0:2]
        cur_cor = cur_res[2]
        cur_anm = temp_dst.coords['animal']
        cur_ss = temp_dst.coords['session']
        cur_ssid = temp_dst.coords['session_id']
        cur_sh = xr.DataArray(cur_sh,
                              coords=dict(shift_dim=list(temp_dst.dims)),
                              dims=['shift_dim'])
        cur_cor = xr.DataArray(cur_cor)
        cur_sh = cur_sh.assign_coords(animal=cur_anm,
                                      session=cur_ss,
                                      session_id=cur_ssid)
        cur_cor = cur_cor.assign_coords(animal=cur_anm,
                                        session=cur_ss,
                                        session_id=cur_ssid)
        shifts.append(cur_sh)
        corrs.append(cur_cor)
    if concat_dim:
        shifts = xr.concat(shifts, dim=concat_dim).rename('shifts')
        corrs = xr.concat(corrs, dim=concat_dim).rename('corrs')
        temps = xr.concat(temps, dim=concat_dim).rename('temps')
    return shifts, corrs, temps
Example no. 4
    def _mod_mean(self):
        if isinstance(self.fname, str):
            if self._var is None:
                try:
                    varbl = self.fname.split('/')[-1].split('_')[0]
                except Exception:
                    varbl = 'Unknown'
            else:
                varbl = self._var
            data = xr.open_mfdataset(self.fname)[varbl]
        else:
            data = self.fname
            varbl = self._var
        if self.modMean is not None:
            ds = data.mean(dim='ens')
            name_string = 'modMean'
        elif self.zonMean is not None:
            ds = data.mean(dim=data.dims[-1])
            name_string = 'zonMean'
        elif self.modStd is not None:
            ds = data.std(dim='ens')
            name_string = 'modStd'
        elif self.monClim is not None:
            ds = data.groupby('time.month').mean('time')
            name_string = 'monClim'
        elif self.monAnom is not None:
            climatology = data.groupby('time.month').mean('time')
            ds = data.groupby('time.month') - climatology
            name_string = 'monAnom'
        elif self.modAnom is not None:
            name_string = 'modAnom'
            if self.init is not None or self.end is not None:
                sub = data.sel(time=slice(str(self.init), str(self.end))).mean(
                    dim='time')
                ds = data - sub
            else:
                ds = data - data.mean(dim='time')
        elif self.trend is not None:
            trend = trend_calc(data,
                               int(self.init),
                               int(self.end),
                               ci=float(self.ci))
            t = xr.DataArray(trend[0],
                             dims=['lat', 'lon'],
                             coords={
                                 'lat': data.lat,
                                 'lon': data.lon
                             })
            t.name = 'trend'
            s = xr.DataArray(trend[1],
                             dims=['lat', 'lon'],
                             coords={
                                 'lat': data.lat,
                                 'lon': data.lon
                             })
            s.name = 'sig'
            ds = xr.merge([t, s])
            name_string = 'trend'
        elif self.aggr is not None:
            ds = get_aggr(data,
                          int(self.init),
                          int(self.end),
                          ci=float(self.ci))
            ds.name = 'model_agreement'
            name_string = 'model_agreement'
        if self.nc == 'yes':
            with ProgressBar():
                if self.out is not None:
                    ds.load().to_netcdf(self.out)
                else:
                    try:
                        ds.load().to_netcdf(
                            self.fname.split('.nc')[0] + '_' + name_string +
                            '.nc')
                    except Exception:
                        ds.load().to_netcdf(name_string + '.nc')
        else:
            return ds
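# A standalone sketch of the monthly climatology / anomaly pattern used by the
# monClim and monAnom branches above, on a small synthetic DataArray:
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2000-01-01", periods=36, freq="MS")
data = xr.DataArray(np.random.rand(36, 4, 4),
                    dims=("time", "lat", "lon"),
                    coords={"time": time})
climatology = data.groupby("time.month").mean("time")   # monClim
anomaly = data.groupby("time.month") - climatology       # monAnom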
Example no. 5
def predict(args):
    # Convert source data into dask arrays
    sky_model = parse_sky_model(args.sky_model, args.model_chunks)

    # Get the support tables
    tables = support_tables(args)

    ant_ds = tables["ANTENNA"]
    field_ds = tables["FIELD"]
    ddid_ds = tables["DATA_DESCRIPTION"]
    spw_ds = tables["SPECTRAL_WINDOW"]
    pol_ds = tables["POLARIZATION"]

    # List of write operations
    writes = []

    # Construct a graph for each DATA_DESC_ID
    for xds in xds_from_ms(
            args.ms,
            columns=["UVW", "ANTENNA1", "ANTENNA2", "TIME"],
            group_cols=["FIELD_ID", "DATA_DESC_ID"],
            chunks={"row": args.row_chunks},
    ):

        # Perform subtable joins
        ant = ant_ds[0]
        field = field_ds[xds.attrs["FIELD_ID"]]
        ddid = ddid_ds[xds.attrs["DATA_DESC_ID"]]
        spw = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol = pol_ds[ddid.POLARIZATION_ID.data[0]]

        # Select single dataset row out
        corrs = pol.NUM_CORR.data[0]

        # Generate visibility expressions for each source type
        source_vis = [
            vis_factory(args, stype, sky_model, xds, ant, field, spw, pol)
            for stype in sky_model.keys()
        ]

        # Sum visibilities together
        vis = sum(source_vis)

        # Reshape (2, 2) correlation to shape (4,)
        if corrs == 4:
            vis = vis.reshape(vis.shape[:2] + (4, ))

        # Assign visibilities to MODEL_DATA array on the dataset
        xds = (xds.assign(MODEL_DATA=(("row", "chan", "corr"),
                                      vis)) if args.data_column == "MODEL_DATA"
               else xds.assign(CORRECTED_DATA=(("row", "chan", "corr"), vis)))

        # Create a write to the table
        write = xds_to_table(xds, args.ms, [args.data_column])

        # Add to the list of writes
        writes.append(write)

    # Submit all graph computations in parallel
    with ProgressBar():
        da.compute(writes)
Example no. 6
def is_valid_set_of_spect_files(spect_paths,
                                spect_format,
                                freqbins_key='f',
                                timebins_key='t',
                                spect_key='s',
                                n_decimals_trunc=5,
                                logger=None):
    """validate a set of spectrogram files that will be used as a dataset.
    Validates that:
      - all files contain a spectrogram array that can be accessed with the specified key
      - the length of the frequency bin array in each file equals the number of rows in the spectrogram array
      - the frequency bins are the same across all files
      - the length of the time bin array in each file equals the number of columns in the spectrogram array
      - the duration of a spectrogram time bin is the same across all files

    Parameters
    ----------
    spect_paths: list
        of strings or pathlib.Path objects; paths to spectrogram files.
    spect_format : str
        format of files containing spectrograms. One of {'mat', 'npz'}
    freqbins_key : str
        key for accessing vector of frequency bins in files. Default is 'f'.
    timebins_key : str
        key for accessing vector of time bins in files. Default is 't'.
    spect_key : str
        key for accessing spectrogram in files. Default is 's'.
    n_decimals_trunc : int
        number of decimal places to keep when truncating the timebin duration calculated from
        the vector of time bins.
        Default is 5.

    Other Parameters
    ----------------
    logger : logging.Logger
        instance created by vak.logging.get_logger. Default is None.

    Returns
    -------
    returns True if all validation checks pass. If not, an error is raised.
    """
    spect_paths = [Path(spect_path) for spect_path in spect_paths]

    def _validate(spect_path):
        """validates each spectrogram file, then returns frequency bin array
        and duration of time bins, so that those can be validated across all files"""
        spect_dict = load(spect_path, spect_format)

        if spect_key not in spect_dict:
            raise KeyError(
                f"Did not find a spectrogram in file '{spect_path.name}' "
                f"using spect_key '{spect_key}'.")

        freq_bins = spect_dict[freqbins_key]
        time_bins = spect_dict[timebins_key]
        timebin_dur = timebin_dur_from_vec(time_bins, n_decimals_trunc)

        # number of freq. bins should equal number of rows
        if spect_dict[freqbins_key].shape[-1] != spect_dict[spect_key].shape[0]:
            raise ValueError(f'length of frequency bins in {spect_path.name} '
                             'does not match number of rows in spectrogram')
        # number of time bins should equal number of columns
        if spect_dict[timebins_key].shape[-1] != spect_dict[spect_key].shape[1]:
            raise ValueError(
                f'length of time_bins in {spect_path.name} '
                f'does not match number of columns in spectrogram')

        return spect_path, freq_bins, timebin_dur

    spect_paths_bag = db.from_sequence(spect_paths)

    log_or_print('validating set of spectrogram files',
                 logger=logger,
                 level='info')

    with ProgressBar():
        path_freqbins_timebin_dur_tups = list(spect_paths_bag.map(_validate))

    all_freq_bins = np.stack(
        [tup[1] for tup in path_freqbins_timebin_dur_tups])
    uniq_freq_bins = np.unique(all_freq_bins, axis=0)
    if len(uniq_freq_bins) != 1:
        raise ValueError(
            f'Found more than one frequency bin vector across files. '
            f'Instead found {len(uniq_freq_bins)}')

    timebin_durs = [tup[2] for tup in path_freqbins_timebin_dur_tups]
    uniq_durs = np.unique(timebin_durs)
    if len(uniq_durs) != 1:
        raise ValueError(
            'Found more than one duration for time bins across spectrogram files. '
            f'Durations found were: {uniq_durs}')

    return True
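# A minimal usage sketch, assuming a (hypothetical) directory of .npz
# spectrogram files saved with keys 's' (spectrogram), 'f' (frequency bins)
# and 't' (time bins):
from pathlib import Path

spect_paths = sorted(Path("spectrograms").glob("*.npz"))
assert is_valid_set_of_spect_files(spect_paths, spect_format="npz")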
Example no. 7
def run_experiment(show_plot=True):

    if platform not in ['win32', 'win64']:
        raise Exception("Rectifier.fmu is only available for Windows")

    print("Parameter variation on %s:" % fmu_filename)
    print("  VAC", v_ac)
    print("  IDC", i_dc)

    if sync:
        dask.set_options(
            get=dask.local.get_sync)  # synchronous scheduler

    # download the FMU
    download_test_file('2.0', 'CoSimulation', 'Dymola', '2017', 'Rectifier',
                       fmu_filename)

    # read the model description
    model_description = read_model_description(fmu_filename)

    # collect the value references for the variables to read / write
    vrs = {}
    for variable in model_description.modelVariables:
        vrs[variable.name] = variable.valueReference

    # extract the FMU
    unzipdir = fmpy.extract(fmu_filename)

    fmu_args = {
        'guid': model_description.guid,
        'modelIdentifier': model_description.coSimulation.modelIdentifier,
        'unzipDirectory': unzipdir
    }

    # get the value references for the start and output values
    start_vrs = [vrs['VAC'], vrs['IDC']]
    result_vrs = [vrs['uDC'], vrs['Losses']]

    indices = list(np.ndindex(I_DC.shape))

    chunks = []
    chunk_size = int(np.ceil(len(indices) / 10))

    # split the indices into 10 chunks
    for i in range(0, len(indices), chunk_size):
        chunks.append(
            [indices[i:i + chunk_size], fmu_args, start_vrs, result_vrs])

    print("Running %d simulations (%d chunks)..." % (V_AC.size, len(chunks)))
    with ProgressBar():
        # calculate the losses for every chunk
        results = bag.from_sequence(chunks).map(simulate_fmu).compute()

    LOSSES = np.zeros_like(V_AC)

    # put the results together
    for zipped, dll_handle in results:
        for i, res in zipped:
            LOSSES[i] = res[1]

    # unload the shared library
    if sync:
        while True:
            try:
                fmpy.freeLibrary(dll_handle)
            except Exception:
                break

    # clean up
    shutil.rmtree(unzipdir)

    if show_plot:
        print("Plotting results...")

        import matplotlib.pyplot as plt

        figure = plt.figure()
        figure.patch.set_facecolor('white')
        ax = figure.add_subplot(1, 1, 1)

        CS = plt.contourf(V_AC, I_DC, LOSSES, 10)
        plt.colorbar(CS, aspect=30)

        CS = ax.contour(V_AC, I_DC, LOSSES, 10, colors='k', linewidths=0.8)
        ax.clabel(CS=CS, fmt='%.0f', fontsize=9, inline=1)

        ax.set_title('Losses / W')
        ax.set_xlabel('AC Voltage / V')
        ax.set_ylabel('DC Current / A')

        plt.show()
    else:
        print("Plotting disabled")

    print("Done.")

    return LOSSES
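# A standalone sketch of the chunked dask.bag pattern used above: split the
# work into roughly ten chunks, map a worker over them and compute with a
# progress bar. The worker here is only a stand-in for simulate_fmu.
import numpy as np
from dask import bag
from dask.diagnostics import ProgressBar

def _work(chunk):
    indices, _args = chunk
    return [(i, sum(i)) for i in indices]

indices = list(np.ndindex((20, 20)))
chunk_size = int(np.ceil(len(indices) / 10))
chunks = [(indices[i:i + chunk_size], None)
          for i in range(0, len(indices), chunk_size)]

with ProgressBar():
    results = bag.from_sequence(chunks).map(_work).compute()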
Example no. 8
def modify_doc(doc):
    """Add plots to the document

    Parameters
    ----------
    doc : [type]
        A Bokeh document to which plots can be added

    """
    curDir = os.path.dirname(__file__)
    root = tk.Tk()
    root.withdraw()

    ProgressBar().register()

    filepath = filedialog.askopenfilename()
    filename = filepath[filepath.rfind('/') + 1:filepath.rfind('.')]

    monteData = dd.read_csv(filepath)

    monteData = monteData.fillna(0)

    plotData = monteData.compute()

    gc.collect()

    plotData.drop_duplicates(subset='L-string', inplace=True)
    plotData = plotData.reset_index(drop=True)
    for i in range(2, 6):
        plotData['{}-gram'.format(i)] = plotData['L-string'].apply(
            lambda x: [x[j:j + i] for j in range(0, len(x), i)])

    gc.collect()

    scatter = ColumnDataSource(data=plotData)
    line = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0]))
    rule1 = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0]))
    rule2 = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0]))
    polygon = ColumnDataSource(data=dict(x=[0], y=[0]))

    rule1_poly = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0]))
    rule2_poly = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0]))

    palette.reverse()

    mapper = log_cmap(field_name='Area', palette=palette, low=0, high=500)

    tooltips1 = [
        ('index', '$index'),
        ('F', '@{% of F}{0.0%}'),
        ('+', '@{% of +}{0.0%}'),
        ('-', '@{% of -}{0.0%}'),
    ]
    tooltips2 = [
        ('index', '$index'),
        ('F', '@{Longest F sequence}'),
        ('+', '@{Longest + sequence}'),
        ('-', '@{Longest - sequence}'),
    ]

    plots_width = 500
    plots_height = 500
    p1 = figure(plot_width=plots_width,
                plot_height=plots_height,
                tools='pan,wheel_zoom,box_zoom,reset,tap,save',
                title="Area",
                output_backend="webgl",
                tooltips=tooltips1)
    p1.xaxis.axis_label = 'Area'
    p1.yaxis.axis_label = '% of character'
    p1.scatter('Area',
               '% of F',
               size=7,
               source=scatter,
               color=mapper,
               alpha=0.6,
               nonselection_fill_color=mapper)

    p2 = figure(plot_width=plots_width,
                plot_height=plots_height,
                tools='pan,wheel_zoom,box_zoom,reset,tap,save',
                title="Area",
                output_backend="webgl",
                tooltips=tooltips2)
    p2.xaxis.axis_label = 'Area'
    p2.yaxis.axis_label = 'Length of sequence'
    p2.scatter('Area',
               'Longest F sequence',
               size=7,
               source=scatter,
               fill_color='red',
               color=mapper,
               alpha=0.6,
               nonselection_fill_color=mapper)

    p3 = figure(plot_width=plots_width,
                plot_height=plots_height,
                tools='pan,wheel_zoom,box_zoom,reset,tap,save',
                title="Selected Creature",
                output_backend="webgl")
    p3.axis.visible = False
    p3.grid.visible = False
    p3.line(x='x', y='y', line_color='red', source=line)
    p3.multi_polygons(xs='x', ys='y', source=polygon)

    p4 = figure(plot_width=plots_width,
                plot_height=plots_height,
                tools='pan,wheel_zoom,box_zoom,reset,tap,save',
                title="Area",
                output_backend="webgl")
    p4.scatter('Area',
               'Angle',
               size=7,
               source=scatter,
               color=mapper,
               alpha=0.6,
               nonselection_fill_color=mapper)
    p4.xaxis.axis_label = 'Area'
    p4.yaxis.axis_label = 'Angle (degrees)'

    p5 = figure(plot_width=plots_width,
                plot_height=plots_height // 2,
                title="Rule 1",
                output_backend="webgl")
    p5.line(x='x', y='y', line_color='red', source=rule1)
    p5.multi_polygons(xs='x', ys='y', source=rule1_poly)
    p5.axis.visible = False
    p5.grid.visible = False

    p6 = figure(plot_width=plots_width,
                plot_height=plots_height // 2,
                title="Rule 2",
                output_backend="webgl")
    p6.line(x='x', y='y', line_color='red', source=rule2)
    p6.multi_polygons(xs='x', ys='y', source=rule2_poly)
    p6.axis.visible = False
    p6.grid.visible = False

    L_string = Paragraph(text='Select creature', width=1500)

    grams = PreText(text='Select creature', width=400)
    rule_text = PreText(text='Select creature', width=400)

    area_label = Label(
        x=0,
        y=450,
        x_units='screen',
        y_units='screen',
        text='Select creature',
        render_mode='css',
        border_line_color='black',
        border_line_alpha=1.0,
        background_fill_color='white',
        background_fill_alpha=1.0,
    )

    length_label = Label(
        x=0,
        y=420,
        x_units='screen',
        y_units='screen',
        text='Select creature',
        render_mode='css',
        border_line_color='black',
        border_line_alpha=1.0,
        background_fill_color='white',
        background_fill_alpha=1.0,
    )

    p3.add_layout(area_label)
    p3.add_layout(length_label)

    def plot_source(coords):
        """[summary]

        Returns
        -------
        [type]
            [description]
        """
        instance_linestring = LineString(coords[:, 0:2])
        instance_patch = instance_linestring.buffer(0.5)
        instance_x, instance_y = instance_patch.exterior.coords.xy
        return instance_x, instance_y

    def mapper(string, angle):
        theta = 0

        num_chars = len(string)

        coords = np.zeros((num_chars + 1, 3), np.double)

        def makeRotMat(theta):
            rotMat = np.array(((cos(theta), -sin(theta), 0),
                               (sin(theta), cos(theta), 0), (0, 0, 1)))
            return rotMat

        rotVec = makeRotMat(theta)

        dir_vec = np.array((0, 1, 0), np.float64)
        i = 1

        for c in string:
            if c == 'F':
                coords[i] = (coords[i - 1] + (1 * dir_vec))
                i += 1

            if c == '-':
                theta = theta - angle
                rotVec = makeRotMat(theta)
                dir_vec = np.dot(rotVec, dir_vec)

            if c == '+':
                theta = theta + angle
                rotVec = makeRotMat(theta)
                dir_vec = np.dot(rotVec, dir_vec)

        coords = np.delete(coords, np.s_[i:], 0)
        return coords

    def plot_creature(event):
        line.data = dict(x=[0, 0], y=[0, 0])
        polygon.data = dict(x=[0, 0], y=[0, 0])
        rule1.data = dict(x=[0, 0], y=[0, 0])
        rule2.data = dict(x=[0, 0], y=[0, 0])
        rule1_poly.data = dict(x=[0, 0], y=[0, 0])
        rule2_poly.data = dict(x=[0, 0], y=[0, 0])
        L_string.text = 'Select creature'
        area_label.text = 'Select creature'
        length_label.text = 'Select creature'
        rule_text.text = 'Select creature'

        if len(scatter.selected.indices) > 0:

            creature_index = scatter.selected.indices[0]
            creature = plotData.iloc[creature_index, :]
            coords = np.array(ast.literal_eval(creature['Coordinates']))

            L_string.text = '{}'.format(creature['L-string'])
            area_label.text = 'Area: {:.2f}'.format(creature['Area'])
            length_label.text = 'Length of L-string: {}'.format(
                len(creature['L-string']))

            gram_frame_1 = pd.DataFrame.from_dict(
                {
                    '2-gram': creature['2-gram'],
                    '3-gram': creature['3-gram'],
                    '4-gram': creature['4-gram'],
                    '5-gram': creature['5-gram'],
                },
                orient='index').T

            counts = [
                pd.value_counts(
                    gram_frame_1[i]).reset_index().astype(str).apply(
                        ' '.join, axis=1) for i in gram_frame_1
            ]
            out = pd.concat(counts, axis=1).fillna('')
            out.columns = gram_frame_1.columns
            grams.text = str(tabulate(out, headers='keys'))

            creature_linestring = LineString(coords[:, 0:2])
            creature_patch = creature_linestring.buffer(0.5)
            patch_x, patch_y = creature_patch.exterior.coords.xy

            x_points = [list(patch_x)]
            y_points = [list(patch_y)]

            for i, _ in enumerate(creature_patch.interiors):
                x_in, y_in = creature_patch.interiors[i].coords.xy
                x_points.append(list(x_in))
                y_points.append(list(y_in))

            x_points = [[x_points]]
            y_points = [[y_points]]

            line.data = dict(x=coords[:, 0], y=coords[:, 1])
            polygon.data = dict(x=x_points, y=y_points)

            p3.match_aspect = True

            rules = ast.literal_eval(creature['Rules'])
            rules = rules['X']
            rules = rules['options']

            rule_text.text = 'Rule 1: \t' + \
                rules[0] + '\n' + 'Rule 2: \t' + rules[1]

            if any(char == 'F' for string in rules[0] for char in string):
                rule1_c = mapper(rules[0], creature['Angle'])

                rule1_morphology = LineString(rule1_c[:, 0:2])
                rule1_patch = rule1_morphology.buffer(0.5)
                rpatch_x, rpatch_y = rule1_patch.exterior.coords.xy

                r1_points_x = [list(rpatch_x)]
                r1_points_y = [list(rpatch_y)]

                for i, _ in enumerate(rule1_patch.interiors):
                    x_in, y_in = rule1_patch.interiors[i].coords.xy
                    r1_points_x.append(list(x_in))
                    r1_points_y.append(list(y_in))

                r1_points_x = [[r1_points_x]]
                r1_points_y = [[r1_points_y]]

                rule1.data = dict(x=rule1_morphology.coords.xy[0],
                                  y=rule1_morphology.coords.xy[1])
                rule1_poly.data = dict(x=r1_points_x, y=r1_points_y)

                p5.match_aspect = True

            if any(char == 'F' for string in rules[1] for char in string):
                rule2_c = mapper(rules[1], creature['Angle'])

                rule2_morphology = LineString(rule2_c[:, 0:2])
                rule2_patch = rule2_morphology.buffer(0.5)
                r2patch_x, r2patch_y = rule2_patch.exterior.coords.xy

                r2_points_x = [list(r2patch_x)]
                r2_points_y = [list(r2patch_y)]

                for i, _ in enumerate(rule2_patch.interiors):
                    x_in, y_in = rule2_patch.interiors[i].coords.xy
                    r2_points_x.append(list(x_in))
                    r2_points_y.append(list(y_in))

                r2_points_x = [[r2_points_x]]
                r2_points_y = [[r2_points_y]]

                rule2.data = dict(x=rule2_morphology.coords.xy[0],
                                  y=rule2_morphology.coords.xy[1])
                rule2_poly.data = dict(x=r2_points_x, y=r2_points_y)

                p6.match_aspect = True

        else:
            line.data = dict(x=[0, 0], y=[0, 0])
            polygon.data = dict(x=[0, 0], y=[0, 0])
            rule1.data = dict(x=[0, 0], y=[0, 0])
            rule2.data = dict(x=[0, 0], y=[0, 0])
            rule1_poly.data = dict(x=[0, 0], y=[0, 0])
            rule2_poly.data = dict(x=[0, 0], y=[0, 0])
            L_string.text = 'Select creature'
            area_label.text = 'Select creature'
            length_label.text = 'Select creature'
            rule_text.text = 'Select creature'

    p1.on_event(Tap, plot_creature)
    p2.on_event(Tap, plot_creature)
    p4.on_event(Tap, plot_creature)

    top_row = row(L_string)
    middle_row = row(p1, p2, p4)
    bottom_row_right = column(p5, p6)
    bottom_row_middle = column(grams, rule_text)
    bottom_row = row(p3, Spacer(width=50), bottom_row_middle, Spacer(width=50),
                     bottom_row_right)
    layout = column(top_row, middle_row, bottom_row)

    doc.add_root(layout)
Example no. 9
def lasso_tuning(alpha=[1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10],
                 k=5,
                 train_data_path='../data/training_data.csv',
                 save_model=False,
                 tracking_uri="http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("alpha", alpha)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train,
     y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[(
        'scaling',
        StandardScaler()), ('regression', Lasso(random_state=RANDOM_SEED))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__alpha'] = alpha

    print("Training started...\n")

    # Create a grid search over the Lasso pipeline and fit it for every hyperparameter combination using all processors
    modelCV = GridSearchCV(estimator=pipeline,
                           param_grid=hyperparams,
                           cv=k,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(
        f"\nBest parameter set found for the training set:\n{modelCV.best_params_}"
    )

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_alpha = modelCV.best_params_['regression__alpha']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # The criterion varied here is the regularisation strength alpha
    criteria = 'alpha'
    mlflow.set_tag("criteria", criteria)
    param_values = alpha

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    Lasso(alpha=param_value, random_state=RANDOM_SEED))])
        param = {'regression__alpha': param_value}

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae,
         r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print(
            "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..."
        )
        final_model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    Lasso(alpha=param_list[best_index]['regression__alpha']))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
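# A minimal invocation sketch, assuming an MLflow tracking server is running
# and the (hypothetical) training CSV path below exists:
import mlflow

mlflow.set_tracking_uri("http://0.0.0.0:5000")
with mlflow.start_run():
    lasso_tuning(alpha=[1e-3, 1e-2, 1e-1, 1],
                 k=5,
                 train_data_path='../data/training_data.csv',
                 save_model=True)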
Example no. 10
def netcdf_to_ascii(homedir,
                    subdir,
                    source_directory,
                    mappingfile,
                    catalog_label,
                    meta_file,
                    temporal_resolution='D',
                    netcdfs=None,
                    variable_list=None):
    # initialize list of dataframe outputs
    outfiledict = {}

    # generate destination folder
    filedir = os.path.join(homedir, subdir)
    ogh.ensure_dir(filedir)

    # connect with collection of netcdfs
    if isinstance(netcdfs, type(None)):
        netcdfs = [
            os.path.join(source_directory, file)
            for file in os.listdir(source_directory) if file.endswith('.nc')
        ]
    ds_mf = xray.open_mfdataset(netcdfs, engine='netcdf4').sortby('TIME')

    # generate list of variables
    if not isinstance(variable_list, type(None)):
        ds_vars = variable_list.copy()
    else:
        ds_vars = [
            ds_var for ds_var in dict(ds_mf.variables).keys()
            if ds_var not in ['YEAR', 'MONTH', 'DAY', 'TIME', 'LAT', 'LON']
        ]

    # convert the netCDF collection to a pandas DataFrame
    ds_pan = ds_mf.to_dataframe()[ds_vars]

    # read in gridded cells of interest
    maptable, nstation = ogh.mappingfileToDF(mappingfile,
                                             colvar=None,
                                             summary=False)

    # at each latlong of interest
    for ind, eachrow in maptable.iterrows():

        # generate ASCII time-series
        ds_df = ds_pan.loc[eachrow['LAT'],
                           eachrow['LONG_'], :].reset_index(drop=True,
                                                            level=[0, 1])

        # create file name
        outfilename = os.path.join(
            filedir, 'data_{0}_{1}'.format(eachrow['LAT'], eachrow['LONG_']))

        # save ds_df
        outfiledict[outfilename] = da.delayed(ds_df.to_csv)(
            path_or_buf=outfilename, sep='\t', header=False, index=False)

    # compute ASCII time-series files
    ProgressBar().register()
    outfiledict = da.compute(outfiledict)[0]

    # annotate metadata file
    meta_file[catalog_label] = dict(ds_mf.attrs)
    meta_file[catalog_label]['variable_list'] = list(np.array(ds_vars))
    meta_file[catalog_label]['delimiter'] = '\t'
    meta_file[catalog_label]['start_date'] = pd.Series(
        ds_mf.TIME).sort_values().iloc[0].strftime('%Y-%m-%d %H:%M:%S')
    meta_file[catalog_label]['end_date'] = pd.Series(
        ds_mf.TIME).sort_values().iloc[-1].strftime('%Y-%m-%d %H:%M:%S')
    meta_file[catalog_label]['temporal_resolution'] = temporal_resolution
    meta_file[catalog_label]['variable_info'] = dict(ds_mf.variables)

    # catalog the output files
    ogh.addCatalogToMap(outfilepath=mappingfile,
                        maptable=maptable,
                        folderpath=filedir,
                        catalog_label=catalog_label)
    os.chdir(homedir)
    return (list(outfiledict.keys()))
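# A standalone sketch of the delayed-write pattern used above: queue one
# dask.delayed to_csv call per output file, then compute them all at once
# while a progress bar is registered. File names here are illustrative.
import dask
import pandas as pd
from dask.diagnostics import ProgressBar

ProgressBar().register()
frames = {'data_47.5_-120.5': pd.DataFrame({'x': range(5)}),
          'data_47.5_-120.4': pd.DataFrame({'x': range(5)})}
tasks = {name: dask.delayed(df.to_csv)(path_or_buf=name, sep='\t',
                                       header=False, index=False)
         for name, df in frames.items()}
written = dask.compute(tasks)[0]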
Example no. 11
def lr_deconvolution(image, psf, iterations=50):
    """
    Tiled Lucy-Richardson deconvolution using DECON_LIBRARY

    :param image: ndarray
        raw data
    :param psf: ndarray
        theoretical PSF
    :param iterations: int
        number of iterations to run 
    :return deconvolved: ndarray
        deconvolved image
    """

    # create dask array
    scan_chunk_size = 512
    if image.shape[0] < scan_chunk_size:
        dask_raw = da.from_array(image,
                                 chunks=(image.shape[0], image.shape[1],
                                         image.shape[2]))
        overlap_depth = (0, 2 * psf.shape[1], 2 * psf.shape[1])
    else:
        dask_raw = da.from_array(image,
                                 chunks=(scan_chunk_size, image.shape[1],
                                         image.shape[2]))
        overlap_depth = 2 * psf.shape[0]
    del image
    gc.collect()

    if DECON_LIBRARY == 'dexp':
        # define dask dexp partial function for GPU LR deconvolution
        lr_dask = partial(dexp_lr_decon,
                          psf=psf,
                          num_iterations=iterations,
                          padding=2 * psf.shape[0],
                          internal_dtype=np.float16)
    else:
        lr_dask = partial(mv_lr_decon, psf=psf, num_iterations=iterations)

    # create dask plan for overlapped blocks
    dask_decon = da.map_overlap(lr_dask,
                                dask_raw,
                                depth=overlap_depth,
                                boundary=None,
                                trim=True,
                                meta=np.array((), dtype=np.uint16))

    # perform LR deconvolution in blocks
    if DECON_LIBRARY == 'dexp':
        with CupyBackend(enable_cutensor=True,
                         enable_cub=True,
                         enable_fft_planning=True):
            with ProgressBar():
                decon_data = dask_decon.compute(scheduler='single-threaded')
    else:
        with ProgressBar():
            decon_data = dask_decon.compute(scheduler='single-threaded')

    # clean up memory
    cp.clear_memo()
    del dask_decon
    gc.collect()

    return decon_data.astype(np.uint16)
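# A minimal invocation sketch, assuming the deconvolution backend selected by
# DECON_LIBRARY (dexp, or the mv_lr_decon fallback) is installed; image and
# PSF shapes below are arbitrary placeholders.
import numpy as np

image = np.random.randint(0, 2**16, size=(128, 256, 256), dtype=np.uint16)
psf = np.random.rand(16, 16, 16).astype(np.float32)
deconvolved = lr_deconvolution(image, psf, iterations=20)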
Example no. 12
    def readData(self, runNumber=None, pulseIdInterval=None, path=None):
        """Read data by run number or macrobunch pulseID interval.

        Useful for scans that would otherwise hit the machine's memory limit.

        **Parameters**\n
        runNumber: int | None (default to ``self.runNumber``)
            number of the run from which to read data. If None, requires pulseIdInterval.
        pulseIdInterval: (int, int) | None (default to ``self.pulseIdInterval``)
            first and last macrobunches of selected data range. If None, the whole run
            defined by runNumber will be taken.
        path: str | None (default to ``self.DATA_RAW_DIR``)
            path to location where raw HDF5 files are stored.

        This is a union of the readRun and readInterval methods defined in previous versions.
        """

        # Update instance attributes based on input parameters
        if runNumber is None:
            runNumber = self.runNumber
        else:
            self.runNumber = runNumber

        if pulseIdInterval is None:
            pulseIdInterval = self.pulseIdInterval
        else:
            self.pulseIdInterval = pulseIdInterval

        if (pulseIdInterval is None) and (runNumber is None):
            raise ValueError('Need either runNumber or pulseIdInterval to know what data to read.')


        if path is not None:
            try:
                daqAccess = BeamtimeDaqAccess.create(path)
            except:
                self.path_to_run = misc.get_path_to_run(runNumber, path)
                daqAccess = BeamtimeDaqAccess.create(self.path_to_run)
        else:
            path = self.DATA_RAW_DIR
            self.path_to_run = misc.get_path_to_run(runNumber, path)
            daqAccess = BeamtimeDaqAccess.create(self.path_to_run)
        
        self.daqAddresses = []
        self.pulseIdInterval = self.getIds(runNumber, path)
        # Parse the settings file in the DAQ channels section for the list of
        # h5 addresses to read from raw and add to the dataframe.
        print('loading data...')

        for name, entry in self.settings['DAQ channels'].items():
            name = misc.camelCaseIt(name)
            val = str(entry)
            if daqAccess.isChannelAvailable(val, self.pulseIdInterval):
                self.daqAddresses.append(name)
                if _VERBOSE:
                    print('assigning address: {}: {}'.format(name.ljust(20), val))
                setattr(self, name, val)
            else:
                # if _VERBOSE:
                print('skipping address missing from data: {}: {}'.format(name.ljust(20), val))

        # TODO: get the available pulse id from PAH
        if pulseIdInterval is None:
            print('Reading DAQ data from run {}... Please wait...'.format(runNumber))

            for address_name in self.daqAddresses:
                if _VERBOSE:
                    print('reading address: {}'.format(address_name))
                try:
                    attrVal = getattr(self, address_name)
                    values, otherStuff = daqAccess.allValuesOfRun(attrVal, runNumber)
                except AssertionError:
                    print('Assertion error while reading address {} ({})'.format(address_name, attrVal))

                setattr(self, address_name, values)
                if address_name == 'macroBunchPulseId':  # catch the value of the first macrobunchID
                    pulseIdInterval = (otherStuff[0], otherStuff[-1])
                    self.pulseIdInterval = pulseIdInterval
                    macroBunchPulseId_correction = pulseIdInterval[0]

                if address_name == 'timeStamp':  # catch the time stamps
                    startEndTime = (values[0,0], values[-1,0])
                    self.startEndTime = startEndTime

            numOfMacrobunches = pulseIdInterval[1] - pulseIdInterval[0]



        else:
            print('reading DAQ data from interval {}'.format(pulseIdInterval))
            self.pulseIdInterval = pulseIdInterval
            for address_name in self.daqAddresses:
                if _VERBOSE:
                    print('reading address: {}'.format(address_name))
                setattr(self, address_name, daqAccess.valuesOfInterval(getattr(self, address_name), pulseIdInterval))
            numOfMacrobunches = pulseIdInterval[1] - pulseIdInterval[0]
            macroBunchPulseId_correction = pulseIdInterval[0]

        # necessary corrections for specific channels:
        try:
            self.delayStage = self.delayStage[:, 1]
        except:
            try:
                self.delayStage = self.delayStage[:, 0]
                print('1030nm Laser')
            except:
                print('no delay stage')
        self.macroBunchPulseId -= macroBunchPulseId_correction
        self.dldMicrobunchId -= self.UBID_OFFSET

        if _VERBOSE:
            print('Counting electrons...')

        electronsToCount = self.dldPosX.copy().flatten()
        electronsToCount = np.nan_to_num(electronsToCount)
        electronsToCount = electronsToCount[electronsToCount > 0]
        electronsToCount = electronsToCount[electronsToCount < 10000]
        self.numOfElectrons = len(electronsToCount)
        self.electronsPerMacrobunch = int(self.numOfElectrons / numOfMacrobunches)

        self.runInfo = {
            'runNumber':self.runNumber,
            'pulseIdInterval':self.pulseIdInterval,
            'numberOfMacrobunches': numOfMacrobunches,
            'numberOfElectrons':self.numOfElectrons,
            'electronsPerMacrobunch': self.electronsPerMacrobunch,
        }
        try:
            self.runInfo['timestampStart'] = self.startEndTime[0].astype(int)
            self.runInfo['timestampStop'] = self.startEndTime[1].astype(int)
            self.runInfo['timestampDuration'] = (self.startEndTime[1] - self.startEndTime[0]).astype(int)
            self.runInfo['timeStart'] = datetime.utcfromtimestamp(self.startEndTime[0]).strftime('%Y-%m-%d %H:%M:%S')
            self.runInfo['timeStop'] = datetime.utcfromtimestamp(self.startEndTime[1]).strftime('%Y-%m-%d %H:%M:%S')
            self.runInfo['timeDuration'] = datetime.utcfromtimestamp(self.startEndTime[1]) - datetime.utcfromtimestamp(self.startEndTime[0])
        except:
            self.runInfo['timestampStart'] = None
            self.runInfo['timestampStop'] = None
            self.runInfo['timestampDuration'] = None
            self.runInfo['timeStart'] = None
            self.runInfo['timeStop'] = None
            self.runInfo['timeDuration'] = None

        self.printRunOverview()

        # Old Print style
        # print('Run {0} contains {1:,} Macrobunches, from {2:,} to {3:,}' \
        #       .format(runNumber, numOfMacrobunches, pulseIdInterval[0], pulseIdInterval[1]))
        # try:
        #     print("start time: {}, end time: {}, total time: {}"
        #           .format(datetime.utcfromtimestamp(startEndTime[0]).strftime('%Y-%m-%d %H:%M:%S'),
        #                   datetime.utcfromtimestamp(startEndTime[1]).strftime('%Y-%m-%d %H:%M:%S'),
        #                   datetime.utcfromtimestamp(startEndTime[1] - startEndTime[0]).strftime('%H:%M:%S')))
        # except:
        #     pass
        #
        # print("Number of electrons: {0:,}; {1:,} e/Mb ".format(self.numOfElectrons, self.electronsPerMacrobunch))

        print("Creating dataframes... Please wait...")
        with ProgressBar():
            self.createDataframePerElectron()
            print('Electron dataframe created.')
            self.createDataframePerMicrobunch()
            print('Microbunch dataframe created.')
            print('Reading Complete.')
Example no. 13
def test_no_tasks(capsys):
    with ProgressBar():
        get({'x': 1}, 'x')
    check_bar_completed(capsys)
Example no. 14
def test_minimum_time(capsys):
    with ProgressBar(1.0):
        out = get(dsk, 'e')
    out, err = capsys.readouterr()
    assert out == '' and err == ''
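# The positional argument above is ProgressBar's `minimum` parameter: the bar
# is only drawn for computations that run longer than that many seconds, so a
# fast graph prints nothing. A sketch of the equivalent keyword form:
def test_minimum_time_keyword(capsys):
    with ProgressBar(minimum=1.0):
        out = get(dsk, 'e')
    out, err = capsys.readouterr()
    assert out == '' and err == ''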
Example no. 15
def predict(args):
    # Numpy arrays

    # Convert source data into dask arrays
    radec, stokes = parse_sky_model(args.sky_model)
    radec = da.from_array(radec, chunks=(SOURCE_CHUNKS, 2))
    stokes = da.from_array(stokes, chunks=(SOURCE_CHUNKS, 4))

    # Get the support tables
    tables = support_tables(args, ["FIELD", "DATA_DESCRIPTION",
                                   "SPECTRAL_WINDOW", "POLARIZATION"])

    field_ds = tables["FIELD"]
    ddid_ds = tables["DATA_DESCRIPTION"]
    spw_ds = tables["SPECTRAL_WINDOW"]
    pol_ds = tables["POLARIZATION"]

    # List of write operations
    writes = []

    # Construct a graph for each DATA_DESC_ID
    for xds in xds_from_ms(args.ms,
                           columns=["UVW", "ANTENNA1", "ANTENNA2", "TIME"],
                           group_cols=["FIELD_ID", "DATA_DESC_ID"],
                           chunks={"row": args.row_chunks}):

        # Extract frequencies from the spectral window associated
        # with this data descriptor id
        field = field_ds[xds.attrs['FIELD_ID']]
        ddid = ddid_ds[xds.attrs['DATA_DESC_ID']]
        spw = spw_ds[ddid.SPECTRAL_WINDOW_ID.values]
        pol = pol_ds[ddid.POLARIZATION_ID.values]
        frequency = spw.CHAN_FREQ.data

        corrs = pol.NUM_CORR.values

        lm = radec_to_lm(radec, field.PHASE_DIR.data)
        uvw = -xds.UVW.data if args.invert_uvw else xds.UVW.data

        # (source, row, frequency)
        phase = phase_delay(lm, uvw, frequency)

        brightness = convert(stokes, ["I", "Q", "U", "V"],
                             corr_schema(pol))

        # (source, row, frequency, corr1, corr2)
        jones = da.einsum(einsum_schema(pol), phase, brightness)

        # Identify time indices
        _, time_index = da.unique(xds.TIME.data, return_inverse=True)

        # Predict visibilities
        vis = predict_vis(time_index, xds.ANTENNA1.data, xds.ANTENNA2.data,
                          None, jones, None, None, None, None)

        # Reshape (2, 2) correlation to shape (4,)
        if corrs == 4:
            vis = vis.reshape(vis.shape[:2] + (4,))

        # Assign visibilities to MODEL_DATA array on the dataset
        model_data = xr.DataArray(vis, dims=["row", "chan", "corr"])
        xds = xds.assign(MODEL_DATA=model_data)
        # Create a write to the table
        write = xds_to_table(xds, args.ms, ['MODEL_DATA'])
        # Add to the list of writes
        writes.append(write)

    # Submit all graph computations in parallel
    with ProgressBar():
        dask.compute(writes)
Example no. 16
def predict_xr(model, input_xr, progress=True):
    """
    Utilise our wrappers to predict with a vanilla sklearn model.

    Last modified: September 2019

    Parameters
    ----------
    model : a scikit-learn model or compatible object
        Must have a predict() method that takes numpy arrays.
    input_xr : xarray.DataArray or xarray.Dataset
        Must have dimensions 'x' and 'y', may have dimension 'time'.
    progress : bool, optional
        If True (default), display a dask progress bar during prediction.

    Returns
    -------
    output_xr : xarray.DataArray 
        An xarray.DataArray containing the prediction output from model 
        with input_xr as input. Has the same spatiotemporal structure 
        as input_xr.

    """
    def _get_class_ufunc(*args):
        """
        ufunc to apply classification to chunks of data
        """
        input_data_flattened = []
        for data in args:
            input_data_flattened.append(data.flatten())

        # Flatten array
        input_data_flattened = np.array(input_data_flattened).transpose()

        # Mask out no-data in input (not all classifiers can cope with
        # Inf or NaN values)
        input_data_flattened = np.where(np.isfinite(input_data_flattened),
                                        input_data_flattened, 0)

        # Actually apply the classification
        out_class = model.predict(input_data_flattened)

        # Mask out NaN or Inf values in results
        out_class = np.where(np.isfinite(out_class), out_class, 0)

        # Reshape when writing out
        return out_class.reshape(args[0].shape)

    def _get_class(*args):
        """
        Apply classification to xarray DataArrays.

        Uses dask to run chunks at a time in parallel

        """
        out = xr.apply_ufunc(_get_class_ufunc,
                             *args,
                             dask='parallelized',
                             output_dtypes=[np.uint8])

        return out

    # Set up a list of input data using variables passed in
    input_data = []

    for var_name in input_xr.data_vars:
        input_data.append(input_xr[var_name])

    # Run through classification. Need to expand and have a separate
    # dataframe for each variable so chunking in dask works.
    if progress:
        with ProgressBar():
            out_class = _get_class(*input_data).compute()
    else:
        out_class = _get_class(*input_data).compute()

    # Set the stacked coordinate to match the input
    output_xr = xr.DataArray(out_class, coords=input_xr.coords)

    return output_xr
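# A minimal usage sketch for predict_xr, assuming scikit-learn is available;
# the dataset and classifier below are synthetic stand-ins.
import numpy as np
import xarray as xr
from sklearn.ensemble import RandomForestClassifier

ds = xr.Dataset(
    {band: (("y", "x"), np.random.rand(100, 100)) for band in ("red", "nir")}
).chunk({"y": 50, "x": 50})
clf = RandomForestClassifier(n_estimators=10).fit(
    np.random.rand(20, 2), np.random.randint(0, 2, 20))
classified = predict_xr(clf, ds)                   # shows a progress bar
classified_quiet = predict_xr(clf, ds, progress=False)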
Example no. 17
def analyze_samples(data, features, copy=False):
    """Calculate the set of specified `features` for every sample, defined as the set of
    molecules corresponding to every cell-gene pair.

    Parameters
    ----------
    data : AnnData
        Spatially formatted AnnData
    features : list of :class:`SampleFeature`
        List of :class:`SampleFeature` to compute.
    copy : bool
        Return a copy of `data` instead of writing to data, by default False.

    Returns
    -------
    adata : anndata.AnnData
        Returns `adata` if `copy=True`, otherwise adds fields to `data`:
        `.layers[`keys`]`
            See the output of each :class:`SampleFeature` in `features` for keys added.
    """
    adata = data.copy() if copy else data

    pbar = tqdm(desc="Cell features", total=3)
    # Cast features to type list
    if not isinstance(features, list):
        features = [features]
    features = [sample_features[f] for f in features]

    cell_features = set()  # Cell-level fns to run
    cell_attributes = set(
    )  # Cell-level attributes needed to compute sample features
    for f in features:
        cell_features.update(f.cell_features)
        cell_attributes.update(f.cell_attributes)

    cell_features = list(cell_features)
    cell_attributes = list(cell_attributes)

    tl.analyze_cells(adata, cell_features, progress=False)

    # Make sure attributes are present
    attrs_found = set(cell_attributes).intersection(
        set(adata.obs.columns.tolist()))
    if len(attrs_found) != len(cell_attributes):
        raise KeyError(f"df does not have all columns: {cell_attributes}.")

    pbar.update()

    pbar.set_description("Sample features")
    # extract cell attributes
    points_df = (get_points(adata, asgeo=True).set_index("cell").join(
        data.obs[cell_attributes]).reset_index().sort_values(
            ["cell", "gene"]).reset_index(drop=True))

    # Handle categories as strings to avoid ambiguous cat types
    for col in points_df.loc[:, (points_df.dtypes == 'category').values]:
        points_df[col] = points_df[col].astype(str)

    # Handle shape indexes as strings to avoid ambiguous types
    for shape_name in adata.obs.columns[adata.obs.columns.str.endswith(
            '_shape')]:
        shape_prefix = '_'.join(shape_name.split('_')[:-1])
        if shape_prefix in points_df.columns:
            points_df[shape_prefix] = points_df[shape_prefix].astype(str)

    # Calculate features for a sample
    def process_sample(df):
        sample_output = {}
        for f in features:
            sample_output.update(f.extract(df))
        return sample_output

    # Process all samples in a partition
    def process_partition(partition_df):
        return partition_df.groupby(["cell", "gene"],
                                    observed=True).apply(process_sample)

    # Cast to dask dataframe
    ddf = dask_geopandas.from_geopandas(points_df, npartitions=1)

    # Partition so only 1000 groups per groupby
    _, group_loc = np.unique(
        points_df["cell"].astype(str) + "-" + points_df["gene"].astype(str),
        return_index=True,
    )
    divisions = [group_loc[loc] for loc in range(0, len(group_loc), 1000)]
    divisions.append(len(points_df) - 1)
    ddf = ddf.repartition(divisions=divisions)

    # Parallel process each partition
    with ProgressBar():
        # Run on a single sample to get output metadata
        meta_output = process_partition(points_df.head())
        meta = pd.DataFrame(meta_output.tolist(), index=meta_output.index)
        output = ddf.map_partitions(process_partition,
                                    meta=meta.dtypes).compute()

    pbar.update()
    pbar.set_description("Saving to AnnData")

    # Format from Series of dicts to DataFrame
    output = pd.DataFrame(output.tolist(), index=output.index).reset_index()

    # Save results to data layers
    feature_names = output.columns[~output.columns.isin(["cell", "gene"])]
    for feature_name in feature_names:
        adata.layers[feature_name] = (output.pivot(
            index="cell", columns="gene", values=feature_name).reindex(
                index=adata.obs_names, columns=adata.var_names).astype(float))

    pbar.update()
    pbar.set_description('Done!')
    pbar.close()
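A self-contained sketch of the map_partitions-with-meta pattern used above, run on hypothetical toy data (none of these names come from the original example); a small pandas sample is processed eagerly first so dask knows the output schema.

import dask.dataframe as dd
import pandas as pd

toy = pd.DataFrame({"cell": ["a", "a", "b"], "gene": ["g1", "g2", "g1"],
                    "x": [1.0, 2.0, 3.0]})

def per_sample_stats(partition):
    # One row of summary statistics per (cell, gene) group.
    return partition.groupby(["cell", "gene"], observed=True)["x"].agg(["mean", "count"])

meta = per_sample_stats(toy.head())            # small eager run defines the output schema
ddf = dd.from_pandas(toy, npartitions=1)
result = ddf.map_partitions(per_sample_stats, meta=meta).compute()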
Esempio n. 18
0
def sumTokenCounts(stores,data):
	max_str_bytes = 50
	chunksize = 100000
	batch_limit = 6*10**8
	savestore = data + "final/fromnodes-323.h5"

	for storefile in stores:
		print(storefile)
		logging.info("Next store: %s" % storefile)
		try:
			# Get Unique languages
			with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store:
				langs = set([key.split("/", maxsplit=-1)[-1] for key in store.keys() if 'merged1' in key])
		except:
			logging.exception("Can't read languages from %s" % storefile)
			continue

		for lang in langs:
			batch = False
			logging.info("Starting lang %s from %s" % (lang, storefile))
			print(lang)

			if not re.match('[a-z]{3}', lang):
				logging.error("lang '%s' is not three alphanumeric characters. Skipping for now. (%s)" % (lang, storefile))
				continue

			try:
				ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, mode='r')
			except:
				logging.exception("Can't load Dask DF for %s in %s" % (lang, storefile))
				continue

			# Assuming partitions are equally sized, which they should be if read from a single file
			if ddf.npartitions > np.ceil(batch_limit/chunksize):
				batch = True
				niters = np.floor((ddf.npartitions*chunksize)/batch_limit)
				i = 0

			while True:
				if batch:
					start = i * batch_limit
					logging.info("Starting batch %d for %s" % (i, lang))
					if i == niters:
						# Last batch, no stop value
						ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, start=start)
						batch = False
					else:
						ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize,start=start, stop=(start+batch_limit))
						i += 1
				try:
					logging.info("Starting full merge for %s with %d partitions" % (lang, ddf.npartitions))
					with ProgressBar():
						full_merge = ddf.reset_index().groupby('token').sum().compute()
					#if lang == 'eng':
						# For curiosity: see the profiling for English
					#    prof.visualize()
					logging.info("Success! Saving merged.")
					# The /fromnodes table is the sum from all the different stores, but will need to be summed one more time
					with pd.HDFStore(savestore, complevel=9, mode="a", complib='blosc') as store:
						store.append(lang,full_merge,data_columns=['count'],min_itemsize = {'index': max_str_bytes})
				except:
					logging.exception("Can't compute or save lang for %s in %s" % (lang, storefile))

				if batch == False:
					break
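The comment above notes that the tables written to savestore still have to be summed once more across stores; a hedged sketch of that final pass (the path and key layout are assumptions carried over from the function above).

import pandas as pd

savestore = data + "final/fromnodes-323.h5"        # assumed, as in sumTokenCounts
with pd.HDFStore(savestore, mode="r") as store:
    lang_keys = store.keys()

for key in lang_keys:
    partial = pd.read_hdf(savestore, key)
    # Tokens appended from different stores appear multiple times; sum them.
    final_counts = partial.groupby(level=0).sum()
    # final_counts could then be written to a separate, fully merged store.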
Esempio n. 19
0
    def to_dask(self, pages=None, persist=False, progress=True):
        try:
            import dask

        except ImportError:
            raise RuntimeError("Dask is not installed.")
        if progress:
            from dask.diagnostics import ProgressBar
            ProgressBar().register()
        if pages is None:
            pages = self.page_numbers
        columns = [(k, DASK_TYPE_MAPPING[v.get("type", 'string')])
                   for k, v in self.schema.items()
                   if k in self.fields and not k.startswith("_")]
        column_types = dict(columns)

        url = self._url
        client_kwargs = self.session.get_client_kwargs()
        if client_kwargs["app"] is not None:
            client_kwargs["app"] = dict(client_kwargs["app"].config)

        def get_data(params):
            import httpx
            if client_kwargs["app"] is not None:
                from eve import Eve
                client_kwargs["app"] = Eve(settings=client_kwargs["app"])
            items = []
            with httpx.Client(**client_kwargs) as client:
                try:
                    resp = client.get(
                        url,
                        params=params,
                    )
                    items = resp.json().get("_items", [])
                except Exception:
                    # Leave `items` empty if the request or JSON decoding fails.
                    pass
            data = [{
                k: column_types[k](v)
                for k, v in item.items() if k in column_types
            } for item in items]
            return data

        if not self.is_tabular:
            import dask.bag as db
            return db.from_sequence([self.get_page_kwargs(i)
                                     for i in pages]).map(get_data).flatten()

        import dask.dataframe as dd
        import pandas as pd

        def get_df(params):
            data = get_data(params)
            return pd.DataFrame(data, columns=list(column_types))

        dask_name = str(
            hash((self.name, ) + tuple(self.get_page_kwargs(1).values())))
        dsk = {(dask_name, i - 1): (get_df, self.get_page_kwargs(i))
               for i in pages}

        nitems = self.nitems
        divisions = list(range(0, nitems, self.items_per_page))
        if nitems not in divisions:
            divisions = divisions + [nitems]

        df = dd.DataFrame(dsk, dask_name, columns, divisions)
        if persist:
            return df.persist()
        return df
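An alternative, hedged sketch of the same lazy construction using dask.delayed and dd.from_delayed instead of the low-level dd.DataFrame constructor; get_df, get_page_kwargs, pages and column_types are reused from the method above as assumptions.

import dask
import dask.dataframe as dd
import pandas as pd

# Empty frame defines the column names (dtypes default to object unless refined).
meta = pd.DataFrame(columns=list(column_types))
parts = [dask.delayed(get_df)(self.get_page_kwargs(i)) for i in pages]
df = dd.from_delayed(parts, meta=meta)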
Esempio n. 20
0
def _load_basic_dataframe(df_file=None,
                          datatype='sim',
                          config='IC86.2012',
                          energy_reco=True,
                          energy_cut_key='reco_log_energy',
                          log_energy_min=None,
                          log_energy_max=None,
                          columns=None,
                          n_jobs=1,
                          verbose=False,
                          compute=True):

    validate_datatype(datatype)

    if df_file is not None:
        files = df_file
    else:
        paths = get_config_paths()
        file_pattern = os.path.join(paths.comp_data_dir, config, datatype,
                                    'processed_hdf',
                                    'nominal' if datatype == 'sim' else '',
                                    '*.hdf')
        files = sorted(glob.glob(file_pattern))

    ddf = dd.read_hdf(files,
                      key='dataframe',
                      mode='r',
                      columns=columns,
                      chunksize=10000)

    # Energy reconstruction
    if energy_reco:
        model_dict = load_trained_model(
            'linearregression_energy_{}'.format(config), return_metadata=True)
        pipeline = model_dict['pipeline']
        feature_list = list(model_dict['training_features'])

        def add_reco_energy(partition):
            partition['reco_log_energy'] = pipeline.predict(
                partition[feature_list])
            partition['reco_energy'] = 10**partition['reco_log_energy']
            return partition

        ddf = ddf.map_partitions(add_reco_energy)

    # Energy range cut
    if log_energy_min is not None and log_energy_max is not None:

        def apply_energy_cut(partition):
            energy_mask = (partition[energy_cut_key] > log_energy_min) & (
                partition[energy_cut_key] < log_energy_max)
            return partition.loc[energy_mask, :]

        ddf = ddf.map_partitions(apply_energy_cut)

    if compute:
        if verbose:
            pbar = ProgressBar()
            pbar.register()
        scheduler = 'processes' if n_jobs > 1 else 'synchronous'
        df = ddf.compute(scheduler=scheduler, num_workers=n_jobs)
        df = df.reset_index(drop=True)
    else:
        df = ddf

    return df
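One caveat with the register() call above is that the progress bar stays active for every later dask computation in the process; a small sketch of scoping it with try/finally (names reused from the function above as assumptions).

from dask.diagnostics import ProgressBar

pbar = ProgressBar()
pbar.register()
try:
    df = ddf.compute(scheduler=scheduler, num_workers=n_jobs)
finally:
    # Deactivate the bar so unrelated computations are not decorated too.
    pbar.unregister()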
Esempio n. 21
0
File: tof.py Project: hperrey/tof
def cook_data(filepath,
              threshold,
              maxamp,
              Nchannel,
              Ychannel,
              outpath="",
              model_path="",
              CNN_window=300,
              baseline_int_window=20,
              lg_baseline_offset=0,
              sg_baseline_offset=0,
              frac=0.3,
              lg=0,
              sg=0,
              cleanUp=False,
              blocksize=25 * 10**6,
              repatition_factor=16):
    """Uses dask to process the txt output of WaveDump on all available logical cores and return a simple dataframe in parquet format
    \nfilepath = Path to file. Use * as a wildcard to read multiple textfile: e.g. file*.txt, will read file1.txt, file2.txt, file3.txt, etc into the same dataframe.
    \nthreshold = Wavedump triggers on all channels when one channel triggers, so to throw away empty events we must reenforce the threshold.
    \nmaxamp = Max amplitude varies with the offset used in the wavedump config file. If a pulse reaches the maxAmp then we want to throw it away as it is likely to have some part cut off.
    \nNchannel, Ychannel = the channel numbers in which neutron and gamma detectors are placed
    \noutpath = The path where the resulting dataframe is stored.
    \nbaseline_integration_window =20 integer number of bins used in baseline determination.
    \nlg/sg_baseline_offset = the baseline offset we use when integrating pulses, in order to compensate for underflow, and in order to rotate or linearize psd spectrum
    \nfine_baseline_offset: The baseline is forced to be an integer. The non integer part is multiplied by 1000 and cast to an int for later use in pulse integration.
    \ncleanUp: Boolean, wether to write events that \'failed\' for various reason (cfd trig fail or wobbly baseline). These events will be a small fraction, provided a reasonable threshold was applied. By default this parameter is false since they can be filter out using query(\'invalid==False\'), and are useful for debugging and only take up a little space.
    \nfrac=0.3 = the fraction of peak amplitude used in the cfd algorithm.
    \nlg=200 = the width of the longgate integration window in nanoseconds,
    \nsg=22 = width of the shortgate integration window in nanoseconds,
    \nblocksize=25*10**6 = The amount of data in bytes that will be processed on each thread/logica core. Experiment to find a value that works for your machine specs. Likely it will be between 10 and 100 MB"""
    filesize = os.stat(filepath).st_size
    Nblocks = int(round(0.5 + (filesize / blocksize / repatition_factor)))
    print('processing ', filesize, ' bytes. Will generate', Nblocks, ' blocks')
    print('Generating lazy instructions.')

    #==================#
    # Read in the file #
    #==================#
    df = dd.read_csv(filepath,
                     header=None,
                     usecols=[0, 2, 3, 5, 7],
                     names=[
                         'window_width', 'channel', 'event_number',
                         'timestamp', 'samples'
                     ],
                     dtype={
                         'window_width': np.int32,
                         'channel': np.int8,
                         'event_number': np.int64,
                         'timestamp': np.int64,
                         'samples': object  # np.object was removed in newer NumPy
                     },
                     blocksize=blocksize)

    #====================#
    # Format the samples #
    #====================#
    #first convert the string into an integer array. Then subtract the baseline.
    df['samples'] = df['samples'].str.split().apply(
        lambda x: np.array(x, dtype=np.int16), meta=df['samples'])
    df['samples'] = df['samples'].apply(
        lambda x: x - int(round(np.average(x[0:baseline_int_window]))),
        meta=df['samples'])
    #The baseline is forced to be an integer. The non integer part is multiplied by 1000 and cast to an int for later use in pulse integration.
    df['fine_baseline_offset'] = np.int16(0)
    df['fine_baseline_offset'] = df.apply(lambda x: int(
        0.5 + 1000 * np.average(x.samples[0:baseline_int_window])),
                                          meta=df['fine_baseline_offset'],
                                          axis=1)

    #====================================#
    # Get amplitude and location of peak #
    #====================================#
    df['amplitude'] = df['samples'].apply(lambda x: np.max(np.absolute(x)),
                                          meta=df['samples']).astype(np.int16)
    df['peak_index'] = df['samples'].apply(np.argmin,
                                           meta=df['samples']).astype(np.int16)

    #====================#
    # Pulse integrations #
    #====================#
    # offsetting each bin by a certain baseline offset is equivalent to adding the product
    # of the integration window and the baseline offset to the integration.
    df = pulse_integration(df, lg, sg)

    #=======================#
    # generate cfd triggers #
    #=======================#
    df['cfd_trig_rise'] = np.int32(0)
    df['cfd_trig_rise'] = df.apply(lambda x: cfd(x, frac=frac),  # use the frac argument rather than a hard-coded 0.3
                                   meta=df['cfd_trig_rise'],
                                   axis=1)

    #===================#
    # Handle bad events #
    #===================#
    #Throw away events whose amplitude is below the threshold
    #df = df[df['amplitude'] >= ch_thr_mask[df['channel']]]
    df = df[df['amplitude'] >= threshold]
    #And those whose amplitude is greater than the expected maximum amplitude (likely have their tops cut off)
    df['cutoff'] = False
    df['cutoff'] = df['cutoff'].where(df['amplitude'] < maxamp, True)
    #df = df[maxamp > df['amplitude']]
    #and those whose baseline jitters too much.
    df['baseline_std'] = np.float64(0)
    df['baseline_std'] = df['samples'].apply(
        lambda x: np.std(x[0:baseline_int_window]), meta=df['baseline_std'])
    #df = df[df['baseline_std'] < 2]
    df['wobbly_baseline'] = False
    df['wobbly_baseline'] = df['wobbly_baseline'].where(
        df['baseline_std'] < 2, True)
    #and those where the cfd triggering failed.cfd_trig_rise = -1 implies error. This occurs if the first bin is
    #above the cfd trigger point. We also want to ensure that the cfd trigger happens after the baseline determination
    #and with enough bins following to allow proper lg integration.
    df['cfd_too_early'] = False
    df['cfd_too_early'] = df['cfd_too_early'].where(
        df['cfd_trig_rise'] / 1000 > baseline_int_window, True)
    #df = df[baseline_int_window*1000 < df['cfd_trig_rise']] #ensure baseline int window
    df['cfd_too_late_lg'] = False
    df['cfd_too_late_lg'] = df['cfd_too_late_lg'].where(
        df['cfd_trig_rise'] / 1000 < (df['window_width'] - lg), True)
    #df = df[df['cfd_trig_rise'] < 1000*(df['window_width']-lg)] # ensure lg integration window

    #===========================#
    #Time of Flight correlations#
    #===========================#
    shift = int((Nchannel - Ychannel) / abs(Nchannel - Ychannel))
    df = get_tof(df, Nchannel, Ychannel, shift)

    #=======================================#
    #Convolutional neural network prediction#
    #=======================================#
    df['cfd_too_late_CNN'] = False
    if (model_path):
        df = cnn_discrim(df, model_path, CNN_window)

    #General Goodness parameter
    df['invalid'] = df['cutoff'] | df['wobbly_baseline'] | df[
        'cfd_too_early'] | df['cfd_too_late_lg'] | df['cfd_too_late_CNN']

    #Throw away or keep bad events? I recommend keeping bad events. They can be useful for debugging, and you can choose
    #not to load them later by choosing 'mode' in load_data_frame() function.
    if cleanUp == True:
        df = df[df['invalid'] == False]

    with ProgressBar():
        if (outpath):
            #repartition the dataframe into fewer (and larger) blocks
            df = df.repartition(npartitions=df.npartitions //
                                repatition_factor)
            #save to disk
            print('Processing dataframe and saving to disk')
            df.to_parquet(outpath, engine='pyarrow', compression='snappy')
    return df
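The docstring above suggests keeping failed events and filtering them at load time; a hedged sketch of doing that on the written parquet output (the path is a placeholder).

import dask.dataframe as dd

cooked = dd.read_parquet("path/to/outpath", engine="pyarrow")   # placeholder path
good = cooked.query("invalid == False")                          # drop flagged events
good_pdf = good.compute()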
Esempio n. 22
0
def main(argv=sys.argv[1:]):
    global LOG
    from satpy import Scene
    from satpy.resample import get_area_def
    from satpy.writers import compute_writer_results
    from dask.diagnostics import ProgressBar
    from polar2grid.core.script_utils import (
        setup_logging, rename_log_file, create_exc_handler)
    import argparse
    prog = os.getenv('PROG_NAME', sys.argv[0])
    # "usage: " will be printed at the top of this:
    usage = """
    %(prog)s -h
see available products:
    %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...]
basic processing:
    %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...]
basic processing with limited products:
    %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...]
"""
    parser = argparse.ArgumentParser(prog=prog, usage=usage,
                                     description="Load, composite, resample, and save datasets.")
    parser.add_argument('-v', '--verbose', dest='verbosity', action="count", default=0,
                        help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)')
    parser.add_argument('-l', '--log', dest="log_fn", default=None,
                        help="specify the log filename")
    parser.add_argument('--progress', action='store_true',
                        help="show processing progress bar (not recommended for logged output)")
    parser.add_argument('--num-workers', type=int, default=4,
                        help="specify number of worker threads to use (default: 4)")
    parser.add_argument('--match-resolution', dest='preserve_resolution', action='store_false',
                        help="When using the 'native' resampler for composites, don't save data "
                             "at its native resolution, use the resolution used to create the "
                             "composite.")
    parser.add_argument('-w', '--writers', nargs='+',
                        help='writers to save datasets with')
    parser.add_argument("--list-products", dest="list_products", action="store_true",
                        help="List available reader products and exit")
    subgroups = add_scene_argument_groups(parser)
    subgroups += add_resample_argument_groups(parser)

    argv_without_help = [x for x in argv if x not in ["-h", "--help"]]
    args, remaining_args = parser.parse_known_args(argv_without_help)

    # get the logger if we know the readers and writers that will be used
    if args.reader is not None and args.writers is not None:
        glue_name = args.reader + "_" + "-".join(args.writers or [])
        LOG = logging.getLogger(glue_name)
    # add writer arguments
    if args.writers is not None:
        for writer in (args.writers or []):
            parser_func = WRITER_PARSER_FUNCTIONS.get(writer)
            if parser_func is None:
                continue
            subgroups += parser_func(parser)
    args = parser.parse_args(argv)

    if args.reader is None:
        parser.print_usage()
        parser.exit(1, "\nERROR: Reader must be provided (-r flag).\n"
                       "Supported readers:\n\t{}\n".format('\n\t'.join(['abi_l1b', 'ahi_hsd', 'hrit_ahi'])))
    if args.writers is None:
        parser.print_usage()
        parser.exit(1, "\nERROR: Writer must be provided (-w flag) with one or more writer.\n"
                       "Supported writers:\n\t{}\n".format('\n\t'.join(['geotiff'])))

    def _args_to_dict(group_actions):
        return {ga.dest: getattr(args, ga.dest) for ga in group_actions if hasattr(args, ga.dest)}
    scene_args = _args_to_dict(subgroups[0]._group_actions)
    load_args = _args_to_dict(subgroups[1]._group_actions)
    resample_args = _args_to_dict(subgroups[2]._group_actions)
    writer_args = {}
    for idx, writer in enumerate(args.writers):
        sgrp1, sgrp2 = subgroups[3 + idx * 2: 5 + idx * 2]
        wargs = _args_to_dict(sgrp1._group_actions)
        if sgrp2 is not None:
            wargs.update(_args_to_dict(sgrp2._group_actions))
        writer_args[writer] = wargs
        # get default output filename
        if 'filename' in wargs and wargs['filename'] is None:
            wargs['filename'] = get_default_output_filename(args.reader, writer)

    if not args.filenames:
        parser.print_usage()
        parser.exit(1, "\nERROR: No data files provided (-f flag)\n")

    # Prepare logging
    rename_log = False
    if args.log_fn is None:
        rename_log = True
        args.log_fn = glue_name + "_fail.log"
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    setup_logging(console_level=levels[min(3, args.verbosity)], log_filename=args.log_fn)
    logging.getLogger('rasterio').setLevel(levels[min(2, args.verbosity)])
    sys.excepthook = create_exc_handler(LOG.name)
    if levels[min(3, args.verbosity)] > logging.DEBUG:
        import warnings
        warnings.filterwarnings("ignore")
    LOG.debug("Starting script with arguments: %s", " ".join(sys.argv))

    # Set up dask and the number of workers
    if args.num_workers:
        from multiprocessing.pool import ThreadPool
        dask.config.set(pool=ThreadPool(args.num_workers))

    # Parse provided files and search for files if provided directories
    scene_args['filenames'] = get_input_files(scene_args['filenames'])
    # Create a Scene, analyze the provided files
    LOG.info("Sorting and reading input files...")
    try:
        scn = Scene(**scene_args)
    except ValueError as e:
        LOG.error("{} | Enable debug message (-vvv) or see log file for details.".format(str(e)))
        LOG.debug("Further error information: ", exc_info=True)
        return -1
    except OSError:
        LOG.error("Could not open files. Enable debug message (-vvv) or see log file for details.")
        LOG.debug("Further error information: ", exc_info=True)
        return -1

    if args.list_products:
        print("\n".join(sorted(scn.available_dataset_names(composites=True))))
        return 0

    # Rename the log file
    if rename_log:
        rename_log_file(glue_name + scn.attrs['start_time'].strftime("_%Y%m%d_%H%M%S.log"))

    # Load the actual data arrays and metadata (lazy loaded as dask arrays)
    if load_args['products'] is None:
        try:
            reader_mod = importlib.import_module('polar2grid.readers.' + scene_args['reader'])
            load_args['products'] = reader_mod.DEFAULT_PRODUCTS
            LOG.info("Using default product list: {}".format(load_args['products']))
        except (ImportError, AttributeError):
            LOG.error("No default products list set, please specify with `--products`.")
            return -1

    LOG.info("Loading product metadata from files...")
    scn.load(load_args['products'])

    resample_kwargs = resample_args.copy()
    areas_to_resample = resample_kwargs.pop('grids')
    grid_configs = resample_kwargs.pop('grid_configs')
    resampler = resample_kwargs.pop('resampler')

    if areas_to_resample is None and resampler in [None, 'native']:
        # no areas specified
        areas_to_resample = ['MAX']
    elif areas_to_resample is None:
        raise ValueError("Resampling method specified (--method) without any destination grid/area (-g flag).")
    elif not areas_to_resample:
        # they don't want any resampling (they used '-g' with no args)
        areas_to_resample = [None]

    has_custom_grid = any(g not in ['MIN', 'MAX', None] for g in areas_to_resample)
    if has_custom_grid and resampler == 'native':
        LOG.error("Resampling method 'native' can only be used with 'MIN' or 'MAX' grids "
                  "(use 'nearest' method instead).")
        return -1

    p2g_grid_configs = [x for x in grid_configs if x.endswith('.conf')]
    pyresample_area_configs = [x for x in grid_configs if not x.endswith('.conf')]
    if not grid_configs or p2g_grid_configs:
        # if we were given p2g grid configs or we weren't given any to choose from
        from polar2grid.grids import GridManager
        grid_manager = GridManager(*p2g_grid_configs)
    else:
        grid_manager = {}

    if pyresample_area_configs:
        from pyresample.utils import parse_area_file
        custom_areas = parse_area_file(pyresample_area_configs)
        custom_areas = {x.area_id: x for x in custom_areas}
    else:
        custom_areas = {}

    ll_bbox = resample_kwargs.pop('ll_bbox')
    if ll_bbox:
        scn = scn.crop(ll_bbox=ll_bbox)

    wishlist = scn.wishlist.copy()
    preserve_resolution = get_preserve_resolution(args, resampler, areas_to_resample)
    if preserve_resolution:
        preserved_products = set(wishlist) & set(scn.datasets.keys())
        resampled_products = set(wishlist) - preserved_products

        # original native scene
        to_save = write_scene(scn, args.writers, writer_args, preserved_products)
    else:
        preserved_products = set()
        resampled_products = set(wishlist)
        to_save = []

    LOG.debug("Products to preserve resolution for: {}".format(preserved_products))
    LOG.debug("Products to use new resolution for: {}".format(resampled_products))
    for area_name in areas_to_resample:
        if area_name is None:
            # no resampling
            area_def = None
        elif area_name == 'MAX':
            area_def = scn.max_area()
        elif area_name == 'MIN':
            area_def = scn.min_area()
        elif area_name in custom_areas:
            area_def = custom_areas[area_name]
        elif area_name in grid_manager:
            from pyresample.geometry import DynamicAreaDefinition
            p2g_def = grid_manager[area_name]
            area_def = p2g_def.to_satpy_area()
            if isinstance(area_def, DynamicAreaDefinition) and p2g_def['cell_width'] is not None:
                area_def = area_def.freeze(scn.max_area(),
                                           resolution=(abs(p2g_def['cell_width']), abs(p2g_def['cell_height'])))
        else:
            area_def = get_area_def(area_name)

        if resampler is None and area_def is not None:
            rs = 'native' if area_name in ['MIN', 'MAX'] else 'nearest'
            LOG.debug("Setting default resampling to '{}' for grid '{}'".format(rs, area_name))
        else:
            rs = resampler

        if area_def is not None:
            LOG.info("Resampling data to '%s'", area_name)
            new_scn = scn.resample(area_def, resampler=rs, **resample_kwargs)
        elif not preserve_resolution:
            # the user didn't want to resample to any areas
            # the user also requested that we don't preserve resolution
            # which means we have to save this Scene's datasets
            # because they won't be saved
            new_scn = scn

        to_save = write_scene(new_scn, args.writers, writer_args, resampled_products, to_save=to_save)

    if args.progress:
        pbar = ProgressBar()
        pbar.register()

    LOG.info("Computing products and saving data to writers...")
    compute_writer_results(to_save)
    LOG.info("SUCCESS")
    return 0
Esempio n. 23
0
    def make_climatology(ds,
                         output_frequency,
                         monthly_weights=False,
                         time_var_name='time',
                         time_dim_name='t_dim',
                         fn_out=None,
                         missing_values=False):
        '''
        Calculates a climatology for all variables in a supplied dataset.
        The resulting xarray dataset will NOT be loaded to RAM. Instead,
        it is a set of dask operations. To load to RAM use, e.g. .compute().
        However, if the original data was large, this may take a long time and
        a lot of memory. Make sure you have the available RAM or chunking
        and parallel processes are specified correctly.
        
        Otherwise, it is recommended that you access the climatology data
        in an indexed way, i.e. compute only specific parts of the data
        at once.

        The resulting climatology dataset can be written to disk using
        .to_netcdf(). Again, this may take a while for larger datasets.

        ds :: xarray dataset object from a COAsT object.
        output_frequency :: any xarray groupby string, e.g.:
            'month'
            'season'
        monthly_weights :: boolean. If True, weight the time mean by the
            number of days in each month.
        time_var_name :: the string name of the time variable in the dataset
        time_dim_name :: the string name of the time dimension in the dataset
        fn_out :: string defining the full output netCDF file path and name.
        missing_values :: boolean where True indicates the data has missing values
            that should be ignored. Missing values must be represented by NaNs.
        '''

        frequency_str = time_var_name + '.' + output_frequency
        print('Calculating climatological mean')

        if missing_values:
            ds_mean = xr.Dataset()
            for varname, da in ds.data_vars.items():
                mask = xr.where(uf.isnan(da), 0, 1)
                data = da.groupby(frequency_str).sum(dim=time_dim_name)
                N = mask.groupby(frequency_str).sum(dim=time_dim_name)
                ds_mean[varname] = data / N
        else:
            if monthly_weights:
                month_length = ds[time_var_name].dt.days_in_month
                grouped = month_length.groupby(frequency_str)
            else:
                ds['clim_mean_ones_tmp'] = (time_dim_name,
                                            np.ones(
                                                ds[time_var_name].shape[0]))
                grouped = ds['clim_mean_ones_tmp'].groupby(frequency_str)

            weights = grouped / grouped.sum()
            ds_mean = (ds *
                       weights).groupby(frequency_str).sum(dim=time_dim_name)

            if not monthly_weights:
                ds = ds.drop_vars('clim_mean_ones_tmp')

        if fn_out is not None:
            print('Saving to file. May take some time..')
            with ProgressBar():
                ds_mean.to_netcdf(fn_out)

        return ds_mean

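A hedged usage sketch for make_climatology with a tiny synthetic dataset; it is called here as a plain function, whereas in the original it sits inside a class.

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2000-01-01", periods=24, freq="MS")
ds = xr.Dataset(
    {"temperature": (("t_dim", "x"), np.random.rand(24, 5))},
    coords={"time": ("t_dim", times)},
)
clim = make_climatology(ds, "month", time_var_name="time", time_dim_name="t_dim")
# .compute() (or .load()) pulls the result into memory if ds was dask-backed.
monthly_means = clim.compute()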
Esempio n. 24
0
def Movie(da,
          odir,
          varname=None,
          framedim='time',
          moviename='movie',
          clim=None,
          cmap=None,
          bgcolor=np.array([1, 1, 1]) * 0.3,
          framewidth=1280,
          frameheight=720,
          dpi=100,
          lon=None,
          lat=None,
          dask=True,
          delete=True,
          ffmpeg=True,
          plot_style='simple',
          norm=mpl.colors.Normalize(),
          progbar=False):
    # Set defaults:
    if not ffmpeg and delete:
        raise RuntimeError('raw picture deletion only makes sense '
                           'if ffmpeg conversion is enabled')

    if not isinstance(da, xr.DataArray):
        raise RuntimeError('input has to be an xarray DataArray, instead '
                           'is ' + str(type(da)))

    if not os.path.exists(odir):
        os.makedirs(odir)

    # Infer defaults from data
    if clim is None:
        print('clim will be inferred from data, this can take very long...')
        clim = [da.min(), da.max()]
    if cmap is None:
        cmap = plt.cm.viridis

    if plot_style in ['map']:
        if None in [lon, lat]:
            raise RuntimeError('map plotting requires lon and lat')
        else:
            lons = np.array(da[lon].data)
            lats = np.array(da[lat].data)

            if len(lons.shape) != 2:
                lons, lats = np.meshgrid(lons, lats)

            # renamed to avoid shadowing the time module used for timing below
            time_data = np.array(da['time'].data)

    else:
        lons = None
        lats = None
        time_data = None

    # Annnd here we go
    print('+++ Execute plot function +++')
    if dask:
        data = da.data
        frame_axis = da.get_axis_num(framedim)
        drop_axis = [da.get_axis_num(a) for a in da.dims if not a == framedim]
        chunks = list(data.shape)
        chunks[frame_axis] = 1
        data = data.rechunk(chunks)
        if progbar:
            pbar = ProgressBar()
            pbar.register()
        data.map_blocks(FramePrint,
                        chunks=[1],
                        drop_axis=drop_axis,
                        dtype=np.float64,
                        dask=dask,
                        frame_axis=frame_axis,
                        odir=odir,
                        cmap=cmap,
                        clim=clim,
                        framewidth=framewidth,
                        frameheight=frameheight,
                        bgcolor=bgcolor,
                        plot_style=plot_style,
                        lons=lons,
                        lats=lats,
                        time=time_data,
                        norm=norm,
                        dpi=dpi).compute(get=get)
        if progbar:
            pbar.unregister()
    # The .compute(get=get) line is some dask 'magic': it parallelizes the
    # print function with processes and not threads, which is a lot faster
    # for custom functions apparently!
    else:
        # do it with a simple for loop...can this really be quicker?
        print('This is slow! Do it in dask!')
        for ii in range(0, len(da.time)):
            start_time = time.time()
            da_slice = da[{framedim: ii}]
            # fig,ax,h = FramePrint(da_slice,
            FramePrint(da_slice,
                       frame=ii,
                       odir=odir,
                       cmap=cmap,
                       clim=clim,
                       framewidth=framewidth,
                       frameheight=frameheight,
                       bgcolor=bgcolor,
                       plot_style=plot_style,
                       lons=lons,
                       lats=lats,
                       norm=norm,
                       dpi=dpi)
            if ii % 100 == 0:
                remaining_time = (len(da.time) - ii) * \
                    (time.time() - start_time) / 60
                print('FRAME---%04d---' % ii)
                print('Estimated time left : %d minutes' % remaining_time)

    query = 'ffmpeg -y -i "frame_%05d.png" -c:v libx264 -preset veryslow \
        -crf 6 -pix_fmt yuv420p \
        -framerate 10 \
        "' + moviename + '.mp4"'

    with cd(odir):
        if ffmpeg:
            print('+++ Convert frames to video +++')
            excode = os.system(query)
            if excode == 0 and delete:
                os.system('rm *.png')
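The get= keyword used in the compute() call above has since been removed from dask; a hedged sketch of the current equivalent, which selects the process-based scheduler with scheduler= (toy array, not the movie data).

import dask.array as darr

def double(block):
    return block * 2

arr = darr.ones((4, 4), chunks=(1, 4))
# In current dask, scheduler="processes" replaces the old get= keyword.
result = arr.map_blocks(double).compute(scheduler="processes")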
Esempio n. 25
0
def calc_simulated_energy(wind_speed, turbines, power_curve=None, sum_along='turbines',
                          capacity_scaling=True, only_built_turbines=True):
    """Estimate generated energy using wind data and turbine data.

    Parameters
    ----------
    wind_speed : xr.DataArray
        see calc_wind_speed_at_turbines()
    turbines : xr.DataSet
        see load_turbines()
    power_curve : callable
        a function mapping wind speed to power
    sum_along : str
        sum along turbines or time, or an empty string for no summation
    capacity_scaling : bool
        scale power curve to capacity for each turbine (if available)
    only_built_turbines : bool
        calculate energy only for time stamps where commission year is older

    Returns
    -------
    simulated_energy_gwh : xr.DataArray
        Simulated energy per month [GWh], dims = (time, turbines)


    FIXME this modifies the input wind_speed variable! Very dangerous but unclear if solvable
     without too much memory consumption (via copying)

    """
    if power_curve is None:
        power_curve = ge15_77.power_curve

    # this outputs a deprecation warning, see https://github.com/pydata/xarray/issues/2928
    # TODO probably not the best idea to have this here, since it modifies global behavior at
    #  runtime, but where else to put it? pytest ignores warnings.catch_warnings()...
    warnings.filterwarnings('ignore', 'The da.atop function has moved to da.blockwise')

    # TODO this is a bit scary, when does parallelized not work? Which dtype?
    simulated_energy = xr.apply_ufunc(power_curve, wind_speed,
                                      dask='parallelized',
                                      output_dtypes=[np.float64])

    simulated_energy = simulated_energy.assign_coords(turbines=turbines.turbines)

    if only_built_turbines:
        # TODO all turbines where year = NaN will be removed that way... :-/
        # this is the beginning of the year the turbine has been commissioned
        building_dates = turbines.p_year.astype(int).astype(str).astype(np.datetime64)

        nanosecs_of_year = (simulated_energy.time - building_dates).astype(float)  # np.float is removed in newer NumPy
        proportion_of_year = nanosecs_of_year / (365.25 * 24 * 60 * 60 * 1e9)

        # comparing objects with dim "time" and dim "turbines" results in (time, turbines)
        building_this_year = simulated_energy.time.dt.year == turbines.p_year

        simulated_energy = simulated_energy.where(~building_this_year,
                                                  simulated_energy * proportion_of_year)

        already_built = simulated_energy.time.dt.year >= turbines.p_year
        simulated_energy = simulated_energy.where(already_built, 0)

        # Uargh... there is a weird memory leak somewhere, this seems to help a bit at least... :-/
        del nanosecs_of_year
        del proportion_of_year
        del building_this_year
        del building_dates
        del already_built

    if capacity_scaling:
        # FIXME this should use turbine_model.capacity_mw not 1500!
        simulated_energy *= (turbines.t_cap / 1500.).fillna(1.)

    # inspired by:
    # http://xarray.pydata.org/en/stable/examples/weather-data.html#monthly-averaging

    simulated_energy = simulated_energy.sortby('time') * 1e-6

    if sum_along:
        simulated_energy = simulated_energy.sum(dim=sum_along)
    if sum_along == 'turbines':
        simulated_energy = simulated_energy.resample(time='1MS').sum()

    # Does not work for multiple years:
    # simulated_energy = simulated_energy.sum(dim='turbines').groupby('time.month').sum() * 1e-6

    with ProgressBar():
        simulated_energy_gwh = simulated_energy.compute()

    if sum_along == 'turbines':
        simulated_energy_gwh.name = "Simulated energy per month [GWh]"
    elif sum_along == 'time':
        simulated_energy_gwh.name = "Simulated energy"  # TODO unit depends on time range?

    return simulated_energy_gwh
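A minimal, self-contained sketch of the apply_ufunc pattern used above, with a toy power curve and random wind speeds (all names here are hypothetical).

import numpy as np
import xarray as xr

def toy_power_curve(v):
    # Purely illustrative shape: zero below cut-in, cubic above.
    return np.clip(v - 3.0, 0.0, None) ** 3

wind = xr.DataArray(np.random.rand(8, 4) * 15.0,
                    dims=("time", "turbines")).chunk({"time": 4})
power = xr.apply_ufunc(toy_power_curve, wind,
                       dask="parallelized", output_dtypes=[np.float64])
power_values = power.compute()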
Esempio n. 26
0
def dask_linear_operator(self):
    self.nC = self.modelMap.shape[0]

    n_data_comp = len(self.survey.components)
    components = np.array(list(self.survey.components.keys()))
    active_components = np.hstack(
        [np.c_[values] for values in self.survey.components.values()]
    ).tolist()

    row = delayed(self.evaluate_integral, pure=True)
    rows = [
        array.from_delayed(
            row(receiver_location, components[component]),
            dtype=np.float32,
            shape=(n_data_comp, self.nC),
        )
        for receiver_location, component in zip(
            self.survey.receiver_locations.tolist(), active_components
        )
    ]
    stack = array.vstack(rows)

    # Chunking options
    if self.chunk_format == "row" or self.store_sensitivities == "forward_only":
        config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"})
        # Auto-chunking by rows is faster and more memory efficient for
        # sensitivity and forward calculations on very large problems
        stack = stack.rechunk({0: "auto", 1: -1})

    elif self.chunk_format == "equal":
        # Manual chunks for equal number of blocks along rows and columns.
        # Optimal for Jvec and Jtvec operations
        row_chunk, col_chunk = compute_chunk_sizes(*stack.shape, self.max_chunk_size)
        stack = stack.rechunk((row_chunk, col_chunk))
    else:
        # Auto chunking by columns is faster for Inversions
        config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"})
        stack = stack.rechunk({0: -1, 1: "auto"})

    if self.store_sensitivities == "disk":
        sens_name = self.sensitivity_path + "sensitivity.zarr"
        if os.path.exists(sens_name):
            kernel = array.from_zarr(sens_name)
            if np.all(
                np.r_[
                    np.any(np.r_[kernel.chunks[0]] == stack.chunks[0]),
                    np.any(np.r_[kernel.chunks[1]] == stack.chunks[1]),
                    np.r_[kernel.shape] == np.r_[stack.shape],
                ]
            ):
                # Check that loaded kernel matches supplied data and mesh
                print("Zarr file detected with same shape and chunksize ... re-loading")
                return kernel
        else:
            print("Writing Zarr file to disk")
            with ProgressBar():
                print("Saving kernel to zarr: " + sens_name)
                kernel = array.to_zarr(
                    stack, sens_name, compute=True, return_stored=True, overwrite=True
                )
    elif self.store_sensitivities == "forward_only":
        with ProgressBar():
            print("Forward calculation: ")
            pred = (stack @ self.model).compute()
        return pred
    else:
        print(stack.chunks)
        with ProgressBar():
            print("Computing sensitivities to local ram")
            kernel = array.asarray(stack.compute())
    return kernel
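A hedged round-trip sketch of the zarr caching used above, on a small random array (the file name is a placeholder).

import dask.array as darr
from dask.diagnostics import ProgressBar

stack = darr.random.random((1000, 200), chunks=(100, 200))
with ProgressBar():
    darr.to_zarr(stack, "sensitivity_demo.zarr", compute=True, overwrite=True)
reloaded = darr.from_zarr("sensitivity_demo.zarr")
print(reloaded.chunks)   # zarr chunking follows the dask chunks that were written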
Esempio n. 27
0
import argparse
import json
import os
import hashlib
import pathlib
from tabulate import tabulate
import format.peek as sspk
import format.split_column as sssp
import format.tab_man_gui as tmg
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import pandas as pd
from format.utils import header_mapper

pbar = ProgressBar()
pbar.register()


class Table():
    def __init__(self, file, outfile_prefix, field_sep, remove_starting):
        self.file = file
        self.outfile_prefix = outfile_prefix
        self.field_sep = field_sep
        self.ignore_pattern = remove_starting
        self.field_names = []

    def get_extension(self):
        self.file_extension = "".join(pathlib.Path(self.file).suffixes)

    def get_filename(self):
        self.get_extension()
Esempio n. 28
0
def new(ms, sky_model, gains, **kwargs):
    """Generate model visibilties per source (as direction axis)
    for stokes I and Q and generate relevant visibilities."""

    # Options to attributed dictionary
    if kwargs["yaml"] is not None:
        options = ocf.load(kwargs["yaml"])
    else:
        options = ocf.create(kwargs)

    # Set to struct
    ocf.set_struct(options, True)

    # Change path to sky model if chosen
    try:
        sky_model = sky_models[sky_model.lower()]
    except:
        # Own sky model reference
        pass

    # Set thread count to cpu count
    if options.ncpu:
        from multiprocessing.pool import ThreadPool
        import dask
        dask.config.set(pool=ThreadPool(options.ncpu))
    else:
        import multiprocessing
        options.ncpu = multiprocessing.cpu_count()

    # Load gains to corrupt with
    with open(gains, "rb") as file:
        jones = np.load(file)

    # Load dimensions
    n_time, n_ant, n_chan, n_dir, n_corr = jones.shape
    n_row = n_time * (n_ant * (n_ant - 1) // 2)

    # Load ms
    MS = xds_from_ms(ms)[0]

    # Get time-bin indices and counts
    row_chunks, tbin_indices, tbin_counts = chunkify_rows(
        MS.TIME, options.utime)

    # Close and reopen with chunked rows
    MS.close()
    MS = xds_from_ms(ms, chunks={"row": row_chunks})[0]

    # Get antenna arrays (dask ignored for now)
    ant1 = MS.ANTENNA1.data
    ant2 = MS.ANTENNA2.data

    # Adjust UVW based on phase-convention
    if options.phase_convention.upper() == 'CASA':
        uvw = -MS.UVW.data.astype(np.float64)
    elif options.phase_convention.upper() == 'CODEX':
        uvw = MS.UVW.data.astype(np.float64)
    else:
        raise ValueError("Unknown sign convention for phase.")

    # MS dimensions
    dims = ocf.create(dict(MS.sizes))

    # Close MS
    MS.close()

    # Build source model from lsm
    lsm = Tigger.load(sky_model)

    # Check if dimensions match jones
    assert n_time * (n_ant * (n_ant - 1) // 2) == dims.row
    assert n_time == len(tbin_indices)
    assert n_ant == np.max((np.max(ant1), np.max(ant2))) + 1
    assert n_chan == dims.chan
    assert n_corr == dims.corr

    # If gains are DIE
    if options.die:
        assert n_dir == 1
        n_dir = len(lsm.sources)
    else:
        assert n_dir == len(lsm.sources)

    # Get phase direction
    radec0_table = xds_from_table(ms + '::FIELD')[0]
    radec0 = radec0_table.PHASE_DIR.data.squeeze().compute()
    radec0_table.close()

    # Get frequency column
    freq_table = xds_from_table(ms + '::SPECTRAL_WINDOW')[0]
    freq = freq_table.CHAN_FREQ.data.astype(np.float64)[0]
    freq_table.close()

    # Get feed orientation
    feed_table = xds_from_table(ms + '::FEED')[0]
    feeds = feed_table.POLARIZATION_TYPE.data[0].compute()

    # Create initial model array
    model = np.zeros((n_dir, n_chan, n_corr), dtype=np.float64)

    # Create initial coordinate array and source names
    lm = np.zeros((n_dir, 2), dtype=np.float64)
    source_names = []

    # Cycle coordinates creating a source with flux
    print("==> Building model visibilities")
    for d, source in enumerate(lsm.sources):
        # Extract name
        source_names.append(source.name)

        # Extract position
        radec_s = np.array([[source.pos.ra, source.pos.dec]])
        lm[d] = radec_to_lm(radec_s, radec0)

        # Get flux - Stokes I
        if source.flux.I:
            I0 = source.flux.I

            # Get spectrum (only spi currently supported)
            tmp_spec = source.spectrum
            spi = [tmp_spec.spi if tmp_spec is not None else 0.0]
            ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0]

            # Generate model flux
            model[d, :, 0] = I0 * (freq / ref_freq)**spi

        # Get flux - Stokes Q
        if source.flux.Q:
            Q0 = source.flux.Q

            # Get spectrum
            tmp_spec = source.spectrum
            spi = [tmp_spec.spi if tmp_spec is not None else 0.0]
            ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0]

            # Generate model flux
            model[d, :, 1] = Q0 * (freq / ref_freq)**spi

        # Get flux - Stokes U
        if source.flux.U:
            U0 = source.flux.U

            # Get spectrum
            tmp_spec = source.spectrum
            spi = [tmp_spec.spi if tmp_spec is not None else 0.0]
            ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0]

            # Generate model flux
            model[d, :, 2] = U0 * (freq / ref_freq)**spi

        # Get flux - Stokes V
        if source.flux.V:
            V0 = source.flux.V

            # Get spectrum
            tmp_spec = source.spectrum
            spi = [tmp_spec.spi if tmp_spec is not None else 0.0]
            ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0]

            # Generate model flux
            model[d, :, 3] = V0 * (freq / ref_freq)**spi

    # Close sky-model
    del lsm

    # Build dask graph
    tbin_indices = da.from_array(tbin_indices, chunks=(options.utime))
    tbin_counts = da.from_array(tbin_counts, chunks=(options.utime))
    lm = da.from_array(lm, chunks=lm.shape)
    model = da.from_array(model, chunks=model.shape)
    jones = da.from_array(jones, chunks=(options.utime, ) + jones.shape[1::])

    # Apply image to visibility for each source
    sources = []
    for s in range(n_dir):
        source_vis = im_to_vis(model[s].reshape((1, n_chan, n_corr)),
                               uvw,
                               lm[s].reshape((1, 2)),
                               freq,
                               dtype=np.complex64,
                               convention='fourier')

        sources.append(source_vis)
    model_vis = da.stack(sources, axis=2)

    # Sum over direction?
    if options.die:
        model_vis = da.sum(model_vis, axis=2, keepdims=True)
        n_dir = 1
        source_names = [options.mname]

    # Select schema based on feed orientation
    if (feeds == ["X", "Y"]).all():
        out_schema = [["XX", "XY"], ["YX", "YY"]]
    elif (feeds == ["R", "L"]).all():
        out_schema = [['RR', 'RL'], ['LR', 'LL']]
    else:
        raise ValueError("Unknown feed orientation implementation.")

    # Convert Stokes to Correlations
    in_schema = ['I', 'Q', 'U', 'V']
    model_vis = convert(model_vis, in_schema, out_schema).reshape(
        (n_row, n_chan, n_dir, n_corr))

    # Apply gains to model_vis
    print("==> Corrupting visibilities")

    data = corrupt_vis(tbin_indices, tbin_counts, ant1, ant2, jones, model_vis)

    # Reopen MS
    MS = xds_from_ms(ms, chunks={"row": row_chunks})[0]

    # Assign model visibilities
    out_names = []
    for d in range(n_dir):
        MS = MS.assign(
            **{
                source_names[d]: (("row", "chan", "corr"),
                                  model_vis[:, :, d].astype(np.complex64))
            })

        out_names += [source_names[d]]

    # Assign noise free visibilities to 'CLEAN_DATA'
    MS = MS.assign(
        **{
            'CLEAN_' + options.dname: (("row", "chan", "corr"),
                                       data.astype(np.complex64))
        })

    out_names += ['CLEAN_' + options.dname]

    # Get noise realisation
    if options.std > 0.0:

        # Noise matrix
        print(f"==> Applying noise (std={options.std}) to visibilities")
        noise = []
        for i in range(2):
            real = da.random.normal(loc=0.0,
                                    scale=options.std,
                                    size=(n_row, n_chan),
                                    chunks=(row_chunks, n_chan))
            imag = 1.0j * (da.random.normal(loc=0.0,
                                            scale=options.std,
                                            size=(n_row, n_chan),
                                            chunks=(row_chunks, n_chan)))
            noise.append(real + imag)

        # Zero matrix for off-diagonals
        zero = da.zeros((n_row, n_chan), chunks=(row_chunks, n_chan))

        noise.insert(1, zero)
        noise.insert(2, zero)

        # NP to Dask
        noise = da.stack(noise, axis=2).rechunk((row_chunks, n_chan, n_corr))

        # Assign noise to 'NOISE'
        MS = MS.assign(
            **{'NOISE': (("row", "chan", "corr"), noise.astype(np.complex64))})

        out_names += ['NOISE']

        # Add noise to data and assign to 'DATA'
        noisy_data = data + noise

        MS = MS.assign(
            **{
                options.dname: (("row", "chan", "corr"),
                                noisy_data.astype(np.complex64))
            })

        out_names += [options.dname]

    # Create a write to the table
    write = xds_to_table(MS, ms, out_names)

    # Submit all graph computations in parallel
    print(f"==> Executing `dask-ms` write to `{ms}` for the following columns: "\
            + f"{', '.join(out_names)}")

    with ProgressBar():
        write.compute()

    print(f"==> Completed.")
Esempio n. 29
0
df_pros = pd.read_csv(filepath, header=0, sep='|', quoting=3, dtype='str', encoding='utf-8', na_values=([' ',''])).fillna('')

#annonces = dd.read_csv(r'D:\25. Requests\IMMO_FR processing script\ANNONCES_2020_08.csv', header=0, sep='|', quoting=3, dtype='object', encoding='utf-8', na_values=([' ','']))
#df_pros = pd.read_csv(r'D:\25. Requests\IMMO_FR processing script\PRO_2020_08.csv', header=0, sep='|', quoting=3, dtype='str', encoding='utf-8', na_values=([' ',''])).fillna('')

start = timer()

print("Start exporting")
#EXPORT PERCENTAGE OF NULL/MISSING VALUES PER WEBSITES
print("Export: emptiness_per_website.xlsx")
column_list = annonces.columns.tolist()

websites = annonces['SITE_ANNONCE'].unique().compute().tolist()
print("There are {} websites".format(len(websites)))
pct_missing_per_website = pd.DataFrame()
with ProgressBar():
    for website in websites:
        print(website)
        temp_df = annonces[annonces['SITE_ANNONCE']==website].compute()
        temp_null_count = temp_df.isnull().sum()
        temp_site_count = len(temp_df.index)
        pct_missing_per_website[website] = round(temp_null_count/temp_site_count*100,2)
pct_missing_per_website.to_excel(os.path.join(save_path ,'emptiness_per_website.xlsx'))

#EXPORT PERCENTAGE OF TOTAL NULL/MISSING VALUES
print("Export: emptiness_total.xlsx")
with ProgressBar():
    null_count_total = annonces.isnull().sum().compute()
    total_length = len(annonces.index)
    null_percent_total = round(null_count_total/total_length*100, 2)
total_missing = pd.concat([null_count_total, null_percent_total], axis = 1)
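For the overall figures above, the same per-column percentage can also be produced in a single lazy pass; a hedged sketch, assuming annonces is the dask dataframe read earlier.

with ProgressBar():
    pct_missing_total = (annonces.isnull().mean().compute() * 100).round(2)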
Esempio n. 30
0
def main(argv=sys.argv[1:]):
    global LOG

    import satpy
    from satpy import Scene
    from satpy.writers import compute_writer_results
    from dask.diagnostics import ProgressBar
    from polar2grid.core.script_utils import (setup_logging, rename_log_file,
                                              create_exc_handler)
    import argparse

    dist = pkg_resources.get_distribution('polar2grid')
    if dist_is_editable(dist):
        p2g_etc = os.path.join(dist.module_path, 'etc')
    else:
        p2g_etc = os.path.join(sys.prefix, 'etc', 'polar2grid')
    config_path = satpy.config.get('config_path')
    if p2g_etc not in config_path:
        satpy.config.set(config_path=config_path + [p2g_etc])

    USE_POLAR2GRID_DEFAULTS = bool(
        int(os.environ.setdefault("USE_POLAR2GRID_DEFAULTS", "1")))

    prog = os.getenv('PROG_NAME', sys.argv[0])
    # "usage: " will be printed at the top of this:
    usage = """
    %(prog)s -h
see available products:
    %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...]
basic processing:
    %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...]
basic processing with limited products:
    %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...]
"""
    parser = argparse.ArgumentParser(
        prog=prog,
        usage=usage,
        fromfile_prefix_chars="@",
        description="Load, composite, resample, and save datasets.")
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbosity',
        action="count",
        default=0,
        help='each occurrence increases verbosity 1 level through '
        'ERROR-WARNING-INFO-DEBUG (default INFO)')
    parser.add_argument('-l',
                        '--log',
                        dest="log_fn",
                        default=None,
                        help="specify the log filename")
    parser.add_argument(
        '--progress',
        action='store_true',
        help="show processing progress bar (not recommended for logged output)"
    )
    parser.add_argument(
        '--num-workers',
        type=int,
        default=os.getenv('DASK_NUM_WORKERS', 4),
        help="specify number of worker threads to use (default: 4)")
    parser.add_argument(
        '--match-resolution',
        dest='preserve_resolution',
        action='store_false',
        help="When using the 'native' resampler for composites, don't save data "
        "at its native resolution, use the resolution used to create the "
        "composite.")
    parser.add_argument("--list-products",
                        dest="list_products",
                        action="store_true",
                        help="List available reader products and exit")
    reader_group = add_scene_argument_groups(
        parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0]
    resampling_group = add_resample_argument_groups(
        parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0]
    writer_group = add_writer_argument_groups(parser)[0]
    subgroups = [reader_group, resampling_group, writer_group]

    argv_without_help = [x for x in argv if x not in ["-h", "--help"]]

    _retitle_optional_arguments(parser)
    args, remaining_args = parser.parse_known_args(argv_without_help)
    os.environ['DASK_NUM_WORKERS'] = str(args.num_workers)

    # get the logger if we know the readers and writers that will be used
    if args.readers is not None and args.writers is not None:
        glue_name = args.readers[0] + "_" + "-".join(args.writers or [])
        LOG = logging.getLogger(glue_name)
    # add writer arguments
    for writer in (args.writers or []):
        parser_func = WRITER_PARSER_FUNCTIONS.get(writer)
        if parser_func is None:
            continue
        subgroups += parser_func(parser)
    args = parser.parse_args(argv)

    if args.readers is None:
        parser.print_usage()
        parser.exit(
            1, "\nERROR: Reader must be provided (-r flag).\n"
            "Supported readers:\n\t{}\n".format('\n\t'.join(
                ['abi_l1b', 'ahi_hsd', 'hrit_ahi'])))
    elif len(args.readers) > 1:
        parser.print_usage()
        parser.exit(
            1, "\nMultiple readers is not currently supported. Got:\n\t"
            "{}\n".format('\n\t'.join(args.readers)))
        return -1
    if args.writers is None:
        parser.print_usage()
        parser.exit(
            1,
            "\nERROR: Writer must be provided (-w flag) with one or more writer.\n"
            "Supported writers:\n\t{}\n".format('\n\t'.join(['geotiff'])))

    def _args_to_dict(group_actions, exclude=None):
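        """Collect the parsed values of one argument group into a kwargs dict."""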
        if exclude is None:
            exclude = []
        return {
            ga.dest: getattr(args, ga.dest)
            for ga in group_actions
            if hasattr(args, ga.dest) and ga.dest not in exclude
        }

    reader_args = _args_to_dict(reader_group._group_actions)
    reader_names = reader_args.pop('readers')
    scene_creation = {
        'filenames': reader_args.pop('filenames'),
        'reader': reader_names[0],
    }
    load_args = {
        'products': reader_args.pop('products'),
    }
    # anything left in 'reader_args' is a reader-specific kwarg
    resample_args = _args_to_dict(resampling_group._group_actions)
    writer_args = _args_to_dict(writer_group._group_actions)
    # writer_args = {}
    subgroup_idx = 3
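    # Writer-specific argument groups were appended to 'subgroups' in pairs,
    # after the three shared groups above, hence the offset of 3 and stride of 2.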
    for idx, writer in enumerate(writer_args['writers']):
        sgrp1, sgrp2 = subgroups[subgroup_idx + idx * 2:subgroup_idx + 2 +
                                 idx * 2]
        wargs = _args_to_dict(sgrp1._group_actions)
        if sgrp2 is not None:
            wargs.update(_args_to_dict(sgrp2._group_actions))
        writer_args[writer] = wargs
        # get default output filename
        if 'filename' in wargs and wargs['filename'] is None:
            wargs['filename'] = get_default_output_filename(
                args.readers[0], writer)

    if not args.filenames:
        parser.print_usage()
        parser.exit(1, "\nERROR: No data files provided (-f flag)\n")

    # Prepare logging
    rename_log = False
    if args.log_fn is None:
        rename_log = True
        args.log_fn = glue_name + "_fail.log"
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    setup_logging(console_level=levels[min(3, args.verbosity)],
                  log_filename=args.log_fn)
    logging.getLogger('rasterio').setLevel(levels[min(2, args.verbosity)])
    sys.excepthook = create_exc_handler(LOG.name)
    if levels[min(3, args.verbosity)] > logging.DEBUG:
        import warnings
        warnings.filterwarnings("ignore")
    LOG.debug("Starting script with arguments: %s", " ".join(sys.argv))

    # Set up dask and the number of workers
    if args.num_workers:
        dask.config.set(num_workers=args.num_workers)

    # Parse provided files and search for files if provided directories
    scene_creation['filenames'] = get_input_files(scene_creation['filenames'])
    # Create a Scene, analyze the provided files
    LOG.info("Sorting and reading input files...")
    try:
        scn = Scene(**scene_creation)
    except ValueError as e:
        LOG.error(
            "{} | Enable debug message (-vvv) or see log file for details.".
            format(str(e)))
        LOG.debug("Further error information: ", exc_info=True)
        return -1
    except OSError:
        LOG.error(
            "Could not open files. Enable debug message (-vvv) or see log file for details."
        )
        LOG.debug("Further error information: ", exc_info=True)
        return -1

    if args.list_products:
        print("\n".join(sorted(scn.available_dataset_names(composites=True))))
        return 0

    # Rename the log file
    if rename_log:
        rename_log_file(glue_name +
                        scn.attrs['start_time'].strftime("_%Y%m%d_%H%M%S.log"))

    # Load the actual data arrays and metadata (lazy loaded as dask arrays)
    LOG.info("Loading product metadata from files...")
    load_args['products'] = _apply_default_products_and_aliases(
        scn, scene_creation['reader'], load_args['products'])
    if not load_args['products']:
        return -1
    scn.load(load_args['products'])

    ll_bbox = resample_args.pop('ll_bbox')
    if ll_bbox:
        scn = scn.crop(ll_bbox=ll_bbox)

    scn = filter_scene(
        scn,
        reader_names,
        sza_threshold=reader_args['sza_threshold'],
        day_fraction=reader_args['filter_day_products'],
        night_fraction=reader_args['filter_night_products'],
    )
    if scn is None:
        LOG.info("No remaining products after filtering.")
        return 0

    to_save = []
    areas_to_resample = resample_args.pop("grids")
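    # The EWA resampler exposes its 'persist' option under a prefixed CLI name;
    # map it back to the keyword expected by resample_scene.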
    if 'ewa_persist' in resample_args:
        resample_args['persist'] = resample_args.pop('ewa_persist')
    scenes_to_save = resample_scene(
        scn,
        areas_to_resample,
        preserve_resolution=args.preserve_resolution,
        is_polar2grid=USE_POLAR2GRID_DEFAULTS,
        **resample_args)
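    # Each resampled scene yields delayed writer outputs; they accumulate in
    # 'to_save' and are computed together at the end.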
    for scene_to_save, products_to_save in scenes_to_save:
        overwrite_platform_name_with_aliases(scene_to_save)
        to_save = write_scene(scene_to_save,
                              writer_args['writers'],
                              writer_args,
                              products_to_save,
                              to_save=to_save)

    if args.progress:
        pbar = ProgressBar()
        pbar.register()

    LOG.info("Computing products and saving data to writers...")
    compute_writer_results(to_save)
    LOG.info("SUCCESS")
    return 0
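
# A hypothetical command line for the glue script above (reader, writer and
# input paths are illustrative only; the flags match the parser defined here):
#   $PROG_NAME -r abi_l1b -w geotiff --num-workers 8 --progress -f /path/to/files/*.nc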