def to_points_loop_wg10(loc_id, points, fname, start_year, end_year, djf=False): from dask.diagnostics import ProgressBar import gc ProgressBar().register() dates = [] if djf: for y in np.arange(start_year, end_year + 1): for m in [1, 2, 12]: dates.append(dt.datetime(y, m, 1, 0, 0, 0)) else: for y in np.arange(start_year, end_year + 1): for m in np.arange(1, 13): dates.append(dt.datetime(y, m, 1, 0, 0, 0)) df = pd.DataFrame() lsm = xr.open_dataset( "/g/data/ma05/BARRA_AD/v1/static/lnd_mask-fc-slv-PT0H-BARRA_AD-v1.nc") #Read netcdf data for t in np.arange(len(dates)): print(dates[t]) year = dt.datetime.strftime(dates[t], "%Y") month = dt.datetime.strftime(dates[t], "%m") f = xr.open_mfdataset("/g/data/ma05/BARRA_AD/v1/forecast/spec/max_max_wndgust10m/"+\ year+"/"+month+"/*.sub.nc", concat_dim="time") #Setup lsm lat = f.coords.get("latitude").values lon = f.coords.get("longitude").values x, y = np.meshgrid(lon, lat) x[lsm.lnd_mask == 0] = np.nan y[lsm.lnd_mask == 0] = np.nan dist_lon = [] dist_lat = [] for i in np.arange(len(loc_id)): dist = np.sqrt(np.square(x-points[i][0]) + \ np.square(y-points[i][1])) temp_lat, temp_lon = np.unravel_index(np.nanargmin(dist), dist.shape) dist_lon.append(temp_lon) dist_lat.append(temp_lat) temp_df = f["max_max_wndgust10m"].isel(latitude = xr.DataArray(dist_lat, dims="points"), \ longitude = xr.DataArray(dist_lon, dims="points")).persist().to_dataframe() temp_df = temp_df.reset_index() for p in np.arange(len(loc_id)): temp_df.loc[temp_df.points == p, "loc_id"] = loc_id[p] temp_df = temp_df.drop(["points",\ "forecast_period", "forecast_reference_time"],axis=1) df = pd.concat([df, temp_df]) f.close() gc.collect() df.sort_values([ "loc_id", "time" ]).to_pickle("/g/data/eg3/ab4502/ExtremeWind/points/" + fname + ".pkl")
import argparse import datetime import glob import math import numpy as np import os import pandas as pd import rasterio import seaborn as sns import xarray as xr from matplotlib import pyplot as plt from dask.diagnostics import ProgressBar ProgressBar().register() from paths_usa import * parser = argparse.ArgumentParser(description='Insert optionally GWA') parser.add_argument('-GWA') args = parser.parse_args() if (args.GWA == None): GWA = "3" else: GWA = args.GWA if GWA == "2": results_pathg = results_path + '/results_GWA2' else: results_pathg = results_path
def estimate_shifts_old(mn_list, temp_list, z_thres=None, rm_background=False, method='first', concat_dim='session'): temps = [] for imn, mn_path in enumerate(mn_list): print("loading template: {:2d}/{:2d}".format(imn, len(mn_list))) try: with xr.open_dataset(mn_path, chunks=dict(width='auto', height='auto'))['org'] as cur_va: if temp_list[imn] == 'first': cur_temp = cur_va.isel(frame=0).load().copy() elif temp_list[imn] == 'last': cur_temp = cur_va.isel(frame=-1).load().copy() elif temp_list[imn] == 'mean': cur_temp = (cur_va.mean('frame')) with ProgressBar(): cur_temp = cur_temp.compute() else: print("unrecognized template") continue if rm_background: cur_temp = remove_background(cur_temp, 'uniform', wnd=51) temps.append(cur_temp) except KeyError: print("no video found for path {}".format(mn_path)) if concat_dim: temps = xr.concat(temps, dim=concat_dim).rename('temps') window = ~temps.isnull().sum(concat_dim).astype(bool) temps = temps.where(window, drop=True) shifts = [] corrs = [] for itemp, temp_dst in temps.rolling(**{concat_dim: 1}): print("processing: {}".format(itemp.values)) if method == 'first': temp_src = temps.isel(**{concat_dim: 0}) elif method == 'last': temp_src = temps.isel(**{concat_dim: -1}) # common = (temp_src.isnull() + temp_dst.isnull()) # temp_src = temp_src.reindex_like(common) # temp_dst = temp_dst.reindex_like(common) temp_src, temp_dst = temp_src.squeeze(), temp_dst.squeeze() src_fft = np.fft.fft2(temp_src) dst_fft = np.fft.fft2(temp_dst) cur_res = shift_fft(src_fft, dst_fft) cur_sh = cur_res[0:2] cur_cor = cur_res[2] cur_anm = temp_dst.coords['animal'] cur_ss = temp_dst.coords['session'] cur_ssid = temp_dst.coords['session_id'] cur_sh = xr.DataArray(cur_sh, coords=dict(shift_dim=list(temp_dst.dims)), dims=['shift_dim']) cur_cor = xr.DataArray(cur_cor) cur_sh = cur_sh.assign_coords(animal=cur_anm, session=cur_ss, session_id=cur_ssid) cur_cor = cur_cor.assign_coords(animal=cur_anm, session=cur_ss, session_id=cur_ssid) shifts.append(cur_sh) corrs.append(cur_cor) if concat_dim: shifts = xr.concat(shifts, dim=concat_dim).rename('shifts') corrs = xr.concat(corrs, dim=concat_dim).rename('corrs') temps = xr.concat(temps, dim=concat_dim).rename('temps') return shifts, corrs, temps
def _mod_mean(self): if type(self.fname) is str: if self._var == None: try: varbl = self.fname.split('/')[-1].split('_')[0] except: varbl = 'Unknown' else: varbl = self._var data = xr.open_mfdataset(self.fname)[varbl] else: data = self.fname varbl = self._var if self.modMean != None: ds = data.mean(dim='ens') name_string = 'modMean' if self.zonMean != None: ds = data.mean(dim=data.dims[-1]) name_string = 'zonMean' elif self.modStd != None: ds = data.std(dim='ens') name_string = 'modStd' elif self.monClim != None: ds = data.groupby('time.month').mean('time') name_string = 'monClim' elif self.monAnom != None: climatology = data.groupby('time.month').mean('time') ds = data.groupby('time.month') - climatology name_string = 'monAnom' elif self.modAnom != None: name_string = 'modAnom' if self.init != None or self.end != None: sub = data.sel(time=slice(str(self.init), str(self.end))).mean( dim='time') ds = data - sub else: ds = data - data.mean(dim='time') elif self.trend != None: trend = trend_calc(data, int(self.init), int(self.end), ci=float(self.ci)) t = xr.DataArray(trend[0], dims=['lat', 'lon'], coords={ 'lat': data.lat, 'lon': data.lon }) t.name = 'trend' s = xr.DataArray(trend[1], dims=['lat', 'lon'], coords={ 'lat': data.lat, 'lon': data.lon }) s.name = 'sig' ds = xr.merge([t, s]) name_string = 'trend' elif self.aggr != None: ds = get_aggr(data, int(self.init), int(self.end), ci=float(self.ci)) ds.name = 'model_agreement' name_string = 'model_agreement' if self.nc == 'yes': with ProgressBar(): if self.out != None: ds.load().to_netcdf(self.out) else: try: ds.load().to_netcdf( self.fname.split('.nc')[0] + '_' + name_string + '.nc') except: ds.load().to_netcdf(name_string + '.nc') else: return ds
def predict(args): # Convert source data into dask arrays sky_model = parse_sky_model(args.sky_model, args.model_chunks) # Get the support tables tables = support_tables(args) ant_ds = tables["ANTENNA"] field_ds = tables["FIELD"] ddid_ds = tables["DATA_DESCRIPTION"] spw_ds = tables["SPECTRAL_WINDOW"] pol_ds = tables["POLARIZATION"] # List of write operations writes = [] # Construct a graph for each DATA_DESC_ID for xds in xds_from_ms( args.ms, columns=["UVW", "ANTENNA1", "ANTENNA2", "TIME"], group_cols=["FIELD_ID", "DATA_DESC_ID"], chunks={"row": args.row_chunks}, ): # Perform subtable joins ant = ant_ds[0] field = field_ds[xds.attrs["FIELD_ID"]] ddid = ddid_ds[xds.attrs["DATA_DESC_ID"]] spw = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]] pol = pol_ds[ddid.POLARIZATION_ID.data[0]] # Select single dataset row out corrs = pol.NUM_CORR.data[0] # Generate visibility expressions for each source type source_vis = [ vis_factory(args, stype, sky_model, xds, ant, field, spw, pol) for stype in sky_model.keys() ] # Sum visibilities together vis = sum(source_vis) # Reshape (2, 2) correlation to shape (4,) if corrs == 4: vis = vis.reshape(vis.shape[:2] + (4, )) # Assign visibilities to MODEL_DATA array on the dataset xds = (xds.assign(MODEL_DATA=(("row", "chan", "corr"), vis)) if args.data_column == "MODEL_DATA" else xds.assign(CORRECTED_DATA=(("row", "chan", "corr"), vis))) # Create a write to the table write = xds_to_table(xds, args.ms, [args.data_column]) # Add to the list of writes writes.append(write) # Submit all graph computations in parallel with ProgressBar(): da.compute(writes)
def is_valid_set_of_spect_files(spect_paths, spect_format, freqbins_key='f', timebins_key='t', spect_key='s', n_decimals_trunc=5, logger=None): """validate a set of spectrogram files that will be used as a dataset. Validates that: - all files contain a spectrogram array that can be accessed with the specified key - the length of the frequency bin array in each file equals the number of rows in the spectrogram array - the frequency bins are the same across all files - the length of the time bin array in each file equals the number of columns in the spectrogram array - the duration of a spectrogram time bin is the same across all files Parameters ---------- spect_paths: list of strings or pathlib.Path objects; paths to spectrogram files. spect_format : str format of files containing spectrograms. One of {'mat', 'npz'} freqbins_key : str key for accessing vector of frequency bins in files. Default is 'f'. timebins_key : str key for accessing vector of time bins in files. Default is 't'. spect_key : str key for accessing spectrogram in files. Default is 's'. n_decimals_trunc : int number of decimal places to keep when truncating the timebin duration calculated from the vector of time bins. Default is 3, i.e. assumes milliseconds is the last significant digit. Other Parameters ---------------- logger : logging.Logger instance created by vak.logging.get_logger. Default is None. Returns ------- returns True if all validation checks pass. If not, an error is raised. """ spect_paths = [Path(spect_path) for spect_path in spect_paths] def _validate(spect_path): """validates each spectrogram file, then returns frequency bin array and duration of time bins, so that those can be validated across all files""" spect_dict = load(spect_path, spect_format) if spect_key not in spect_dict: raise KeyError( f"Did not find a spectrogram in file '{spect_path.name}' " f"using spect_key '{spect_key}'.") freq_bins = spect_dict[freqbins_key] time_bins = spect_dict[timebins_key] timebin_dur = timebin_dur_from_vec(time_bins, n_decimals_trunc) # number of freq. bins should equal number of rows if spect_dict[freqbins_key].shape[-1] != spect_dict[spect_key].shape[0]: raise ValueError(f'length of frequency bins in {spect_path.name} ' 'does not match number of rows in spectrogram') # number of time bins should equal number of columns if spect_dict[timebins_key].shape[-1] != spect_dict[spect_key].shape[1]: raise ValueError( f'length of time_bins in {spect_path.name} ' f'does not match number of columns in spectrogram') return spect_path, freq_bins, timebin_dur spect_paths_bag = db.from_sequence(spect_paths) log_or_print('validating set of spectrogram files', logger=logger, level='info') with ProgressBar(): path_freqbins_timebin_dur_tups = list(spect_paths_bag.map(_validate)) all_freq_bins = np.stack( [tup[1] for tup in path_freqbins_timebin_dur_tups]) uniq_freq_bins = np.unique(all_freq_bins, axis=0) if len(uniq_freq_bins) != 1: raise ValueError( f'Found more than one frequency bin vector across files. ' f'Instead found {len(uniq_freq_bins)}') timebin_durs = [tup[2] for tup in path_freqbins_timebin_dur_tups] uniq_durs = np.unique(timebin_durs) if len(uniq_durs) != 1: raise ValueError( 'Found more than one duration for time bins across spectrogram files. ' f'Durations found were: {uniq_durs}') return True
def run_experiment(show_plot=True): if platform not in ['win32', 'win64']: raise Exception("Rectifier.fmu is only available for Windows") print("Parameter variation on %s:" % fmu_filename) print(" VAC", v_ac) print(" IDC", i_dc) if sync: dask.set_options( get=dask.dask.local.get_sync) # synchronized scheduler # download the FMU download_test_file('2.0', 'CoSimulation', 'Dymola', '2017', 'Rectifier', fmu_filename) # read the model description model_description = read_model_description(fmu_filename) # collect the value references for the variables to read / write vrs = {} for variable in model_description.modelVariables: vrs[variable.name] = variable.valueReference # extract the FMU unzipdir = fmpy.extract(fmu_filename) fmu_args = { 'guid': model_description.guid, 'modelIdentifier': model_description.coSimulation.modelIdentifier, 'unzipDirectory': unzipdir } # get the value references for the start and output values start_vrs = [vrs['VAC'], vrs['IDC']] result_vrs = [vrs['uDC'], vrs['Losses']] indices = list(np.ndindex(I_DC.shape)) chunks = [] chunk_size = int(np.ceil(len(indices) / 10)) # split the indices into 10 chunks for i in range(0, len(indices), chunk_size): chunks.append( [indices[i:i + chunk_size], fmu_args, start_vrs, result_vrs]) print("Running %d simulations (%d chunks)..." % (V_AC.size, len(chunks))) with ProgressBar(): # calculate the losses for every chunk results = bag.from_sequence(chunks).map(simulate_fmu).compute() LOSSES = np.zeros_like(V_AC) # put the results together for zipped, dll_handle in results: for i, res in zipped: LOSSES[i] = res[1] # unload the shared library if sync: while True: try: fmpy.freeLibrary(dll_handle) except: break # clean up shutil.rmtree(unzipdir) if show_plot: print("Plotting results...") import matplotlib.pyplot as plt figure = plt.figure() figure.patch.set_facecolor('white') ax = figure.add_subplot(1, 1, 1) CS = plt.contourf(V_AC, I_DC, LOSSES, 10) plt.colorbar(CS, aspect=30) CS = ax.contour(V_AC, I_DC, LOSSES, 10, colors='k', linewidths=0.8) ax.clabel(CS=CS, fmt='%.0f', fontsize=9, inline=1) ax.set_title('Losses / W') ax.set_xlabel('AC Voltage / V') ax.set_ylabel('DC Current / A') plt.show() else: print("Plotting disabled") print("Done.") return LOSSES
def modify_doc(doc): """Add plots to the document Parameters ---------- doc : [type] A Bokeh document to which plots can be added """ curDir = os.path.dirname(__file__) root = tk.Tk() root.withdraw() ProgressBar().register() filepath = filedialog.askopenfilename() filename = filepath[filepath.rfind('/') + 1:filepath.rfind('.')] monteData = dd.read_csv(filepath) monteData.fillna(0) plotData = monteData.compute() gc.collect() plotData.drop_duplicates(subset='L-string', inplace=True) plotData.reset_index() for i in range(2, 6): plotData['{}-gram'.format(i)] = plotData['L-string'].apply( lambda x: [x[j:j + i] for j in range(0, len(x), i)]) gc.collect() scatter = ColumnDataSource(data=plotData) line = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0])) rule1 = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0])) rule2 = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0])) polygon = ColumnDataSource(data=dict(x=[0], y=[0])) rule1_poly = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0])) rule2_poly = ColumnDataSource(data=dict(x=[0, 0], y=[0, 0])) palette.reverse() mapper = log_cmap(field_name='Area', palette=palette, low=0, high=500) tooltips1 = [ ('index', '$index'), ('F', '@{% of F}{0.0%}'), ('+', '@{% of +}{0.0%}'), ('-', '@{% of -}{0.0%}'), ] tooltips2 = [ ('index', '$index'), ('F', '@{Longest F sequence}'), ('+', '@{Longest + sequence}'), ('-', '@{Longest - sequence}'), ] plots_width = 500 plots_height = 500 p1 = figure(plot_width=plots_width, plot_height=plots_height, tools='pan,wheel_zoom,box_zoom,reset,tap,save', title="Area", output_backend="webgl", tooltips=tooltips1) p1.xaxis.axis_label = 'Area' p1.yaxis.axis_label = '% of character' p1.scatter('Area', '% of F', size=7, source=scatter, color=mapper, alpha=0.6, nonselection_fill_color=mapper) p2 = figure(plot_width=plots_width, plot_height=plots_height, tools='pan,wheel_zoom,box_zoom,reset,tap,save', title="Area", output_backend="webgl", tooltips=tooltips2) p2.xaxis.axis_label = 'Area' p2.yaxis.axis_label = 'Length of sequence' p2.scatter('Area', 'Longest F sequence', size=7, source=scatter, fill_color='red', color=mapper, alpha=0.6, nonselection_fill_color=mapper) p3 = figure(plot_width=plots_width, plot_height=plots_height, tools='pan,wheel_zoom,box_zoom,reset,tap,save', title="Selected Creature", output_backend="webgl") p3.axis.visible = False p3.grid.visible = False p3.line(x='x', y='y', line_color='red', source=line) p3.multi_polygons(xs='x', ys='y', source=polygon) p4 = figure(plot_width=plots_width, plot_height=plots_height, tools='pan,wheel_zoom,box_zoom,reset,tap,save', title="Area", output_backend="webgl") p4.scatter('Area', 'Angle', size=7, source=scatter, color=mapper, alpha=0.6, nonselection_fill_color=mapper) p4.xaxis.axis_label = 'Area' p4.yaxis.axis_label = 'Angle (degrees)' p5 = figure(plot_width=plots_width, plot_height=plots_height // 2, title="Rule 1", output_backend="webgl") p5.line(x='x', y='y', line_color='red', source=rule1) p5.multi_polygons(xs='x', ys='y', source=rule1_poly) p5.axis.visible = False p5.grid.visible = False p6 = figure(plot_width=plots_width, plot_height=plots_height // 2, title="Rule 2", output_backend="webgl") p6.line(x='x', y='y', line_color='red', source=rule2) p6.multi_polygons(xs='x', ys='y', source=rule2_poly) p6.axis.visible = False p6.grid.visible = False L_string = Paragraph(text='Select creature', width=1500) grams = PreText(text='Select creature', width=400) rule_text = PreText(text='Select creature', width=400) area_label = Label( x=0, y=450, x_units='screen', y_units='screen', text='Select 
creature', render_mode='css', border_line_color='black', border_line_alpha=1.0, background_fill_color='white', background_fill_alpha=1.0, ) length_label = Label( x=0, y=420, x_units='screen', y_units='screen', text='Select creature', render_mode='css', border_line_color='black', border_line_alpha=1.0, background_fill_color='white', background_fill_alpha=1.0, ) p3.add_layout(area_label) p3.add_layout(length_label) def plot_source(coords): """[summary] Returns ------- [type] [description] """ instance_linestring = LineString(coords[:, 0:2]) instance_patch = instance_linestring.buffer(0.5) instance_x, instance_y = instance_patch.exterior.coords.xy return instance_x, instance_y def mapper(string, angle): theta = 0 num_chars = len(string) coords = np.zeros((num_chars + 1, 3), np.double) def makeRotMat(theta): rotMat = np.array(((cos(theta), -sin(theta), 0), (sin(theta), cos(theta), 0), (0, 0, 1))) return rotMat rotVec = makeRotMat(theta) dir_vec = np.array((0, 1, 0), np.float64) i = 1 for c in string: if c == 'F': coords[i] = (coords[i - 1] + (1 * dir_vec)) i += 1 if c == '-': theta = theta - angle rotVec = makeRotMat(theta) dir_vec = np.dot(rotVec, dir_vec) if c == '+': theta = theta + angle rotVec = makeRotMat(theta) dir_vec = np.dot(rotVec, dir_vec) coords = np.delete(coords, np.s_[i:], 0) return coords def plot_creature(event): line.data = dict(x=[0, 0], y=[0, 0]) polygon.data = dict(x=[0, 0], y=[0, 0]) rule1.data = dict(x=[0, 0], y=[0, 0]) rule2.data = dict(x=[0, 0], y=[0, 0]) rule1_poly.data = dict(x=[0, 0], y=[0, 0]) rule2_poly.data = dict(x=[0, 0], y=[0, 0]) L_string.text = 'Select creature' area_label.text = 'Select creature' length_label.text = 'Select creature' rule_text.text = 'Select creature' if len(scatter.selected.indices) > 0: creature_index = scatter.selected.indices[0] creature = plotData.iloc[creature_index, :] coords = np.array(ast.literal_eval(creature['Coordinates'])) L_string.text = '{}'.format(creature['L-string']) area_label.text = 'Area: {:.2f}'.format(creature['Area']) length_label.text = 'Length of L-string: {}'.format( len(creature['L-string'])) gram_frame_1 = pd.DataFrame.from_dict( { '2-gram': creature['2-gram'], '3-gram': creature['3-gram'], '4-gram': creature['4-gram'], '5-gram': creature['5-gram'], }, orient='index').T counts = [ pd.value_counts( gram_frame_1[i]).reset_index().astype(str).apply( ' '.join, 1) for i in gram_frame_1 ] out = pd.concat(counts, 1).fillna('') out.columns = gram_frame_1.columns grams.text = str(tabulate(out, headers='keys')) creature_linestring = LineString(coords[:, 0:2]) creature_patch = creature_linestring.buffer(0.5) patch_x, patch_y = creature_patch.exterior.coords.xy x_points = [list(patch_x)] y_points = [list(patch_y)] for i, _ in enumerate(creature_patch.interiors): x_in, y_in = creature_patch.interiors[i].coords.xy x_points.append(list(x_in)) y_points.append(list(y_in)) x_points = [[x_points]] y_points = [[y_points]] line.data = dict(x=coords[:, 0], y=coords[:, 1]) polygon.data = dict(x=x_points, y=y_points) p3.match_aspect = True rules = ast.literal_eval(creature['Rules']) rules = rules['X'] rules = rules['options'] rule_text.text = 'Rule 1: \t' + \ rules[0] + '\n' + 'Rule 2: \t' + rules[1] if any(char == 'F' for string in rules[0] for char in string): rule1_c = mapper(rules[0], creature['Angle']) rule1_morphology = LineString(rule1_c[:, 0:2]) rule1_patch = rule1_morphology.buffer(0.5) rpatch_x, rpatch_y = rule1_patch.exterior.coords.xy r1_points_x = [list(rpatch_x)] r1_points_y = [list(rpatch_y)] for i, _ in 
enumerate(rule1_patch.interiors): x_in, y_in = creature_patch.interiors[i].coords.xy r1_points_x.append(list(x_in)) r1_points_y.append(list(y_in)) r1_points_x = [[r1_points_x]] r1_points_y = [[r1_points_y]] rule1.data = dict(x=rule1_morphology.coords.xy[0], y=rule1_morphology.coords.xy[1]) rule1_poly.data = dict(x=r1_points_x, y=r1_points_y) p5.match_aspect = True if any(char == 'F' for string in rules[1] for char in string): rule2_c = mapper(rules[1], creature['Angle']) rule2_morphology = LineString(rule2_c[:, 0:2]) rule2_patch = rule2_morphology.buffer(0.5) r2patch_x, r2patch_y = rule2_patch.exterior.coords.xy r2_points_x = [list(r2patch_x)] r2_points_y = [list(r2patch_y)] for i, _ in enumerate(rule2_patch.interiors): x_in, y_in = creature_patch.interiors[i].coords.xy r2_points_x.append(list(x_in)) r2_points_y.append(list(y_in)) r2_points_x = [[r2_points_x]] r2_points_y = [[r2_points_y]] rule2.data = dict(x=rule2_morphology.coords.xy[0], y=rule2_morphology.coords.xy[1]) rule2_poly.data = dict(x=r2_points_x, y=r2_points_y) p6.match_aspect = True else: line.data = dict(x=[0, 0], y=[0, 0]) polygon.data = dict(x=[0, 0], y=[0, 0]) rule1.data = dict(x=[0, 0], y=[0, 0]) rule2.data = dict(x=[0, 0], y=[0, 0]) rule1_poly.data = dict(x=[0, 0], y=[0, 0]) rule2_poly.data = dict(x=[0, 0], y=[0, 0]) L_string.text = 'Select creature' area_label.text = 'Select creature' length_label.text = 'Select creature' rule_text.text = 'Select creature' p1.on_event(Tap, plot_creature) p2.on_event(Tap, plot_creature) p4.on_event(Tap, plot_creature) top_row = row(L_string) middle_row = row(p1, p2, p4) bottom_row_right = column(p5, p6) bottom_row_middle = column(grams, rule_text) bottom_row = row(p3, Spacer(width=50), bottom_row_middle, Spacer(width=50), bottom_row_right) layout = column(top_row, middle_row, bottom_row) doc.add_root(layout)
def lasso_tuning(alpha=[1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10], k=5, train_data_path='../data/training_data.csv', save_model=False, tracking_uri="http://0.0.0.0:5000"): # Log the parameters with mlflow mlflow.log_param("alpha", alpha) mlflow.set_tag("k", k) # Set random seed for reproducibility np.random.seed(RANDOM_SEED) random.seed(RANDOM_SEED) # Get data shuffled and split into training and test sets mdr = MiningDataReader(path=train_data_path) (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data() pipeline = Pipeline(steps=[( 'scaling', StandardScaler()), ('regression', Lasso(random_state=RANDOM_SEED))]) ### TRAINING ### ################ # Generate grid search for hyperparam tuning hyperparams = {} hyperparams['regression__alpha'] = alpha print("Training started...\n") # Create an instance of Random Forest Regressor and fit the data for the grid parameters using all processors modelCV = GridSearchCV(estimator=pipeline, param_grid=hyperparams, cv=k, scoring='neg_mean_squared_error', n_jobs=-1) with ProgressBar(): modelCV.fit(X_train, y_train) # Iterate over the results storing training error for each hyperparameter combination results = modelCV.cv_results_ param_list, training_err_list, training_dev_list = [], [], [] for i in range(len(results['params'])): param = results['params'][i] score = (-1) * results['mean_test_score'][i] # NEGATIVE MSE std = results['std_test_score'][i] param_list.append(param) training_err_list.append(score) training_dev_list.append(std) print( f"\nBest parameter set found for the training set:\n{modelCV.best_params_}" ) # Store the index of the best combination best_index = param_list.index(modelCV.best_params_) # Get the best values for hyperparams best_alpha = modelCV.best_params_['regression__alpha'] print("\nTraining finished. Evaluating model...\n") ### EVALUATION ### ################## # Criteria is the number of trees criteria = 'alpha' mlflow.set_tag("criteria", criteria) param_values = alpha # Predict test data variying criteria param and evaluate the models training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], [] rmse_score, mae_score, r2_score = -1, -1, -1 feature_names, feature_importances = [], [] for param_value in tqdm(param_values): model = Pipeline( steps=[('scaler', StandardScaler()), ('regression', Lasso(alpha=param_value, random_state=RANDOM_SEED))]) param = {'regression__alpha': param_value} # Fit model and evaluate results model.fit(X_train, y_train) prediction = model.predict(X_test) index = param_list.index(param) training_err = training_err_list[index] training_dev = training_dev_list[index] (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction) # Store metrics training_err_by_criteria.append(training_mse) training_dev_by_criteria.append(training_dev) test_err_list.append(test_mse) # Set aditional metrics for the best combination if index == best_index: rmse_score = rmse mae_score = mae r2_score = r2 # Generate the plots empty_img_folder() plot_errors(criteria, param_values, training_err_by_criteria, training_dev_by_criteria, test_err_list) # Once hyperparameters are selected, train and save the best model if save_model: print( "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..." 
) final_model = Pipeline( steps=[('scaler', StandardScaler()), ('regression', Lasso(alpha=param_list[best_index]['regression__alpha']))]) # Train the best model with all the data (training + test) full_X = np.vstack((X_train, X_test)) full_y = np.concatenate((y_train, y_test)) final_model.fit(full_X, full_y) # Log plots and model with mlflow mlflow.log_artifacts('./img') mlflow.sklearn.log_model(final_model, 'model') # Log results with mlflow mlflow.log_metric("train_mse", training_err_list[best_index]) mlflow.log_metric("test_mse", min(test_err_list)) mlflow.log_metric("rmse", rmse_score) mlflow.log_metric("mae", mae_score) mlflow.log_metric("r2", r2_score) mlflow.set_tag("best_params", param_list[best_index]) # Output the results print(f''' ----------------------------------------------------------------------------------------------------------------------- RESULTS ----------------------------------------------------------------------------------------------------------------------- Best params: {param_list[best_index]} Training MSE: {training_err_list[best_index]} Test MSE: {min(test_err_list)} RMSE: {rmse_score} MAE: {mae_score} R2: {r2_score} ----------------------------------------------------------------------------------------------------------------------- ''')
def netcdf_to_ascii(homedir, subdir, source_directory, mappingfile, catalog_label, meta_file, temporal_resolution='D', netcdfs=None, variable_list=None): # initialize list of dataframe outputs outfiledict = {} # generate destination folder filedir = os.path.join(homedir, subdir) ogh.ensure_dir(filedir) # connect with collection of netcdfs if isinstance(netcdfs, type(None)): netcdfs = [ os.path.join(source_directory, file) for file in os.listdir(source_directory) if file.endswith('.nc') ] ds_mf = xray.open_mfdataset(netcdfs, engine='netcdf4').sortby('TIME') # generate list of variables if not isinstance(variable_list, type(None)): ds_vars = variable_list.copy() else: ds_vars = [ ds_var for ds_var in dict(ds_mf.variables).keys() if ds_var not in ['YEAR', 'MONTH', 'DAY', 'TIME', 'LAT', 'LON'] ] # convert netcdfs to pandas.Panel API ds_pan = ds_mf.to_dataframe()[ds_vars] # read in gridded cells of interest maptable, nstation = ogh.mappingfileToDF(mappingfile, colvar=None, summary=False) # at each latlong of interest for ind, eachrow in maptable.iterrows(): # generate ASCII time-series ds_df = ds_pan.loc[eachrow['LAT'], eachrow['LONG_'], :].reset_index(drop=True, level=[0, 1]) # create file name outfilename = os.path.join( filedir, 'data_{0}_{1}'.format(eachrow['LAT'], eachrow['LONG_'])) # save ds_df outfiledict[outfilename] = da.delayed(ds_df.to_csv)( path_or_buf=outfilename, sep='\t', header=False, index=False) # compute ASCII time-series files ProgressBar().register() outfiledict = da.compute(outfiledict)[0] # annotate metadata file meta_file[catalog_label] = dict(ds_mf.attrs) meta_file[catalog_label]['variable_list'] = list(np.array(ds_vars)) meta_file[catalog_label]['delimiter'] = '\t' meta_file[catalog_label]['start_date'] = pd.Series( ds_mf.TIME).sort_values().iloc[0].strftime('%Y-%m-%d %H:%M:%S') meta_file[catalog_label]['end_date'] = pd.Series( ds_mf.TIME).sort_values().iloc[-1].strftime('%Y-%m-%d %H:%M:%S') meta_file[catalog_label]['temporal_resolution'] = temporal_resolution meta_file[catalog_label]['variable_info'] = dict(ds_mf.variables) # catalog the output files ogh.addCatalogToMap(outfilepath=mappingfile, maptable=maptable, folderpath=filedir, catalog_label=catalog_label) os.chdir(homedir) return (list(outfiledict.keys()))
def lr_deconvolution(image, psf, iterations=50): """ Tiled Lucy-Richardson deconvolution using DECON_LIBRARY :param image: ndarray raw data :param psf: ndarray theoretical PSF :param iterations: int number of iterations to run :return deconvolved: ndarray deconvolved image """ # create dask array scan_chunk_size = 512 if image.shape[0] < scan_chunk_size: dask_raw = da.from_array(image, chunks=(image.shape[0], image.shape[1], image.shape[2])) overlap_depth = (0, 2 * psf.shape[1], 2 * psf.shape[1]) else: dask_raw = da.from_array(image, chunks=(scan_chunk_size, image.shape[1], image.shape[2])) overlap_depth = 2 * psf.shape[0] del image gc.collect() if DECON_LIBRARY == 'dexp': # define dask dexp partial function for GPU LR deconvolution lr_dask = partial(dexp_lr_decon, psf=psf, num_iterations=iterations, padding=2 * psf.shape[0], internal_dtype=np.float16) else: lr_dask = partial(mv_lr_decon, psf=psf, num_iterations=iterations) # create dask plan for overlapped blocks dask_decon = da.map_overlap(lr_dask, dask_raw, depth=overlap_depth, boundary=None, trim=True, meta=np.array((), dtype=np.uint16)) # perform LR deconvolution in blocks if DECON_LIBRARY == 'dexp': with CupyBackend(enable_cutensor=True, enable_cub=True, enable_fft_planning=True): with ProgressBar(): decon_data = dask_decon.compute(scheduler='single-threaded') else: with ProgressBar(): decon_data = dask_decon.compute(scheduler='single-threaded') # clean up memory cp.clear_memo() del dask_decon gc.collect() return decon_data.astype(np.uint16)
def readData(self, runNumber=None, pulseIdInterval=None, path=None): """Read data by run number or macrobunch pulseID interval. Useful for scans that would otherwise hit the machine's memory limit. **Parameters**\n runNumber: int | None (default to ``self.runNumber``) number of the run from which to read data. If None, requires pulseIdInterval. pulseIdInterval: (int, int) | None (default to ``self.pulseIdInterval``) first and last macrobunches of selected data range. If None, the whole run defined by runNumber will be taken. path: str | None (default to ``self.DATA_RAW_DIR``) path to location where raw HDF5 files are stored. This is a union of the readRun and readInterval methods defined in previous versions. """ # Update instance attributes based on input parameters if runNumber is None: runNumber = self.runNumber else: self.runNumber = runNumber if pulseIdInterval is None: pulseIdInterval = self.pulseIdInterval else: self.pulseIdInterval = pulseIdInterval if (pulseIdInterval is None) and (runNumber is None): raise ValueError('Need either runNumber or pulseIdInterval to know what data to read.') if path is not None: try: daqAccess = BeamtimeDaqAccess.create(path) except: self.path_to_run = misc.get_path_to_run(runNumber, path) daqAccess = BeamtimeDaqAccess.create(self.path_to_run) else: path = self.DATA_RAW_DIR self.path_to_run = misc.get_path_to_run(runNumber, path) daqAccess = BeamtimeDaqAccess.create(self.path_to_run) self.daqAddresses = [] self.pulseIdInterval = self.getIds(runNumber, path) # Parse the settings file in the DAQ channels section for the list of # h5 addresses to read from raw and add to the dataframe. print('loading data...') for name, entry in self.settings['DAQ channels'].items(): name = misc.camelCaseIt(name) val = str(entry) if daqAccess.isChannelAvailable(val, self.pulseIdInterval): self.daqAddresses.append(name) if _VERBOSE: print('assigning address: {}: {}'.format(name.ljust(20), val)) setattr(self, name, val) else: # if _VERBOSE: print('skipping address missing from data: {}: {}'.format(name.ljust(20), val)) # TODO: get the available pulse id from PAH if pulseIdInterval is None: print('Reading DAQ data from run {}... 
Please wait...'.format(runNumber)) for address_name in self.daqAddresses: if _VERBOSE: print('reading address: {}'.format(address_name)) try: attrVal = getattr(self, address_name) values, otherStuff = daqAccess.allValuesOfRun(attrVal, runNumber) except AssertionError: print('Assertion error: {}'.format(address_name, attrVal, values, otherStuff )) setattr(self, address_name, values) if address_name == 'macroBunchPulseId': # catch the value of the first macrobunchID pulseIdInterval = (otherStuff[0], otherStuff[-1]) self.pulseIdInterval = pulseIdInterval macroBunchPulseId_correction = pulseIdInterval[0] if address_name == 'timeStamp': # catch the time stamps startEndTime = (values[0,0], values[-1,0]) self.startEndTime = startEndTime numOfMacrobunches = pulseIdInterval[1] - pulseIdInterval[0] else: print('reading DAQ data from interval {}'.format(pulseIdInterval)) self.pulseIdInterval = pulseIdInterval for address_name in self.daqAddresses: if _VERBOSE: print('reading address: {}'.format(address_name)) setattr(self, address_name, daqAccess.valuesOfInterval(getattr(self, address_name), pulseIdInterval)) numOfMacrobunches = pulseIdInterval[1] - pulseIdInterval[0] macroBunchPulseId_correction = pulseIdInterval[0] # necessary corrections for specific channels: try: self.delayStage = self.delayStage[:, 1] except: try: self.delayStage = self.delayStage[:, 0] print('1030nm Laser') except: print('no delay stage') self.macroBunchPulseId -= macroBunchPulseId_correction self.dldMicrobunchId -= self.UBID_OFFSET if _VERBOSE: print('Counting electrons...') electronsToCount = self.dldPosX.copy().flatten() electronsToCount = np.nan_to_num(electronsToCount) electronsToCount = electronsToCount[electronsToCount > 0] electronsToCount = electronsToCount[electronsToCount < 10000] self.numOfElectrons = len(electronsToCount) self.electronsPerMacrobunch = int(self.numOfElectrons / numOfMacrobunches) self.runInfo = { 'runNumber':self.runNumber, 'pulseIdInterval':self.pulseIdInterval, 'numberOfMacrobunches': numOfMacrobunches, 'numberOfElectrons':self.numOfElectrons, 'electronsPerMacrobunch': self.electronsPerMacrobunch, } try: self.runInfo['timestampStart'] = self.startEndTime[0].astype(int) self.runInfo['timestampStop'] = self.startEndTime[1].astype(int) self.runInfo['timestampDuration'] = self.startEndTime[1]-self.startEndTime[0].astype(int) self.runInfo['timeStart'] = datetime.utcfromtimestamp(self.startEndTime[0]).strftime('%Y-%m-%d %H:%M:%S') self.runInfo['timeStop'] = datetime.utcfromtimestamp(self.startEndTime[1]).strftime('%Y-%m-%d %H:%M:%S') self.runInfo['timeDuration'] = datetime.timedelta(self.startEndTime[1]-self.startEndTime[0]) except: self.runInfo['timestampStart'] = None self.runInfo['timestampStop'] = None self.runInfo['timestampDuration'] = None self.runInfo['timeStart'] = None self.runInfo['timeStop'] = None self.runInfo['timeDuration'] = None self.printRunOverview() # Old Print style # print('Run {0} contains {1:,} Macrobunches, from {2:,} to {3:,}' \ # .format(runNumber, numOfMacrobunches, pulseIdInterval[0], pulseIdInterval[1])) # try: # print("start time: {}, end time: {}, total time: {}" # .format(datetime.utcfromtimestamp(startEndTime[0]).strftime('%Y-%m-%d %H:%M:%S'), # datetime.utcfromtimestamp(startEndTime[1]).strftime('%Y-%m-%d %H:%M:%S'), # datetime.utcfromtimestamp(startEndTime[1] - startEndTime[0]).strftime('%H:%M:%S'))) # except: # pass # # print("Number of electrons: {0:,}; {1:,} e/Mb ".format(self.numOfElectrons, self.electronsPerMacrobunch)) print("Creating dataframes... 
Please wait...") with ProgressBar(): self.createDataframePerElectron() print('Electron dataframe created.') self.createDataframePerMicrobunch() print('Microbunch dataframe created.') print('Reading Complete.')
def test_no_tasks(capsys): with ProgressBar(): get({'x': 1}, 'x') check_bar_completed(capsys)
def test_minimum_time(capsys): with ProgressBar(1.0): out = get(dsk, 'e') out, err = capsys.readouterr() assert out == '' and err == ''
def predict(args): # Numpy arrays # Convert source data into dask arrays radec, stokes = parse_sky_model(args.sky_model) radec = da.from_array(radec, chunks=(SOURCE_CHUNKS, 2)) stokes = da.from_array(stokes, chunks=(SOURCE_CHUNKS, 4)) # Get the support tables tables = support_tables(args, ["FIELD", "DATA_DESCRIPTION", "SPECTRAL_WINDOW", "POLARIZATION"]) field_ds = tables["FIELD"] ddid_ds = tables["DATA_DESCRIPTION"] spw_ds = tables["SPECTRAL_WINDOW"] pol_ds = tables["POLARIZATION"] # List of write operations writes = [] # Construct a graph for each DATA_DESC_ID for xds in xds_from_ms(args.ms, columns=["UVW", "ANTENNA1", "ANTENNA2", "TIME"], group_cols=["FIELD_ID", "DATA_DESC_ID"], chunks={"row": args.row_chunks}): # Extract frequencies from the spectral window associated # with this data descriptor id field = field_ds[xds.attrs['FIELD_ID']] ddid = ddid_ds[xds.attrs['DATA_DESC_ID']] spw = spw_ds[ddid.SPECTRAL_WINDOW_ID.values] pol = pol_ds[ddid.POLARIZATION_ID.values] frequency = spw.CHAN_FREQ.data corrs = pol.NUM_CORR.values lm = radec_to_lm(radec, field.PHASE_DIR.data) uvw = -xds.UVW.data if args.invert_uvw else xds.UVW.data # (source, row, frequency) phase = phase_delay(lm, uvw, frequency) brightness = convert(stokes, ["I", "Q", "U", "V"], corr_schema(pol)) # (source, row, frequency, corr1, corr2) jones = da.einsum(einsum_schema(pol), phase, brightness) # Identify time indices _, time_index = da.unique(xds.TIME.data, return_inverse=True) # Predict visibilities vis = predict_vis(time_index, xds.ANTENNA1.data, xds.ANTENNA2.data, None, jones, None, None, None, None) # Reshape (2, 2) correlation to shape (4,) if corrs == 4: vis = vis.reshape(vis.shape[:2] + (4,)) # Assign visibilities to MODEL_DATA array on the dataset model_data = xr.DataArray(vis, dims=["row", "chan", "corr"]) xds = xds.assign(MODEL_DATA=model_data) # Create a write to the table write = xds_to_table(xds, args.ms, ['MODEL_DATA']) # Add to the list of writes writes.append(write) # Submit all graph computations in parallel with ProgressBar(): dask.compute(writes)
def predict_xr(model, input_xr, progress=True): """ Utilise our wrappers to predict with a vanilla sklearn model. Last modified: September 2019 Parameters ---------- model : a scikit-learn model or compatible object Must have a predict() method that takes numpy arrays. input_xr : xarray.DataArray or xarray.Dataset Must have dimensions 'x' and 'y', may have dimension 'time'. Returns ---------- output_xr : xarray.DataArray An xarray.DataArray containing the prediction output from model with input_xr as input. Has the same spatiotemporal structure as input_xr. """ def _get_class_ufunc(*args): """ ufunc to apply classification to chunks of data """ input_data_flattened = [] for data in args: input_data_flattened.append(data.flatten()) # Flatten array input_data_flattened = np.array(input_data_flattened).transpose() # Mask out no-data in input (not all classifiers can cope with # Inf or NaN values) input_data_flattened = np.where(np.isfinite(input_data_flattened), input_data_flattened, 0) # Actually apply the classification out_class = model.predict(input_data_flattened) # Mask out NaN or Inf values in results out_class = np.where(np.isfinite(out_class), out_class, 0) # Reshape when writing out return out_class.reshape(args[0].shape) def _get_class(*args): """ Apply classification to xarray DataArrays. Uses dask to run chunks at a time in parallel """ out = xr.apply_ufunc(_get_class_ufunc, *args, dask='parallelized', output_dtypes=[np.uint8]) return out # Set up a list of input data using variables passed in input_data = [] for var_name in input_xr.data_vars: input_data.append(input_xr[var_name]) # Run through classification. Need to expand and have a separate # dataframe for each variable so chunking in dask works. if progress: with ProgressBar(): out_class = _get_class(*input_data).compute() else: out_class = _get_class(*input_data).compute() # Set the stacked coordinate to match the input output_xr = xr.DataArray(out_class, coords=input_xr.coords) return output_xr
def analyze_samples(data, features, copy=False): """Calculate the set of specified `features` for every sample, defined as the set of molecules corresponding to every cell-gene pair. Parameters ---------- data : AnnData Spatially formatted AnnData features : list of :class:`SampleFeature` List of :class:`SampleFeature` to compute. chunks : int, optional Number of partitions to use, passed to `dask`, by default None. chunksize : int, optional Size of partitions, passed to `dask`, by default None. copy : bool Return a copy of `data` instead of writing to data, by default False. Returns ------- adata : anndata.AnnData Returns `adata` if `copy=True`, otherwise adds fields to `data`: `.layers[`keys`]` See the output of each :class:`SampleFeature` in `features` for keys added. """ adata = data.copy() if copy else data pbar = tqdm(desc="Cell features", total=3) # Cast features to type list if not isinstance(features, list): features = [features] features = [sample_features[f] for f in features] cell_features = set() # Cell-level fns to run cell_attributes = set( ) # Cell-level attributes needed to compute sample features for f in features: cell_features.update(f.cell_features) cell_attributes.update(f.cell_attributes) cell_features = list(cell_features) cell_attributes = list(cell_attributes) tl.analyze_cells(adata, cell_features, progress=False) # Make sure attributes are present attrs_found = set(cell_attributes).intersection( set(adata.obs.columns.tolist())) if len(attrs_found) != len(cell_attributes): raise KeyError(f"df does not have all columns: {cell_attributes}.") pbar.update() pbar.set_description("Sample features") # extract cell attributes points_df = (get_points(adata, asgeo=True).set_index("cell").join( data.obs[cell_attributes]).reset_index().sort_values( ["cell", "gene"]).reset_index(drop=True)) # Handle categories as strings to avoid ambiguous cat types for col in points_df.loc[:, (points_df.dtypes == 'category').values]: points_df[col] = points_df[col].astype(str) # Handle shape indexes as strings to avoid ambiguous types for shape_name in adata.obs.columns[adata.obs.columns.str.endswith( '_shape')]: shape_prefix = '_'.join(shape_name.split('_')[:-1]) if shape_prefix in points_df.columns: points_df[shape_prefix] = points_df[shape_prefix].astype(str) # Calculate features for a sample def process_sample(df): sample_output = {} for f in features: sample_output.update(f.extract(df)) return sample_output # Process all samples in a partition def process_partition(partition_df): return partition_df.groupby(["cell", "gene"], observed=True).apply(process_sample) # Cast to dask dataframe ddf = dask_geopandas.from_geopandas(points_df, npartitions=1) # Partition so only 1000 groups per groupby _, group_loc = np.unique( points_df["cell"].astype(str) + "-" + points_df["gene"].astype(str), return_index=True, ) divisions = [group_loc[loc] for loc in range(0, len(group_loc), 1000)] divisions.append(len(points_df) - 1) ddf = ddf.repartition(divisions=divisions) # Parallel process each partition with ProgressBar(): # Run on a single sample to get output metadata meta_output = process_partition(points_df.head()) meta = pd.DataFrame(meta_output.tolist(), index=meta_output.index) output = ddf.map_partitions(process_partition, meta=meta.dtypes).compute() pbar.update() pbar.set_description("Saving to AnnData") # Format from Series of dicts to DataFrame output = pd.DataFrame(output.tolist(), index=output.index).reset_index() # Save results to data layers feature_names = 
output.columns[~output.columns.isin(["cell", "gene"])] for feature_name in feature_names: adata.layers[feature_name] = (output.pivot( index="cell", columns="gene", values=feature_name).reindex( index=adata.obs_names, columns=adata.var_names).astype(float)) pbar.update() pbar.set_description('Done!') pbar.close()
def sumTokenCounts(stores,data): max_str_bytes = 50 chunksize = 100000 batch_limit = 6*10**8 savestore = data + "final/fromnodes-323.h5" for storefile in stores: print(storefile) logging.info("Next store: %s" % storefile) try: # Get Unique languages with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store: langs = set([key.split("/", maxsplit=-1)[-1] for key in store.keys() if 'merged1' in key]) except: logging.exception("Can't read languages from %s" % storefile) continue for lang in langs: batch = False logging.info("Starting lang %s from %s" % (lang, storefile)) print(lang) if not re.match('[a-z]{3}', lang): logging.error("lang '%s' is not three alphanumeric characters. Skipping for now. (%s)" % (lang, storefile)) continue try: ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, mode='r') except: logging.exception("Can't load Dask DF for %s in %s" % (lang, storefile)) continue # Assuming partitions are equally sized, which they should be if read from a single file if ddf.npartitions > np.ceil(batch_limit/chunksize): batch = True niters = np.floor((ddf.npartitions*chunksize)/batch_limit) i = 0 while True: if batch: start = i * batch_limit logging.info("Starting batch %d for %s" % (i, lang)) if i == niters: # Last batch, no stop value ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, start=start) batch = False else: ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize,start=start, stop=(start+batch_limit)) i += 1 try: logging.info("Starting full merge for %s with %d partitions" % (lang, ddf.npartitions)) with ProgressBar(): full_merge = ddf.reset_index().groupby('token').sum().compute() #if lang == 'eng': # For curiosity: see the profiling for English # prof.visualize() logging.info("Success! Saving merged.") # The /fromnodes table is the sum from all the different stores, but will need to be summed one more time with pd.HDFStore(savestore, complevel=9, mode="a", complib='blosc') as store: store.append(lang,full_merge,data_columns=['count'],min_itemsize = {'index': max_str_bytes}) except: logging.exception("Can't compute or save lang for %s in %s" % (lang, storefile)) if batch == False: break
def to_dask(self, pages=None, persist=False, progress=True): try: import dask except ImportError: raise RuntimeError("Dask is not installed.") if progress: from dask.diagnostics import ProgressBar ProgressBar().register() if pages is None: pages = self.page_numbers columns = [(k, DASK_TYPE_MAPPING[v.get("type", 'string')]) for k, v in self.schema.items() if k in self.fields and not k.startswith("_")] column_types = dict(columns) url = self._url client_kwargs = self.session.get_client_kwargs() if client_kwargs["app"] is not None: client_kwargs["app"] = dict(client_kwargs["app"].config) def get_data(params): import httpx if client_kwargs["app"] is not None: from eve import Eve client_kwargs["app"] = Eve(settings=client_kwargs["app"]) items = [] with httpx.Client(**client_kwargs) as client: try: resp = client.get( url, params=params, ) items = resp.json().get("_items", []) except: pass data = [{ k: column_types[k](v) for k, v in item.items() if k in column_types } for item in items] return data if not self.is_tabular: import dask.bag as db return db.from_sequence([self.get_page_kwargs(i) for i in pages]).map(get_data).flatten() import dask.dataframe as dd import pandas as pd def get_df(params): data = get_data(params) return pd.DataFrame(data, columns=list(column_types)) dask_name = str( hash((self.name, ) + tuple(self.get_page_kwargs(1).values()))) dsk = {(dask_name, i - 1): (get_df, self.get_page_kwargs(i)) for i in pages} nitems = self.nitems divisions = list(range(0, nitems, self.items_per_page)) if nitems not in divisions: divisions = divisions + [nitems] df = dd.DataFrame(dsk, dask_name, columns, divisions) if persist: return df.persist() return df
def _load_basic_dataframe(df_file=None, datatype='sim', config='IC86.2012', energy_reco=True, energy_cut_key='reco_log_energy', log_energy_min=None, log_energy_max=None, columns=None, n_jobs=1, verbose=False, compute=True): validate_datatype(datatype) if df_file is not None: files = df_file else: paths = get_config_paths() file_pattern = os.path.join(paths.comp_data_dir, config, datatype, 'processed_hdf', 'nominal' if datatype == 'sim' else '', '*.hdf') files = sorted(glob.glob(file_pattern)) ddf = dd.read_hdf(files, key='dataframe', mode='r', columns=columns, chunksize=10000) # Energy reconstruction if energy_reco: model_dict = load_trained_model( 'linearregression_energy_{}'.format(config), return_metadata=True) pipeline = model_dict['pipeline'] feature_list = list(model_dict['training_features']) def add_reco_energy(partition): partition['reco_log_energy'] = pipeline.predict( partition[feature_list]) partition['reco_energy'] = 10**partition['reco_log_energy'] return partition ddf = ddf.map_partitions(add_reco_energy) # Energy range cut if log_energy_min is not None and log_energy_max is not None: def apply_energy_cut(partition): energy_mask = (partition[energy_cut_key] > log_energy_min) & ( partition[energy_cut_key] < log_energy_max) return partition.loc[energy_mask, :] ddf = ddf.map_partitions(apply_energy_cut) if compute: if verbose: pbar = ProgressBar() pbar.register() scheduler = 'processes' if n_jobs > 1 else 'synchronous' df = ddf.compute(scheduler=scheduler, num_workers=n_jobs) df = df.reset_index(drop=True) else: df = ddf return df
def cook_data(filepath, threshold, maxamp, Nchannel, Ychannel, outpath="", model_path="", CNN_window=300, baseline_int_window=20, lg_baseline_offset=0, sg_baseline_offset=0, frac=0.3, lg=0, sg=0, cleanUp=False, blocksize=25 * 10**6, repatition_factor=16): """Uses dask to process the txt output of WaveDump on all available logical cores and return a simple dataframe in parquet format \nfilepath = Path to file. Use * as a wildcard to read multiple textfile: e.g. file*.txt, will read file1.txt, file2.txt, file3.txt, etc into the same dataframe. \nthreshold = Wavedump triggers on all channels when one channel triggers, so to throw away empty events we must reenforce the threshold. \nmaxamp = Max amplitude varies with the offset used in the wavedump config file. If a pulse reaches the maxAmp then we want to throw it away as it is likely to have some part cut off. \nNchannel, Ychannel = the channel numbers in which neutron and gamma detectors are placed \noutpath = The path where the resulting dataframe is stored. \nbaseline_integration_window =20 integer number of bins used in baseline determination. \nlg/sg_baseline_offset = the baseline offset we use when integrating pulses, in order to compensate for underflow, and in order to rotate or linearize psd spectrum \nfine_baseline_offset: The baseline is forced to be an integer. The non integer part is multiplied by 1000 and cast to an int for later use in pulse integration. \ncleanUp: Boolean, wether to write events that \'failed\' for various reason (cfd trig fail or wobbly baseline). These events will be a small fraction, provided a reasonable threshold was applied. By default this parameter is false since they can be filter out using query(\'invalid==False\'), and are useful for debugging and only take up a little space. \nfrac=0.3 = the fraction of peak amplitude used in the cfd algorithm. \nlg=200 = the width of the longgate integration window in nanoseconds, \nsg=22 = width of the shortgate integration window in nanoseconds, \nblocksize=25*10**6 = The amount of data in bytes that will be processed on each thread/logica core. Experiment to find a value that works for your machine specs. Likely it will be between 10 and 100 MB""" filesize = os.stat(filepath).st_size Nblocks = int(round(0.5 + (filesize / blocksize / repatition_factor))) print('processing ', filesize, ' bytes. Will generate', Nblocks, ' blocks') print('Generating lazy instructions.') #==================# # Read in the file # #==================# df = dd.read_csv(filepath, header=None, usecols=[0, 2, 3, 5, 7], names=[ 'window_width', 'channel', 'event_number', 'timestamp', 'samples' ], dtype={ 'window_width': np.int32, 'channel': np.int8, 'event_number': np.int64, 'timestamp': np.int64, 'samples': np.object }, blocksize=blocksize) #====================# # Format the samples # #====================# #first convert the string into an integer array. Then subtract the baseline. df['samples'] = df['samples'].str.split().apply( lambda x: np.array(x, dtype=np.int16), meta=df['samples']) df['samples'] = df['samples'].apply( lambda x: x - int(round(np.average(x[0:baseline_int_window]))), meta=df['samples']) #The baseline is forced to be an integer. The non integer part is multiplied by 1000 and cast to an int for later use in pulse integration. 
df['fine_baseline_offset'] = np.int16(0) df['fine_baseline_offset'] = df.apply(lambda x: int( 0.5 + 1000 * np.average(x.samples[0:baseline_int_window])), meta=df['fine_baseline_offset'], axis=1) #====================================# # Get amplitude and location of peak # #====================================# df['amplitude'] = df['samples'].apply(lambda x: np.max(np.absolute(x)), meta=df['samples']).astype(np.int16) df['peak_index'] = df['samples'].apply(np.argmin, meta=df['samples']).astype(np.int16) #====================# # Pulse integrations # #====================# # offsetting each bin by a certain baseline offset is equivalent to adding the product # of the integration window and the baseline offset to the integration. df = pulse_integration(df, lg, sg) #=======================# # generate cfd triggers # #=======================# df['cfd_trig_rise'] = np.int32(0) df['cfd_trig_rise'] = df.apply(lambda x: cfd(x, frac=0.3), meta=df['cfd_trig_rise'], axis=1) #===================# # Handle bad events # #===================# #Throw away events whose amplitude is below the threshold #df = df[df['amplitude'] >= ch_thr_mask[df['channel']]] df = df[df['amplitude'] >= threshold] #And those whose amplitude is greater than the expected maximum amplitude (likely have their tops cut off) df['cutoff'] = False df['cutoff'] = df['cutoff'].where(df['amplitude'] < maxamp, True) #df = df[maxamp > df['amplitude']] #and those whose baseline jitters too much. df['baseline_std'] = np.float64(0) df['baseline_std'] = df['samples'].apply( lambda x: np.std(x[0:baseline_int_window]), meta=df['baseline_std']) #df = df[df['baseline_std'] < 2] df['wobbly_baseline'] = False df['wobbly_baseline'] = df['wobbly_baseline'].where( df['baseline_std'] < 2, True) #and those where the cfd triggering failed.cfd_trig_rise = -1 implies error. This occurs if the first bin is #above the cfd trigger point. We also want to ensure that the cfd trigger happens after the baseline determination #and with enough bins following to allow proper lg integration. df['cfd_too_early'] = False df['cfd_too_early'] = df['cfd_too_early'].where( df['cfd_trig_rise'] / 1000 > baseline_int_window, True) #df = df[baseline_int_window*1000 < df['cfd_trig_rise']] #ensure baseline int window df['cfd_too_late_lg'] = False df['cfd_too_late_lg'] = df['cfd_too_late_lg'].where( df['cfd_trig_rise'] / 1000 < (df['window_width'] - lg), True) #df = df[df['cfd_trig_rise'] < 1000*(df['window_width']-lg)] # ensure lg integration window #===========================# #Time of Flight correlations# #===========================# shift = int((Nchannel - Ychannel) / abs(Nchannel - Ychannel)) df = get_tof(df, Nchannel, Ychannel, shift) #=======================================# #Convolutional neural network prediction# #=======================================# df['cfd_too_late_CNN'] = False if (model_path): df = cnn_discrim(df, model_path, CNN_window) #General Goodness parameter df['invalid'] = df['cutoff'] | df['wobbly_baseline'] | df[ 'cfd_too_early'] | df['cfd_too_late_lg'] | df['cfd_too_late_CNN'] #Throw away or keep bad events? I recommend keeping bad events. They can be useful for debugging, and you can choose #not to load them later by choosing 'mode' in load_data_frame() function. 
    if cleanUp:
        df = df[~df['invalid']]
    with ProgressBar():
        if outpath:
            # repartition the dataframe into fewer (and larger) blocks (never below 1)
            df = df.repartition(
                npartitions=max(df.npartitions // repatition_factor, 1))
            # save to disk
            print('Processing dataframe and saving to disk')
            df.to_parquet(outpath, engine='pyarrow', compression='snappy')
        return df
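# --------------------------------------------------------------------------
# Illustrative use of cook_data (a sketch, not from the original source).
# The file name, channel numbers, threshold and gate widths below are
# placeholder assumptions for a typical WaveDump run.
# --------------------------------------------------------------------------
import dask.dataframe as dd

cooked = cook_data(filepath='wave_dump_run1.txt',   # assumed raw WaveDump text dump
                   threshold=25,                    # re-applied per-channel threshold (ADC counts)
                   maxamp=1020,                     # pulses reaching this amplitude are flagged as cut off
                   Nchannel=0, Ychannel=1,          # neutron / gamma detector channels
                   outpath='run1_parquet',          # parquet output directory
                   lg=200, sg=22)                   # long / short gates in ns

# The parquet store can be reloaded lazily in a later session and the
# 'invalid' flag used to drop events that failed the quality cuts:
good_events = dd.read_parquet('run1_parquet', engine='pyarrow').query('invalid == False')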
def main(argv=sys.argv[1:]): global LOG from satpy import Scene from satpy.resample import get_area_def from satpy.writers import compute_writer_results from dask.diagnostics import ProgressBar from polar2grid.core.script_utils import ( setup_logging, rename_log_file, create_exc_handler) import argparse prog = os.getenv('PROG_NAME', sys.argv[0]) # "usage: " will be printed at the top of this: usage = """ %(prog)s -h see available products: %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...] basic processing: %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...] basic processing with limited products: %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...] """ parser = argparse.ArgumentParser(prog=prog, usage=usage, description="Load, composite, resample, and save datasets.") parser.add_argument('-v', '--verbose', dest='verbosity', action="count", default=0, help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)') parser.add_argument('-l', '--log', dest="log_fn", default=None, help="specify the log filename") parser.add_argument('--progress', action='store_true', help="show processing progress bar (not recommended for logged output)") parser.add_argument('--num-workers', type=int, default=4, help="specify number of worker threads to use (default: 4)") parser.add_argument('--match-resolution', dest='preserve_resolution', action='store_false', help="When using the 'native' resampler for composites, don't save data " "at its native resolution, use the resolution used to create the " "composite.") parser.add_argument('-w', '--writers', nargs='+', help='writers to save datasets with') parser.add_argument("--list-products", dest="list_products", action="store_true", help="List available reader products and exit") subgroups = add_scene_argument_groups(parser) subgroups += add_resample_argument_groups(parser) argv_without_help = [x for x in argv if x not in ["-h", "--help"]] args, remaining_args = parser.parse_known_args(argv_without_help) # get the logger if we know the readers and writers that will be used if args.reader is not None and args.writers is not None: glue_name = args.reader + "_" + "-".join(args.writers or []) LOG = logging.getLogger(glue_name) # add writer arguments if args.writers is not None: for writer in (args.writers or []): parser_func = WRITER_PARSER_FUNCTIONS.get(writer) if parser_func is None: continue subgroups += parser_func(parser) args = parser.parse_args(argv) if args.reader is None: parser.print_usage() parser.exit(1, "\nERROR: Reader must be provided (-r flag).\n" "Supported readers:\n\t{}\n".format('\n\t'.join(['abi_l1b', 'ahi_hsd', 'hrit_ahi']))) if args.writers is None: parser.print_usage() parser.exit(1, "\nERROR: Writer must be provided (-w flag) with one or more writer.\n" "Supported writers:\n\t{}\n".format('\n\t'.join(['geotiff']))) def _args_to_dict(group_actions): return {ga.dest: getattr(args, ga.dest) for ga in group_actions if hasattr(args, ga.dest)} scene_args = _args_to_dict(subgroups[0]._group_actions) load_args = _args_to_dict(subgroups[1]._group_actions) resample_args = _args_to_dict(subgroups[2]._group_actions) writer_args = {} for idx, writer in enumerate(args.writers): sgrp1, sgrp2 = subgroups[3 + idx * 2: 5 + idx * 2] wargs = _args_to_dict(sgrp1._group_actions) if sgrp2 is not None: wargs.update(_args_to_dict(sgrp2._group_actions)) writer_args[writer] = wargs # get default output filename if 'filename' in wargs and wargs['filename'] is None: 
wargs['filename'] = get_default_output_filename(args.reader, writer) if not args.filenames: parser.print_usage() parser.exit(1, "\nERROR: No data files provided (-f flag)\n") # Prepare logging rename_log = False if args.log_fn is None: rename_log = True args.log_fn = glue_name + "_fail.log" levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] setup_logging(console_level=levels[min(3, args.verbosity)], log_filename=args.log_fn) logging.getLogger('rasterio').setLevel(levels[min(2, args.verbosity)]) sys.excepthook = create_exc_handler(LOG.name) if levels[min(3, args.verbosity)] > logging.DEBUG: import warnings warnings.filterwarnings("ignore") LOG.debug("Starting script with arguments: %s", " ".join(sys.argv)) # Set up dask and the number of workers if args.num_workers: from multiprocessing.pool import ThreadPool dask.config.set(pool=ThreadPool(args.num_workers)) # Parse provided files and search for files if provided directories scene_args['filenames'] = get_input_files(scene_args['filenames']) # Create a Scene, analyze the provided files LOG.info("Sorting and reading input files...") try: scn = Scene(**scene_args) except ValueError as e: LOG.error("{} | Enable debug message (-vvv) or see log file for details.".format(str(e))) LOG.debug("Further error information: ", exc_info=True) return -1 except OSError: LOG.error("Could not open files. Enable debug message (-vvv) or see log file for details.") LOG.debug("Further error information: ", exc_info=True) return -1 if args.list_products: print("\n".join(sorted(scn.available_dataset_names(composites=True)))) return 0 # Rename the log file if rename_log: rename_log_file(glue_name + scn.attrs['start_time'].strftime("_%Y%m%d_%H%M%S.log")) # Load the actual data arrays and metadata (lazy loaded as dask arrays) if load_args['products'] is None: try: reader_mod = importlib.import_module('polar2grid.readers.' 
+ scene_args['reader']) load_args['products'] = reader_mod.DEFAULT_PRODUCTS LOG.info("Using default product list: {}".format(load_args['products'])) except (ImportError, AttributeError): LOG.error("No default products list set, please specify with `--products`.") return -1 LOG.info("Loading product metadata from files...") scn.load(load_args['products']) resample_kwargs = resample_args.copy() areas_to_resample = resample_kwargs.pop('grids') grid_configs = resample_kwargs.pop('grid_configs') resampler = resample_kwargs.pop('resampler') if areas_to_resample is None and resampler in [None, 'native']: # no areas specified areas_to_resample = ['MAX'] elif areas_to_resample is None: raise ValueError("Resampling method specified (--method) without any destination grid/area (-g flag).") elif not areas_to_resample: # they don't want any resampling (they used '-g' with no args) areas_to_resample = [None] has_custom_grid = any(g not in ['MIN', 'MAX', None] for g in areas_to_resample) if has_custom_grid and resampler == 'native': LOG.error("Resampling method 'native' can only be used with 'MIN' or 'MAX' grids " "(use 'nearest' method instead).") return -1 p2g_grid_configs = [x for x in grid_configs if x.endswith('.conf')] pyresample_area_configs = [x for x in grid_configs if not x.endswith('.conf')] if not grid_configs or p2g_grid_configs: # if we were given p2g grid configs or we weren't given any to choose from from polar2grid.grids import GridManager grid_manager = GridManager(*p2g_grid_configs) else: grid_manager = {} if pyresample_area_configs: from pyresample.utils import parse_area_file custom_areas = parse_area_file(pyresample_area_configs) custom_areas = {x.area_id: x for x in custom_areas} else: custom_areas = {} ll_bbox = resample_kwargs.pop('ll_bbox') if ll_bbox: scn = scn.crop(ll_bbox=ll_bbox) wishlist = scn.wishlist.copy() preserve_resolution = get_preserve_resolution(args, resampler, areas_to_resample) if preserve_resolution: preserved_products = set(wishlist) & set(scn.datasets.keys()) resampled_products = set(wishlist) - preserved_products # original native scene to_save = write_scene(scn, args.writers, writer_args, preserved_products) else: preserved_products = set() resampled_products = set(wishlist) to_save = [] LOG.debug("Products to preserve resolution for: {}".format(preserved_products)) LOG.debug("Products to use new resolution for: {}".format(resampled_products)) for area_name in areas_to_resample: if area_name is None: # no resampling area_def = None elif area_name == 'MAX': area_def = scn.max_area() elif area_name == 'MIN': area_def = scn.min_area() elif area_name in custom_areas: area_def = custom_areas[area_name] elif area_name in grid_manager: from pyresample.geometry import DynamicAreaDefinition p2g_def = grid_manager[area_name] area_def = p2g_def.to_satpy_area() if isinstance(area_def, DynamicAreaDefinition) and p2g_def['cell_width'] is not None: area_def = area_def.freeze(scn.max_area(), resolution=(abs(p2g_def['cell_width']), abs(p2g_def['cell_height']))) else: area_def = get_area_def(area_name) if resampler is None and area_def is not None: rs = 'native' if area_name in ['MIN', 'MAX'] else 'nearest' LOG.debug("Setting default resampling to '{}' for grid '{}'".format(rs, area_name)) else: rs = resampler if area_def is not None: LOG.info("Resampling data to '%s'", area_name) new_scn = scn.resample(area_def, resampler=rs, **resample_kwargs) elif not preserve_resolution: # the user didn't want to resample to any areas # the user also requested that we don't preserve 
resolution # which means we have to save this Scene's datasets # because they won't be saved new_scn = scn to_save = write_scene(new_scn, args.writers, writer_args, resampled_products, to_save=to_save) if args.progress: pbar = ProgressBar() pbar.register() LOG.info("Computing products and saving data to writers...") compute_writer_results(to_save) LOG.info("SUCCESS") return 0
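# Hedged example of invoking this glue script from a shell (reader, writer and
# file paths are illustrative only, not taken from the source):
#
#   PROG_NAME=polar2grid.sh python glue.py -r abi_l1b -w geotiff --list-products -f /data/goes16/*.nc
#   PROG_NAME=polar2grid.sh python glue.py -r abi_l1b -w geotiff -p C01 C02 --progress -f /data/goes16/*.nc
#
# The --progress flag registers a dask ProgressBar just before
# compute_writer_results(to_save) triggers the actual computation.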
def make_climatology(ds, output_frequency, monthly_weights=False,
                     time_var_name='time', time_dim_name='t_dim',
                     fn_out=None, missing_values=False):
    '''
    Calculates a climatology for all variables in a supplied dataset.

    The resulting xarray dataset will NOT be loaded to RAM. Instead, it is a set of dask
    operations. To load it to RAM use, e.g., .compute(). However, if the original data
    was large, this may take a long time and a lot of memory. Make sure you have the
    available RAM, or that chunking and parallel processes are specified correctly.
    Otherwise, it is recommended that you access the climatology data in an indexed way,
    i.e. compute only specific parts of the data at once.

    The resulting climatology dataset can be written to disk using .to_netcdf(). Again,
    this may take a while for larger datasets.

    ds :: xarray dataset object from a COAsT object.
    output_frequency :: any xarray groupby string, e.g. 'month' or 'season'.
    time_var_name :: the string name of the time variable in the dataset.
    time_dim_name :: the string name of the time dimension in the dataset.
    fn_out :: string defining the full output netcdf file path and name.
    missing_values :: boolean; True indicates the data has missing values that should
                      be ignored. Missing values must be represented by NaNs.
    '''
    frequency_str = time_var_name + '.' + output_frequency
    print('Calculating climatological mean')

    if missing_values:
        ds_mean = xr.Dataset()
        for varname, da in ds.data_vars.items():
            # Count only the non-NaN samples per group so the mean ignores missing values.
            mask = xr.where(np.isnan(da), 0, 1)
            data = da.groupby(frequency_str).sum(dim=time_dim_name)
            N = mask.groupby(frequency_str).sum(dim=time_dim_name)
            ds_mean[varname] = data / N
    else:
        if monthly_weights:
            month_length = ds[time_var_name].dt.days_in_month
            grouped = month_length.groupby(frequency_str)
        else:
            ds['clim_mean_ones_tmp'] = (time_dim_name,
                                        np.ones(ds[time_var_name].shape[0]))
            grouped = ds['clim_mean_ones_tmp'].groupby(frequency_str)

        weights = grouped / grouped.sum()
        ds_mean = (ds * weights).groupby(frequency_str).sum(dim=time_dim_name)

        if not monthly_weights:
            ds = ds.drop_vars('clim_mean_ones_tmp')

    if fn_out is not None:
        print('Saving to file. May take some time..')
        with ProgressBar():
            ds_mean.to_netcdf(fn_out)

    return ds_mean
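# Minimal usage sketch for make_climatology (the file names and chunk sizes are
# assumptions; any dask-backed dataset with a datetime 'time' variable works):
ds_in = xr.open_dataset('ocean_daily.nc', chunks={'t_dim': 100})
clim = make_climatology(ds_in, 'month',
                        monthly_weights=True,
                        time_var_name='time', time_dim_name='t_dim',
                        fn_out='ocean_monthly_climatology.nc')
# The result is still lazy; pull out a single month rather than loading it all:
january = clim.isel(month=0).compute()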
def Movie(da, odir, varname=None, framedim='time', moviename='movie', clim=None, cmap=None, bgcolor=np.array([1, 1, 1]) * 0.3, framewidth=1280, frameheight=720, dpi=100, lon=None, lat=None, dask=True, delete=True, ffmpeg=True, plot_style='simple', norm=mpl.colors.Normalize(), progbar=False): # Set defaults: if not ffmpeg and delete: raise RuntimeError('raw picture deletion makes only \ sense if ffmpeg conversion is enabled') if not isinstance(da, xr.DataArray): raise RuntimeError('input has to be an xarray DataStructure, instead\ is ' + str(type(da))) if not os.path.exists(odir): os.makedirs(odir) # Infer defaults from data if clim is None: print('clim will be inferred from data, this can take very long...') clim = [da.min(), da.max()] if cmap is None: cmap = plt.cm.viridis if plot_style in ['map']: if None in [lon, lat]: raise RuntimeError('map plotting requires lon and lat') else: lons = np.array(da[lon].data) lats = np.array(da[lat].data) if len(lons.shape) != 2: lons, lats = np.meshgrid(lons, lats) time = np.array(da['time'].data) else: lons = None lats = None time = None # Annnd here we go print('+++ Execute plot function +++') if dask: data = da.data frame_axis = da.get_axis_num(framedim) drop_axis = [da.get_axis_num(a) for a in da.dims if not a == framedim] chunks = list(data.shape) chunks[frame_axis] = 1 data = data.rechunk(chunks) if progbar: pbar = ProgressBar() pbar.register() data.map_blocks(FramePrint, chunks=[1], drop_axis=drop_axis, dtype=np.float64, dask=dask, frame_axis=frame_axis, odir=odir, cmap=cmap, clim=clim, framewidth=framewidth, frameheight=frameheight, bgcolor=bgcolor, plot_style=plot_style, lons=lons, lats=lats, time=time, norm=norm, dpi=dpi).compute(get=get) if progbar: pbar.unregister() # The .compute(get=get) line is some dask 'magic': it parallelizes the # print function with processes and not threads,which is a lot faster # for custom functions apparently! else: # do it with a simple for loop...can this really be quicker? print('This is slow! Do it in dask!') for ii in range(0, len(da.time)): start_time = time.time() da_slice = da[{framedim: ii}] # fig,ax,h = FramePrint(da_slice, FramePrint(da_slice, frame=ii, odir=odir, cmap=cmap, clim=clim, framewidth=framewidth, frameheight=dpi, bgcolor=bgcolor, plot_style=plot_style, lons=lons, lats=lats, norm=norm, dpi=dpi) if ii % 100 == 0: remaining_time = (len(da.time) - ii) * \ (time.time() - start_time) / 60 print('FRAME---%04d---' % ii) print('Estimated time left : %d minutes' % remaining_time) query = 'ffmpeg -y -i "frame_%05d.png" -c:v libx264 -preset veryslow \ -crf 6 -pix_fmt yuv420p \ -framerate 10 \ "' + moviename + '.mp4"' with cd(odir): if ffmpeg: print('+++ Convert frames to video +++') excode = os.system(query) if excode == 0 and delete: os.system('rm *.png')
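# Illustrative call (dataset path, variable and output directory are assumptions;
# FramePrint and ffmpeg must be available, as the function above expects):
# sst = xr.open_dataset('model_output.nc', chunks={'time': 1})['sst']
# Movie(sst, '/tmp/sst_movie', framedim='time', moviename='sst',
#       clim=[-2, 32], cmap=plt.cm.RdBu_r, dask=True, progbar=True)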
def calc_simulated_energy(wind_speed, turbines, power_curve=None, sum_along='turbines', capacity_scaling=True, only_built_turbines=True): """Estimate generated energy using wind data and turbine data. Parameters ---------- wind_speed : xr.DataArray see calc_wind_speed_at_turbines() turbines : xr.DataSet see load_turbines() power_curve : callable a function mapping wind speed to power sum_along : str sum along turbines or time or emtpy string capacity_scaling : bool scale power curve to capacity for each turbine (if available) only_built_turbines : bool calculate energy only for time stamps where commission year is older Returns ------- simulated_energy_gwh : xr.DataArray Simulated energy per month [GWh], dims = (time, turbines) FIXME this modifies the input wind_speed variable! Very dangerous but unclear if solvable without too much memory consumption (via copying) """ if power_curve is None: power_curve = ge15_77.power_curve # this outputs a deprecation warning, see https://github.com/pydata/xarray/issues/2928 # TODO probably not the best idea to have this here, since it modifies gloabl behavior at # runtime, but where else to put it? pytest ignores warnings.catch_warnings()... warnings.filterwarnings('ignore', 'The da.atop function has moved to da.blockwise') # TODO this is a bit scary, when does parallelized not work? Which dtype? simulated_energy = xr.apply_ufunc(power_curve, wind_speed, dask='parallelized', output_dtypes=[np.float64]) simulated_energy = simulated_energy.assign_coords(turbines=turbines.turbines) if only_built_turbines: # TODO all turbines where year = NaN will be removed that way... :-/ # this is the beginning of the year the turbine has been commissioned building_dates = turbines.p_year.astype(int).astype(str).astype(np.datetime64) nanosecs_of_year = (simulated_energy.time - building_dates).astype(np.float) proportion_of_year = nanosecs_of_year / (365.25 * 24 * 60 * 60 * 1e9) # comparing objects with dim "time" and dim "turbines" results in (time, turbines) building_this_year = simulated_energy.time.dt.year == turbines.p_year simulated_energy = simulated_energy.where(~building_this_year, simulated_energy * proportion_of_year) already_built = simulated_energy.time.dt.year >= turbines.p_year simulated_energy = simulated_energy.where(already_built, 0) # Uargh... there is a weired memory leak somewhere, this seems to help a bit at least... :-/ del nanosecs_of_year del proportion_of_year del building_this_year del building_dates del already_built if capacity_scaling: # FIXME this should use turbine_model.capacity_mw not 1500! simulated_energy *= (turbines.t_cap / 1500.).fillna(1.) # inspired by: # http://xarray.pydata.org/en/stable/examples/weather-data.html#monthly-averaging simulated_energy = simulated_energy.sortby('time') * 1e-6 if sum_along: simulated_energy = simulated_energy.sum(dim=sum_along) if sum_along == 'turbines': simulated_energy = simulated_energy.resample(time='1MS').sum() # Does not work for multiple years: # simulated_energy = simulated_energy.sum(dim='turbines').groupby('time.month').sum() * 1e-6 with ProgressBar(): simulated_energy_gwh = simulated_energy.compute() if sum_along == 'turbines': simulated_energy_gwh.name = "Simulated energy per month [GWh]" elif sum_along == 'time': simulated_energy_gwh.name = "Simulated energy" # TODO unit depends on time range? return simulated_energy_gwh
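# Typical call pattern suggested by the docstring above (load_turbines() and
# calc_wind_speed_at_turbines() are referenced there; values are placeholders):
# turbines = load_turbines()
# wind_speed = calc_wind_speed_at_turbines(wind_velocity, turbines)
# monthly_gwh = calc_simulated_energy(wind_speed, turbines,
#                                     sum_along='turbines', capacity_scaling=True)
# monthly_gwh.plot()   # simulated energy per month [GWh]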
def dask_linear_operator(self): self.nC = self.modelMap.shape[0] n_data_comp = len(self.survey.components) components = np.array(list(self.survey.components.keys())) active_components = np.hstack( [np.c_[values] for values in self.survey.components.values()] ).tolist() row = delayed(self.evaluate_integral, pure=True) rows = [ array.from_delayed( row(receiver_location, components[component]), dtype=np.float32, shape=(n_data_comp, self.nC), ) for receiver_location, component in zip( self.survey.receiver_locations.tolist(), active_components ) ] stack = array.vstack(rows) # Chunking options if self.chunk_format == "row" or self.store_sensitivities == "forward_only": config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"}) # Autochunking by rows is faster and more memory efficient for # very large problems sensitivty and forward calculations stack = stack.rechunk({0: "auto", 1: -1}) elif self.chunk_format == "equal": # Manual chunks for equal number of blocks along rows and columns. # Optimal for Jvec and Jtvec operations row_chunk, col_chunk = compute_chunk_sizes(*stack.shape, self.max_chunk_size) stack = stack.rechunk((row_chunk, col_chunk)) else: # Auto chunking by columns is faster for Inversions config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"}) stack = stack.rechunk({0: -1, 1: "auto"}) if self.store_sensitivities == "disk": sens_name = self.sensitivity_path + "sensitivity.zarr" if os.path.exists(sens_name): kernel = array.from_zarr(sens_name) if np.all( np.r_[ np.any(np.r_[kernel.chunks[0]] == stack.chunks[0]), np.any(np.r_[kernel.chunks[1]] == stack.chunks[1]), np.r_[kernel.shape] == np.r_[stack.shape], ] ): # Check that loaded kernel matches supplied data and mesh print("Zarr file detected with same shape and chunksize ... re-loading") return kernel else: print("Writing Zarr file to disk") with ProgressBar(): print("Saving kernel to zarr: " + sens_name) kernel = array.to_zarr( stack, sens_name, compute=True, return_stored=True, overwrite=True ) elif self.store_sensitivities == "forward_only": with ProgressBar(): print("Forward calculation: ") pred = (stack @ self.model).compute() return pred else: print(stack.chunks) with ProgressBar(): print("Computing sensitivities to local ram") kernel = array.asarray(stack.compute()) return kernel
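# Sketch of reusing a sensitivity kernel written by the 'disk' branch above
# (`sim` stands in for the simulation object; the zarr name matches the code):
# import dask.array as array
# kernel = array.from_zarr(sim.sensitivity_path + "sensitivity.zarr")
# with ProgressBar():
#     predicted = (kernel @ sim.model).compute()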
import argparse
import json
import os
import hashlib
import pathlib

from tabulate import tabulate
import format.peek as sspk
import format.split_column as sssp
import format.tab_man_gui as tmg
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import pandas as pd
from format.utils import header_mapper

pbar = ProgressBar()
pbar.register()


class Table():
    def __init__(self, file, outfile_prefix, field_sep, remove_starting):
        self.file = file
        self.outfile_prefix = outfile_prefix
        self.field_sep = field_sep
        self.ignore_pattern = remove_starting
        self.field_names = []

    def get_extension(self):
        self.file_extension = "".join(pathlib.Path(self.file).suffixes)

    def get_filename(self):
        self.get_extension()
def new(ms, sky_model, gains, **kwargs): """Generate model visibilties per source (as direction axis) for stokes I and Q and generate relevant visibilities.""" # Options to attributed dictionary if kwargs["yaml"] is not None: options = ocf.load(kwargs["yaml"]) else: options = ocf.create(kwargs) # Set to struct ocf.set_struct(options, True) # Change path to sky model if chosen try: sky_model = sky_models[sky_model.lower()] except: # Own sky model reference pass # Set thread count to cpu count if options.ncpu: from multiprocessing.pool import ThreadPool import dask dask.config.set(pool=ThreadPool(options.ncpu)) else: import multiprocessing options.ncpu = multiprocessing.cpu_count() # Load gains to corrupt with with open(gains, "rb") as file: jones = np.load(file) # Load dimensions n_time, n_ant, n_chan, n_dir, n_corr = jones.shape n_row = n_time * (n_ant * (n_ant - 1) // 2) # Load ms MS = xds_from_ms(ms)[0] # Get time-bin indices and counts row_chunks, tbin_indices, tbin_counts = chunkify_rows( MS.TIME, options.utime) # Close and reopen with chunked rows MS.close() MS = xds_from_ms(ms, chunks={"row": row_chunks})[0] # Get antenna arrays (dask ignored for now) ant1 = MS.ANTENNA1.data ant2 = MS.ANTENNA2.data # Adjust UVW based on phase-convention if options.phase_convention.upper() == 'CASA': uvw = -MS.UVW.data.astype(np.float64) elif options.phase_convention.upper() == 'CODEX': uvw = MS.UVW.data.astype(np.float64) else: raise ValueError("Unknown sign convention for phase.") # MS dimensions dims = ocf.create(dict(MS.sizes)) # Close MS MS.close() # Build source model from lsm lsm = Tigger.load(sky_model) # Check if dimensions match jones assert n_time * (n_ant * (n_ant - 1) // 2) == dims.row assert n_time == len(tbin_indices) assert n_ant == np.max((np.max(ant1), np.max(ant2))) + 1 assert n_chan == dims.chan assert n_corr == dims.corr # If gains are DIE if options.die: assert n_dir == 1 n_dir = len(lsm.sources) else: assert n_dir == len(lsm.sources) # Get phase direction radec0_table = xds_from_table(ms + '::FIELD')[0] radec0 = radec0_table.PHASE_DIR.data.squeeze().compute() radec0_table.close() # Get frequency column freq_table = xds_from_table(ms + '::SPECTRAL_WINDOW')[0] freq = freq_table.CHAN_FREQ.data.astype(np.float64)[0] freq_table.close() # Get feed orientation feed_table = xds_from_table(ms + '::FEED')[0] feeds = feed_table.POLARIZATION_TYPE.data[0].compute() # Create initial model array model = np.zeros((n_dir, n_chan, n_corr), dtype=np.float64) # Create initial coordinate array and source names lm = np.zeros((n_dir, 2), dtype=np.float64) source_names = [] # Cycle coordinates creating a source with flux print("==> Building model visibilities") for d, source in enumerate(lsm.sources): # Extract name source_names.append(source.name) # Extract position radec_s = np.array([[source.pos.ra, source.pos.dec]]) lm[d] = radec_to_lm(radec_s, radec0) # Get flux - Stokes I if source.flux.I: I0 = source.flux.I # Get spectrum (only spi currently supported) tmp_spec = source.spectrum spi = [tmp_spec.spi if tmp_spec is not None else 0.0] ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0] # Generate model flux model[d, :, 0] = I0 * (freq / ref_freq)**spi # Get flux - Stokes Q if source.flux.Q: Q0 = source.flux.Q # Get spectrum tmp_spec = source.spectrum spi = [tmp_spec.spi if tmp_spec is not None else 0.0] ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0] # Generate model flux model[d, :, 1] = Q0 * (freq / ref_freq)**spi # Get flux - Stokes U if source.flux.U: U0 = source.flux.U # 
Get spectrum tmp_spec = source.spectrum spi = [tmp_spec.spi if tmp_spec is not None else 0.0] ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0] # Generate model flux model[d, :, 2] = U0 * (freq / ref_freq)**spi # Get flux - Stokes V if source.flux.V: V0 = source.flux.V # Get spectrum tmp_spec = source.spectrum spi = [tmp_spec.spi if tmp_spec is not None else 0.0] ref_freq = [tmp_spec.freq0 if tmp_spec is not None else 1.0] # Generate model flux model[d, :, 3] = V0 * (freq / ref_freq)**spi # Close sky-model del lsm # Build dask graph tbin_indices = da.from_array(tbin_indices, chunks=(options.utime)) tbin_counts = da.from_array(tbin_counts, chunks=(options.utime)) lm = da.from_array(lm, chunks=lm.shape) model = da.from_array(model, chunks=model.shape) jones = da.from_array(jones, chunks=(options.utime, ) + jones.shape[1::]) # Apply image to visibility for each source sources = [] for s in range(n_dir): source_vis = im_to_vis(model[s].reshape((1, n_chan, n_corr)), uvw, lm[s].reshape((1, 2)), freq, dtype=np.complex64, convention='fourier') sources.append(source_vis) model_vis = da.stack(sources, axis=2) # Sum over direction? if options.die: model_vis = da.sum(model_vis, axis=2, keepdims=True) n_dir = 1 source_names = [options.mname] # Select schema based on feed orientation if (feeds == ["X", "Y"]).all(): out_schema = [["XX", "XY"], ["YX", "YY"]] elif (feeds == ["R", "L"]).all(): out_schema = [['RR', 'RL'], ['LR', 'LL']] else: raise ValueError("Unknown feed orientation implementation.") # Convert Stokes to Correlations in_schema = ['I', 'Q', 'U', 'V'] model_vis = convert(model_vis, in_schema, out_schema).reshape( (n_row, n_chan, n_dir, n_corr)) # Apply gains to model_vis print("==> Corrupting visibilities") data = corrupt_vis(tbin_indices, tbin_counts, ant1, ant2, jones, model_vis) # Reopen MS MS = xds_from_ms(ms, chunks={"row": row_chunks})[0] # Assign model visibilities out_names = [] for d in range(n_dir): MS = MS.assign( **{ source_names[d]: (("row", "chan", "corr"), model_vis[:, :, d].astype(np.complex64)) }) out_names += [source_names[d]] # Assign noise free visibilities to 'CLEAN_DATA' MS = MS.assign( **{ 'CLEAN_' + options.dname: (("row", "chan", "corr"), data.astype(np.complex64)) }) out_names += ['CLEAN_' + options.dname] # Get noise realisation if options.std > 0.0: # Noise matrix print(f"==> Applying noise (std={options.std}) to visibilities") noise = [] for i in range(2): real = da.random.normal(loc=0.0, scale=options.std, size=(n_row, n_chan), chunks=(row_chunks, n_chan)) imag = 1.0j * (da.random.normal(loc=0.0, scale=options.std, size=(n_row, n_chan), chunks=(row_chunks, n_chan))) noise.append(real + imag) # Zero matrix for off-diagonals zero = da.zeros((n_row, n_chan), chunks=(row_chunks, n_chan)) noise.insert(1, zero) noise.insert(2, zero) # NP to Dask noise = da.stack(noise, axis=2).rechunk((row_chunks, n_chan, n_corr)) # Assign noise to 'NOISE' MS = MS.assign( **{'NOISE': (("row", "chan", "corr"), noise.astype(np.complex64))}) out_names += ['NOISE'] # Add noise to data and assign to 'DATA' noisy_data = data + noise MS = MS.assign( **{ options.dname: (("row", "chan", "corr"), noisy_data.astype(np.complex64)) }) out_names += [options.dname] # Create a write to the table write = xds_to_table(MS, ms, out_names) # Submit all graph computations in parallel print(f"==> Executing `dask-ms` write to `{ms}` for the following columns: "\ + f"{', '.join(out_names)}") with ProgressBar(): write.compute() print(f"==> Completed.")
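# Hedged example of driving new() (the Measurement Set, sky model key and gains
# file are placeholders; keyword names follow the options used above):
# new('obs.ms', 'model1', 'gains.npy',
#     yaml=None, ncpu=8, utime=16, die=False, std=1.0e-3,
#     phase_convention='CASA', dname='DATA', mname='MODEL_DATA')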
df_pros = pd.read_csv(filepath, header=0, sep='|', quoting=3, dtype='str',
                      encoding='utf-8', na_values=[' ', '']).fillna('')
# `annonces` is the dask DataFrame of listings loaded earlier in the script
# (see the commented-out read below for the expected layout).
#annonces = dd.read_csv(r'D:\25. Requests\IMMO_FR processing script\ANNONCES_2020_08.csv', header=0, sep='|', quoting=3, dtype='object', encoding='utf-8', na_values=([' ','']))
#df_pros = pd.read_csv(r'D:\25. Requests\IMMO_FR processing script\PRO_2020_08.csv', header=0, sep='|', quoting=3, dtype='str', encoding='utf-8', na_values=([' ',''])).fillna('')

start = timer()
print("Start exporting")

# EXPORT PERCENTAGE OF NULL/MISSING VALUES PER WEBSITE
print("Export: emptiness_per_website.xlsx")
column_list = annonces.columns.tolist()
websites = annonces['SITE_ANNONCE'].unique().compute().tolist()
print("There are {} websites".format(len(websites)))
pct_missing_per_website = pd.DataFrame()
with ProgressBar():
    for website in websites:
        print(website)
        temp_df = annonces[annonces['SITE_ANNONCE'] == website].compute()
        temp_null_count = temp_df.isnull().sum()
        temp_site_count = len(temp_df.index)
        pct_missing_per_website[website] = round(temp_null_count / temp_site_count * 100, 2)
pct_missing_per_website.to_excel(os.path.join(save_path, 'emptiness_per_website.xlsx'))

# EXPORT PERCENTAGE OF TOTAL NULL/MISSING VALUES
print("Export: emptiness_total.xlsx")
with ProgressBar():
    null_count_total = annonces.isnull().sum().compute()
total_length = len(annonces.index)
null_percent_total = round(null_count_total / total_length * 100, 2)
total_missing = pd.concat([null_count_total, null_percent_total], axis=1)
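# Alternative sketch: the per-website emptiness table can also be built in a
# single dask pass with a groupby, instead of one .compute() per website
# (a suggested variant, not from the source; column names follow the code above):
# nulls = annonces.isnull().assign(SITE_ANNONCE=annonces['SITE_ANNONCE'])
# with ProgressBar():
#     null_counts = nulls.groupby('SITE_ANNONCE').sum().compute()
#     site_counts = annonces.groupby('SITE_ANNONCE').size().compute()
# pct_missing_per_website = round(null_counts.div(site_counts, axis=0) * 100, 2).T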
def main(argv=sys.argv[1:]): global LOG import satpy from satpy import Scene from satpy.writers import compute_writer_results from dask.diagnostics import ProgressBar from polar2grid.core.script_utils import (setup_logging, rename_log_file, create_exc_handler) import argparse dist = pkg_resources.get_distribution('polar2grid') if dist_is_editable(dist): p2g_etc = os.path.join(dist.module_path, 'etc') else: p2g_etc = os.path.join(sys.prefix, 'etc', 'polar2grid') config_path = satpy.config.get('config_path') if p2g_etc not in config_path: satpy.config.set(config_path=config_path + [p2g_etc]) USE_POLAR2GRID_DEFAULTS = bool( int(os.environ.setdefault("USE_POLAR2GRID_DEFAULTS", "1"))) prog = os.getenv('PROG_NAME', sys.argv[0]) # "usage: " will be printed at the top of this: usage = """ %(prog)s -h see available products: %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...] basic processing: %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...] basic processing with limited products: %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...] """ parser = argparse.ArgumentParser( prog=prog, usage=usage, fromfile_prefix_chars="@", description="Load, composite, resample, and save datasets.") parser.add_argument( '-v', '--verbose', dest='verbosity', action="count", default=0, help='each occurrence increases verbosity 1 level through ' 'ERROR-WARNING-INFO-DEBUG (default INFO)') parser.add_argument('-l', '--log', dest="log_fn", default=None, help="specify the log filename") parser.add_argument( '--progress', action='store_true', help="show processing progress bar (not recommended for logged output)" ) parser.add_argument( '--num-workers', type=int, default=os.getenv('DASK_NUM_WORKERS', 4), help="specify number of worker threads to use (default: 4)") parser.add_argument( '--match-resolution', dest='preserve_resolution', action='store_false', help="When using the 'native' resampler for composites, don't save data " "at its native resolution, use the resolution used to create the " "composite.") parser.add_argument("--list-products", dest="list_products", action="store_true", help="List available reader products and exit") reader_group = add_scene_argument_groups( parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0] resampling_group = add_resample_argument_groups( parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0] writer_group = add_writer_argument_groups(parser)[0] subgroups = [reader_group, resampling_group, writer_group] argv_without_help = [x for x in argv if x not in ["-h", "--help"]] _retitle_optional_arguments(parser) args, remaining_args = parser.parse_known_args(argv_without_help) os.environ['DASK_NUM_WORKERS'] = str(args.num_workers) # get the logger if we know the readers and writers that will be used if args.readers is not None and args.writers is not None: glue_name = args.readers[0] + "_" + "-".join(args.writers or []) LOG = logging.getLogger(glue_name) # add writer arguments for writer in (args.writers or []): parser_func = WRITER_PARSER_FUNCTIONS.get(writer) if parser_func is None: continue subgroups += parser_func(parser) args = parser.parse_args(argv) if args.readers is None: parser.print_usage() parser.exit( 1, "\nERROR: Reader must be provided (-r flag).\n" "Supported readers:\n\t{}\n".format('\n\t'.join( ['abi_l1b', 'ahi_hsd', 'hrit_ahi']))) elif len(args.readers) > 1: parser.print_usage() parser.exit( 1, "\nMultiple readers is not currently supported. 
Got:\n\t" "{}\n".format('\n\t'.join(args.readers))) return -1 if args.writers is None: parser.print_usage() parser.exit( 1, "\nERROR: Writer must be provided (-w flag) with one or more writer.\n" "Supported writers:\n\t{}\n".format('\n\t'.join(['geotiff']))) def _args_to_dict(group_actions, exclude=None): if exclude is None: exclude = [] return { ga.dest: getattr(args, ga.dest) for ga in group_actions if hasattr(args, ga.dest) and ga.dest not in exclude } reader_args = _args_to_dict(reader_group._group_actions) reader_names = reader_args.pop('readers') scene_creation = { 'filenames': reader_args.pop('filenames'), 'reader': reader_names[0], } load_args = { 'products': reader_args.pop('products'), } # anything left in 'reader_args' is a reader-specific kwarg resample_args = _args_to_dict(resampling_group._group_actions) writer_args = _args_to_dict(writer_group._group_actions) # writer_args = {} subgroup_idx = 3 for idx, writer in enumerate(writer_args['writers']): sgrp1, sgrp2 = subgroups[subgroup_idx + idx * 2:subgroup_idx + 2 + idx * 2] wargs = _args_to_dict(sgrp1._group_actions) if sgrp2 is not None: wargs.update(_args_to_dict(sgrp2._group_actions)) writer_args[writer] = wargs # get default output filename if 'filename' in wargs and wargs['filename'] is None: wargs['filename'] = get_default_output_filename( args.readers[0], writer) if not args.filenames: parser.print_usage() parser.exit(1, "\nERROR: No data files provided (-f flag)\n") # Prepare logging rename_log = False if args.log_fn is None: rename_log = True args.log_fn = glue_name + "_fail.log" levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] setup_logging(console_level=levels[min(3, args.verbosity)], log_filename=args.log_fn) logging.getLogger('rasterio').setLevel(levels[min(2, args.verbosity)]) sys.excepthook = create_exc_handler(LOG.name) if levels[min(3, args.verbosity)] > logging.DEBUG: import warnings warnings.filterwarnings("ignore") LOG.debug("Starting script with arguments: %s", " ".join(sys.argv)) # Set up dask and the number of workers if args.num_workers: dask.config.set(num_workers=args.num_workers) # Parse provided files and search for files if provided directories scene_creation['filenames'] = get_input_files(scene_creation['filenames']) # Create a Scene, analyze the provided files LOG.info("Sorting and reading input files...") try: scn = Scene(**scene_creation) except ValueError as e: LOG.error( "{} | Enable debug message (-vvv) or see log file for details.". format(str(e))) LOG.debug("Further error information: ", exc_info=True) return -1 except OSError: LOG.error( "Could not open files. Enable debug message (-vvv) or see log file for details." 
) LOG.debug("Further error information: ", exc_info=True) return -1 if args.list_products: print("\n".join(sorted(scn.available_dataset_names(composites=True)))) return 0 # Rename the log file if rename_log: rename_log_file(glue_name + scn.attrs['start_time'].strftime("_%Y%m%d_%H%M%S.log")) # Load the actual data arrays and metadata (lazy loaded as dask arrays) LOG.info("Loading product metadata from files...") load_args['products'] = _apply_default_products_and_aliases( scn, scene_creation['reader'], load_args['products']) if not load_args['products']: return -1 scn.load(load_args['products']) ll_bbox = resample_args.pop('ll_bbox') if ll_bbox: scn = scn.crop(ll_bbox=ll_bbox) scn = filter_scene( scn, reader_names, sza_threshold=reader_args['sza_threshold'], day_fraction=reader_args['filter_day_products'], night_fraction=reader_args['filter_night_products'], ) if scn is None: LOG.info("No remaining products after filtering.") return 0 to_save = [] areas_to_resample = resample_args.pop("grids") if 'ewa_persist' in resample_args: resample_args['persist'] = resample_args.pop('ewa_persist') scenes_to_save = resample_scene( scn, areas_to_resample, preserve_resolution=args.preserve_resolution, is_polar2grid=USE_POLAR2GRID_DEFAULTS, **resample_args) for scene_to_save, products_to_save in scenes_to_save: overwrite_platform_name_with_aliases(scene_to_save) to_save = write_scene(scene_to_save, writer_args['writers'], writer_args, products_to_save, to_save=to_save) if args.progress: pbar = ProgressBar() pbar.register() LOG.info("Computing products and saving data to writers...") compute_writer_results(to_save) LOG.info("SUCCESS") return 0
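# Hedged example invocations of this newer glue script (file paths and the
# reader/writer names are illustrative only):
#
#   DASK_NUM_WORKERS=8 python glue.py -r abi_l1b -w geotiff --num-workers 8 --progress -f /data/goes16/
#
# Because the parser is built with fromfile_prefix_chars="@", a long argument
# list can also be kept in a text file, one argument per line, and passed as:
#
#   python glue.py @abi_geotiff_args.txt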