def doubleMAD(vector, threshold=3.5):
    '''
    Flag outliers in a 1D vector with the double-MAD Modified Z-Score (MZS).

    The median absolute deviation (MAD) is computed separately for the
    points at or below the median and for the points at or above it, so
    asymmetric distributions get a different scale on each side.

    Only works with 1D arrays (vectors) but can be iterated over for
    multiple distributions.

    Parameters
    ----------
    vector : np.ndarray or np.ma.MaskedArray
        1D data vector.
    threshold : float
        MZS value above which a point is reported as an outlier.

    Returns
    -------
    Boolean array of the same length as `vector`; True implies an
    outlying data point.

    Raises
    ------
    DimensionError
        If `vector` is not 1 dimensional.
    '''
    # Compare by value: `is not 1` tests object identity, not equality.
    if vector.ndim != 1:
        raise DimensionError("Input must be a 1D vector.")

    # Calculate the overall median (allows for masked vectors)
    m = np.ma.median(vector)

    # Calculate the absolute deviation from the true median
    absDev = np.abs(vector - m)

    # Calculate the median absolute deviation for both the left and right
    # splits (points equal to the median contribute to both sides)
    leftMAD = np.ma.median(absDev[vector <= m])
    rightMAD = np.ma.median(absDev[vector >= m])

    # Per-element scale: left MAD by default, right MAD above the median
    vectorMAD = leftMAD * np.ones(len(vector))
    vectorMAD[vector > m] = rightMAD

    # Calculate the modified Z score. A MAD of zero yields inf/nan here;
    # the resulting values are kept (inf compares as an outlier), only the
    # floating point warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        MZS = 0.6745 * absDev / vectorMAD

    # If the value of the vector equals the median, set the MZS to 0
    # (this also clears any 0/0 produced above)
    MZS[vector == m] = 0

    # Return true if the MZS is greater than the threshold
    return MZS > threshold
def greyscale(array, ax=None, cbar=False, mask=None, show=True, filename=None,
              setnan=0.0, **kwargs):
    """
    Basic imshow of a 2D array.

    Parameters
    ----------
    array : np.ndarray
        2D data array to display.
    ax : optional
        Forwarded to `imshow` and returned unchanged.
    cbar : bool
        Add a colorbar to the current figure.
    mask : array-like, optional
        If given, the array is wrapped in a masked array with this mask
        before being displayed.
    show : bool
        Show the plot; when False the current figure is closed instead.
    filename : str, optional
        If not None, the figure is saved to this path.
    setnan : float
        Not used by this function.
    **kwargs
        Forwarded to `imshow`.

    Returns
    -------
    The `ax` argument as passed in.

    Raises
    ------
    DimensionError
        If `array` is not 2 dimensional.
    """
    # Compare by value: `is 2` tests object identity, not equality.
    if array.ndim != 2:
        raise DimensionError(
            "Invalid dimensions. Required: 2. (Actual: {})".format(array.ndim))

    if mask is not None:
        imshow(np.ma.masked_array(array, mask=mask), ax=ax, **kwargs)
    else:
        imshow(array, ax=ax, **kwargs)

    if cbar:
        plt.colorbar()
    if filename is not None:
        plt.savefig(filename)
    if show:
        plt.show()
    else:
        plt.close()

    return ax
def rmsMatrix2D(array, mask=None, nanmask=False):
    '''
    Creates an array of RMS values given a 3D array of data with the first
    two dimensions forming the output matrix and the 3rd dimension
    containing the independent axis.

    Parameters
    ----------
    array : np.ndarray
        3D data array.
    mask : np.ndarray, optional
        1D array with the same length as the 3rd axis of `array`; only
        samples where the mask equals 0 enter the RMS.
    nanmask : bool
        If True, return a masked array with the NaN entries masked.

    Returns
    -------
    2D array of RMS values. Entries whose whole 3rd-axis slice is zero are
    set to NaN.

    Raises
    ------
    DimensionError
        If `array` is not 3 dimensional or `mask` is not 1 dimensional.
    TypeError
        If `mask` is given but is not a numpy array.
    ValueError
        If `mask` and the 3rd axis of `array` differ in length.
    '''
    # Compare by value: `is not 3` tests object identity, not equality.
    if array.ndim != 3:
        raise DimensionError("Input array must be 3 dimensional.")

    width, height, depth = array.shape

    if mask is not None:
        if not isinstance(mask, np.ndarray):
            raise TypeError("Mask must be an array.")
        elif mask.ndim != 1:
            raise DimensionError("Mask must be an array of dimension 1.")
        elif depth != len(mask):
            raise ValueError(
                "Independent dimension and mask must have same length.")

    # Initialize RMS table of zeros
    r = np.zeros((width, height), dtype=float)

    # Initialize the mask array along the 3rd axis
    if mask is None:
        m = np.zeros(depth, dtype=int)
    else:
        m = mask

    # The selection along the 3rd axis is the same for every (i, j)
    keep = (m == 0)

    # Loop over the other two dimensions
    for i in range(width):
        for j in range(height):
            # Calculate the RMS of the array everywhere the mask is 0
            r[i][j] = rootMeanSquare(array[i][j][keep])
            # An all-zero slice carries no data: flag it with NaN
            if all(amp == 0 for amp in array[i][j]):
                r[i][j] = np.nan

    # Mask the nan values in the array for potential plotting if needed
    if nanmask:
        r = np.ma.array(r, mask=np.isnan(r))

    # Returns the RMS matrix
    return r
def get_dataset_from_name(ds_name: str,
                          ds_path: str = None,
                          split: str = 'train',
                          seed=42):
    """Load a supported dataset by name and report its basic properties.

    The 'imagenet' name is delegated to `DataGenerator.get_imagenet`; any
    other supported name is loaded through `tfds.load` in supervised
    `(image, label)` form.

    Args:
        ds_name: Dataset name; must be a member of `AVAILABLE_DATASET`.
        ds_path: Optional data directory. When given, it must exist.
        split: Name of the split to load.
        seed: Not used by this function.

    Returns:
        Tuple `(ds, img_shape, num_examples, num_classes)`.

    Raises:
        NameError: If `ds_path` does not exist or `ds_name` is unsupported.
        UnsupportedFormat: If the loaded object is not a `tf.data.Dataset`
            or the dataset is not of the `(features, labels)` kind.
        DimensionError: If the first image does not have exactly 3 dimensions.
    """
    path_is_missing = ds_path is not None and not os.path.exists(ds_path)
    if path_is_missing:
        DataGenerator.logger.error(f'{ds_path} does not exist')
        raise NameError(f'Directory {ds_path} does not exist')

    if ds_name not in AVAILABLE_DATASET:
        DataGenerator.logger.info(
            f'Dataset name {ds_name} is not supported. Supported dataset: {list(AVAILABLE_DATASET)}'
        )
        raise NameError(
            f'Dataset name {ds_name} is not supported. Supported dataset: {list(AVAILABLE_DATASET)}'
        )

    # ImageNet is read from TFRecord files by a dedicated loader.
    if ds_name == 'imagenet':
        return DataGenerator.get_imagenet(ds_path, split)

    # as_supervised=True yields `(img, label)` tuples instead of dicts.
    ds, info_ds = tfds.load(ds_name,
                            data_dir=ds_path,
                            split=split,
                            as_supervised=True,
                            with_info=True)

    if not isinstance(ds, tf.data.Dataset):
        raise UnsupportedFormat(
            f'Type of ds is not the one expected (tf.data.Dataset) {type(ds)}'
        )

    num_examples = info_ds.splits[split].num_examples

    # Peek at the first element to discover the image shape.
    first_elem = iter(ds).get_next()
    if len(first_elem[0].shape) != 3:
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h,w,c) but it has {len(first_elem[0].shape)}'
        )
    img_shape = first_elem[0].shape

    if len(info_ds.supervised_keys) != 2:
        raise UnsupportedFormat(
            f'This function only handle datasets like (features, labels) not {info_ds.supervised_keys}'
        )
    label_key = info_ds.supervised_keys[1]
    num_classes = info_ds.features[label_key].num_classes

    return ds, img_shape, num_examples, num_classes
def waterfall(array, ax=None, offset=None, border=0, labels=True, bins=None,
              show=True, **kwargs):
    """
    Waterfall plot of an array. Requires an array of 2 dimensions.

    Each row of `array` is drawn as a line, shifted vertically by
    `offset` times its row index.

    Parameters
    ----------
    array : np.ndarray
        2D data array; one line is drawn per row.
    ax : optional
        Ignored: a new figure and axes are always created and the `ax`
        name is rebound to the new axes.
    offset : float, optional
        Vertical shift between consecutive rows. Defaults to the maximum
        of the column-wise average of `array`.
    border : float
        Fraction of the data range added as padding on every side.
    labels : bool
        If False, tick labels are removed from both axes.
    bins : array-like, optional
        Column indices to plot. Defaults to every column.
    show : bool
        Show the plot; when False the current figure is closed instead.
    **kwargs
        Forwarded to `fig.add_subplot`.

    Returns
    -------
    The axes the data was drawn on.

    Raises
    ------
    DimensionError
        If `array` is not 2 dimensional.
    """
    # Compare by value: `is 2` tests object identity, not equality.
    if array.ndim != 2:
        raise DimensionError(
            "Invalid dimensions. Required: 2. (Actual: {})".format(array.ndim))

    if offset is None:
        offset = np.max(np.average(array, axis=0))

    fig = plt.figure(figsize=(6, 6))
    bgcolor = 'w'
    ax = fig.add_subplot(111, facecolor=bgcolor, **kwargs)
    color = 'k'

    if bins is None:
        bins = np.arange(array.shape[1])

    # Data extent: one offset of head-room below the first row and above
    # the last one
    x_min = 0
    x_max = len(bins) - 1
    y_min = 0 - offset
    y_max = (1 + len(array)) * offset

    # Pad the extent by the requested border fraction
    x_low = x_min - (x_max - x_min) * border
    x_high = (x_max - x_min) * border + x_max
    y_low = y_min - (y_max - y_min) * border
    y_high = (y_max - y_min) * border + y_max

    for i in range(len(array)):
        ax.plot(array[i][bins] + offset * i, color)

    ax.set_xlim(x_low, x_high)
    ax.set_ylim(y_low, y_high)

    if not labels:
        ax.set_xticklabels([])
        ax.set_yticklabels([])

    if show:
        plt.show()
    else:
        plt.close()

    return ax
def get_dataset_from_directory(ds_path: str, split: str, seed=42):
    """Build a supervised dataset from an image folder on disk.

    Args:
        ds_path: Root directory handed to `tfds.folder_dataset.ImageFolder`.
            It must exist.
        split: Name of the split to build.
        seed: Not used by this function.

    Returns:
        Tuple `(ds, img_shape, num_examples, num_classes)`. When more than
        one popdist instance is running, `ds` is sharded per instance.

    Raises:
        NameError: If `ds_path` does not exist.
        UnsupportedFormat: If the built object is not a `tf.data.Dataset`
            or the dataset is not of the `(features, labels)` kind.
        DimensionError: If the first image does not have exactly 3 dimensions.
    """
    if not os.path.exists(ds_path):
        DataGenerator.logger.error(f'{ds_path} does not exist')
        raise NameError(f'Directory {ds_path} does not exist')

    folder_builder = tfds.folder_dataset.ImageFolder(ds_path)
    info_ds = folder_builder.info
    ds = folder_builder.as_dataset(as_supervised=True, split=split)

    if not isinstance(ds, tf.data.Dataset):
        raise UnsupportedFormat(
            f'Type of ds is not the one expected (tf.data.Dataset) {type(ds)}'
        )

    num_examples = DataGenerator.evaluate_size_dataset(ds)

    # Peek at the first element to discover the image shape.
    first_elem = iter(ds).get_next()
    if len(first_elem[0].shape) != 3:
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h,w,c) but it has {len(first_elem[0].shape)}'
        )
    img_shape = first_elem[0].shape

    if len(info_ds.supervised_keys) != 2:
        raise UnsupportedFormat(
            f'This function only handle datasets like (features, labels) not {info_ds.supervised_keys}'
        )
    label_key = info_ds.supervised_keys[1]
    num_classes = info_ds.features[label_key].num_classes

    print(
        f'img shape {img_shape} number of examples {num_examples} number of classes {num_classes}'
    )

    # Give every instance its own slice of the data.
    if popdist.getNumInstances() > 1:
        ds = ds.shard(num_shards=popdist.getNumInstances(),
                      index=popdist.getInstanceIndex())

    return ds, img_shape, num_examples, num_classes
def get_1D_OPW_mask(vector, **kwargs):
    '''
    Build a boolean mask from the `opw` attribute of a SinglePulse.

    Parameters
    ----------
    vector : np.ndarray
        1D data vector.
    **kwargs
        Forwarded to the SinglePulse constructor.

    Returns
    -------
    Boolean array of the same length as `vector`: False at every index
    contained in `SinglePulse(vector, **kwargs).opw`, True elsewhere.

    Raises
    ------
    DimensionError
        If `vector` is not 1 dimensional.
    '''
    # Compare by value: `is not 1` tests object identity, not equality.
    if vector.ndim != 1:
        raise DimensionError(
            "Input data must be 1 dimensional to create an OPW mask.")

    sp_dat = SinglePulse(vector, **kwargs)

    # Vectorised membership test replaces the per-index `in` lookup
    indices = np.arange(len(vector))
    return np.logical_not(np.isin(indices, sp_dat.opw))
def histogram_and_curves(array, mean=0.0, std_dev=1.0, bins=None, x_lims=None,
                         y_lims=None, x_axis='X', y_axis='Y', title='Title',
                         show=True, filename=None, curve_list=None,
                         labels=None, **kwargs):
    """
    Histogram plotter for 1 or 2D data. Can compare PDFs in 1D

    Parameters
    ----------
    array : np.ndarray
        1 or 2D data array. Non-array input is converted with np.array.
        2D input must have exactly 2 rows (x values, y values).
    mean : int, float, [int, int], [float, float]
        Calculated mean of data
    std_dev : int, float, [int, int], [float, float]
        Calculated standard deviation of data
    bins : int
        Number of bins in histogram. If None, bin edges are derived from
        the data range.
    x_lims, y_lims : [int, int], [float, float]
        x and y limits of the plot. Default to mean +/- 4 standard
        deviations.
    x_axis, y_axis, title : str
        x, y and title names
    show : bool
        Show plots (default is True)
    filename : str
        Name of the file to save to (if None, the plot will not be saved)
    curve_list : list of callables
        List of curves to fit to the data as defined by the user
        (1D data only)
    labels : [str, str, ...]
        List of legend labels for the curve list. Labels are only applied
        when there is exactly one per curve.
    **kwargs
        kwargs passed to ax.hist() (1D) or ax.hist2d() (2D)

    Returns
    -------
    matplotlib Axes : ax

    Raises
    ------
    DimensionError
        If the array is 2D without exactly 2 rows, or is neither 1D nor 2D.
    """
    color = 'k'
    bgcolor = 'w'
    style = 'stepfilled'

    # Set up figure and axes
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111, facecolor=bgcolor)
    ax.set_xlabel(x_axis)
    ax.set_ylabel(y_axis)
    ax.set_title(title)

    # Convert any lists or dicts to numpy arrays
    if not isinstance(array, np.ndarray):
        array = np.array(array)

    # Dimensions are compared by value: `is 1` / `is 2` test object
    # identity, not equality.
    if array.ndim == 1:

        # Number of histogram bins to use (if not supplied by user)
        if bins is None:
            low = math.floor(np.amin(array))
            high = math.ceil(np.amax(array))
            step = (high - low) / (20 * abs(high))
            bins = np.arange(low, high, step)

        if not x_lims:
            x_min = mean - (4 * std_dev)
            x_max = mean + (4 * std_dev)
        else:
            x_min, x_max = x_lims

        # Linespace for curve plotting
        t = np.arange(x_min, x_max, 0.01)

        # Plot the 1D histogram
        n, bins, _ = ax.hist(array, bins=bins, density=True, color=color,
                             histtype=style, linewidth=2, **kwargs)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(0, 1.2 * np.amax(n))

        # Plot distribution curves
        if curve_list:
            # Labels are only meaningful with one label per curve
            use_labels = bool(labels) and len(labels) == len(curve_list)

            for i, curve in enumerate(curve_list):

                # Selects color from list
                color = color_list[i % len(color_list)]

                # Find the number of fitting arguments in the desired curve
                p0_len = u.get_unique_fitting_parameter_length(curve)

                if not p0_len:
                    p0 = [mean, std_dev]
                else:
                    p0 = np.ones(p0_len)

                # Try fitting and plotting. If fit doesn't work, just plot
                # the histogram
                try:
                    try:
                        params = opt.curve_fit(curve, bins[1:], n, p0=p0)
                    except RuntimeError:
                        # No convergence: skip this curve entirely
                        continue

                    line, = ax.plot(t, curve(t, *params[0]), color=color,
                                    linewidth=2)

                except TypeError:
                    # Curve takes no fit parameters: plot it as given
                    line, = ax.plot(t, curve(t), color=color, linewidth=2)

                # Label the line that was just drawn
                if use_labels:
                    line.set_label(labels[i])

            if labels:
                ax.legend()

        plt.grid(True)

        # Save file
        if filename is not None:
            plt.savefig(filename)

        if show:
            plt.show()

    elif array.ndim == 2 and array.shape[0] == 2:

        # Basically determines whether mean given was [float, float] or float
        if not x_lims:
            try:
                x_min = mean[0] - (4 * std_dev[0])
                x_max = mean[0] + (4 * std_dev[0])
            except TypeError:
                x_min = mean - (4 * std_dev)
                x_max = mean + (4 * std_dev)
        else:
            x_min, x_max = x_lims

        if not y_lims:
            try:
                y_min = mean[1] - (4 * std_dev[1])
                y_max = mean[1] + (4 * std_dev[1])
            except TypeError:
                # Scalar mean/std_dev: reuse the x limits
                y_min = x_min
                y_max = x_max
        else:
            y_min, y_max = y_lims

        # Initialize bin size if not parsed in
        if bins is None:
            step = (math.ceil(np.amax(array)) - math.floor(np.amin(array))) / (
                100 * abs(math.ceil(np.amax(array))))
            bins = [
                np.arange(math.floor(np.amin(array[0])),
                          math.ceil(np.amax(array[0])), step),
                np.arange(math.floor(np.amin(array[1])),
                          math.ceil(np.amax(array[1])), step)
            ]

        # Plot the 2D histogram
        ax.hist2d(array[0], array[1], bins=bins, **kwargs)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        if filename is not None:
            plt.savefig(filename)

        if show:
            plt.show()
        else:
            plt.close()

    elif array.ndim == 2:
        raise DimensionError(
            "Invalid array shape. Number of rows required: 2. (Actual: {})".
            format(array.shape[0]))
    else:
        raise DimensionError(
            "Invalid dimensions. Required: 2. (Actual: {})".format(array.ndim))

    return ax
def get_imagenet(path: str,
                 split: str,
                 cycle_length: int = 4,
                 block_length: int = 4):
    """Build the ImageNet dataset from a directory of TFRecord files.

    Args:
        path: Directory holding the TFRecord files. It must exist.
        split: 'train' selects the `train*` files (1024 expected); any
            other value selects the `validation*` files (128 expected).
        cycle_length: Passed to `Dataset.interleave` as both
            `cycle_length` and `num_parallel_calls`.
        block_length: Passed to `Dataset.interleave`.

    Returns:
        Tuple `(ds, feature_shape, num_examples, num_classes)`. `ds` is
        the cached dataset of raw records; `feature_shape` is the shape of
        the first record after parsing.

    Raises:
        NameError: If `path` does not exist.
        ValueError: If the number of record files is not the expected one.
        DimensionError: If the first parsed feature does not have exactly
            3 dimensions.
    """
    # The path is the one of dataset under TFRecord format
    if not os.path.exists(path):
        DataGenerator.logger.error(f'{path} does not exist')
        raise NameError(f'Directory {path} does not exist')

    # NOTE(review): glob.glob1 is an undocumented helper of the glob
    # module - confirm it exists on the Python versions this must run on.
    if split == 'train':
        filenames = glob.glob1(path, 'train*')
        if len(filenames) != 1024:
            DataGenerator.logger.error(
                f'train directory should contain 1024 tf-record files but it contains {len(filenames)} instead'
            )
            raise ValueError(
                f'train directory should contain 1024 files but it contains {len(filenames)} instead'
            )
    else:
        filenames = glob.glob1(path, 'validation*')
        if len(filenames) != 128:
            DataGenerator.logger.error(
                f'validation directory should contain 128 tf-record files but it contains {len(filenames)} instead'
            )
            raise ValueError(
                f'validation directory should contain 128 tf-record files but it contains {len(filenames)} instead'
            )

    num_files = len(filenames)

    filenames = [os.path.join(path, filename) for filename in filenames]
    DataGenerator.logger.debug(f'filenames = {filenames}')
    ds = tf.data.Dataset.from_tensor_slices(filenames)

    if split == 'train':
        # Shuffle the input files
        ds = ds.shuffle(buffer_size=num_files)

    # Give every instance its own subset of the record files
    if popdist.getNumInstances() > 1:
        ds = ds.shard(num_shards=popdist.getNumInstances(),
                      index=popdist.getInstanceIndex())

    ds = ds.interleave(tf.data.TFRecordDataset,
                       cycle_length=cycle_length,
                       block_length=block_length,
                       num_parallel_calls=cycle_length)

    DataGenerator.logger.info(f'dataset = {ds}')

    num_examples = IMAGENET_DS_SIZE[split]

    DataGenerator.logger.info(f'number of examples {num_examples}')

    # Parse the first record to discover the feature shape
    iterator = iter(ds)
    first_elem = iterator.get_next()

    feature, _ = imagenet_processing.parse_record(first_elem, True,
                                                  tf.float32)

    if len(feature.shape) != 3:
        # Report the shape that was actually checked (the parsed feature),
        # not the raw record.
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h,w,c) but it has {len(feature.shape)}'
        )

    num_classes = 1000

    ds = ds.cache()
    return ds, feature.shape, num_examples, num_classes
f'--logs-per-epoch should be non-negative (>=0), it is {logs_per_epoch}' ) # check for partial logs, example --logs-per-epoch 0.5 and --epochs 5 if (logs_per_epoch > 0) and (logs_per_epoch < 1) and ( num_epochs % (1 / logs_per_epoch) != 0): raise ValueError( f'It is not possible to log {1/logs_per_epoch} epochs a time for {num_epochs} epochs' ) num_pipeline_stages = len(pipeline_splits) + 1 if device_mapping: if len(device_mapping) != num_pipeline_stages: raise DimensionError( f'The number of device assignments {len(device_mapping)} is not equal to the number of pipeline splits + 1: {num_pipeline_stages}.' ) if len(set(device_mapping)) != max(device_mapping) + 1: raise DimensionError( f'The model is pipelined over {len(set(device_mapping))} different IPUs, but one or more stages are being assigned to IPU {max(device_mapping) + 1}' ) if eight_bit_transfer and not accelerator_side_preprocess: raise UnallowedConfigurationError( f'When eight bit transfer is enabled the normalisation must be done on the device. ' f'If you want to keep 8bit io, set --accelerator-side-preprocess to True.' ) if (eight_bit_transfer or accelerator_side_preprocess) and 'cifar' in model_name:
def get_best_gaussian_fit(x, y, remove_base=True, m_gauss=15, bp=15,
                          p_wid=150, guess=[1024, 20, 6000], plot_chisq=True,
                          save_dir=None, f_pre="", verbose=True):
    '''
    Fit an increasing number of gaussians to (x, y) with curve_fit.

    One gaussian (3 parameters) is added to the model on every pass. The
    loop stops once `m_gauss` chi-square values have been collected or
    `bp` gaussians have been tried.

    Parameters
    ----------
    x, y : np.ndarray
        Data to fit.
    remove_base : bool
        If True, `y` is first passed through removeBase(y, 0.05).
    m_gauss : int
        Maximum number of successful fits (chi-square values) to collect.
    bp : int
        Maximum number of gaussians to try.
    p_wid : int
        Subtracted from the model length to form the `windowsize` given to
        get_1D_OPW_mask.
    guess : list or np.ndarray
        Initial parameters: either 3 values reused for every gaussian, or
        an Nx3 array with one row per gaussian (row 0 is reused once the
        rows run out). The default list is never mutated.
    plot_chisq : bool
        Plot the chi-square values (from the second one on).
    save_dir : str
        If not None, results are passed to save_all.
    f_pre : str
        Passed to save_all.
    verbose : bool
        Print progress information.

    Returns
    -------
    m, c, ind_gaussians, mask : the last fitted model (zeroed where the
    mask is False), the list of chi-square values, the result of
    get_gaussians for the last parameters, and the last mask.

    Raises
    ------
    DimensionError
        If `guess` is neither 1 nor 2 dimensional.
    RuntimeError
        If not a single fit converged.
    '''
    if remove_base:
        y = removeBase(y, 0.05)

    if not isinstance(guess, np.ndarray):
        guess_shape = np.array(guess).shape
    else:
        guess_shape = guess.shape

    n_gauss = 0
    params, c = [], []

    # Defined up front so a failure of the very first fit can still extend
    # the parameter vector instead of hitting an unbound name.
    fitted_params = np.array([])
    m, mask = None, None

    while (len(c) < m_gauss) and (n_gauss < bp):

        # Add the initial guess of one more gaussian
        if len(guess_shape) == 1:
            params.extend(guess)
        elif len(guess_shape) == 2:
            try:
                params.extend(guess[n_gauss])
            except IndexError:
                params.extend(guess[0])
        else:
            raise DimensionError(
                "Initial guess parameters must be a Nx3 array. Current shape of array is: {}"
                .format(guess_shape))

        n_gauss = len(params) // 3

        try:
            fitted_params, _ = scipy.optimize.curve_fit(multi_norm, x, y,
                                                        p0=params)
            if (n_gauss == bp) and verbose:
                print("Maximum number of tries reached ({})".format(n_gauss))

        except RuntimeError:
            # No convergence: pad the previous result with the guess of
            # the new gaussian and try again with one more
            if len(guess_shape) == 1:
                fitted_params = np.append(fitted_params, guess)
            elif len(guess_shape) == 2:
                try:
                    fitted_params = np.append(fitted_params, guess[n_gauss])
                except IndexError:
                    fitted_params = np.append(fitted_params, guess[0])
            else:
                raise DimensionError(
                    "You definitely shouldn't be able to see this error message."
                )

            if verbose:
                print("No fit for {} gaussians".format(n_gauss))
                if n_gauss == bp:
                    print(
                        "Maximum number of tries reached ({})".format(n_gauss))
            continue

        m = multi_norm(x, *fitted_params)

        # Zero the model wherever the mask is False
        mask = get_1D_OPW_mask(m, windowsize=(len(m) - p_wid))
        m[np.logical_not(mask)] = 0

        chi2, _ = scipy.stats.chisquare(y[mask == 1], f_exp=m[mask == 1])
        if verbose:
            print("Chi-sq for {} gaussians: ".format(n_gauss), chi2)

        c.append(chi2)

    # Without a single converged fit there is no model to report
    if m is None:
        raise RuntimeError("No gaussian fit converged.")

    if verbose:
        print_parameters(params, fitted_params, n_gauss)

    if plot_chisq:
        plt.plot(c[1:])
        plt.show()
        plt.close()

    ind_gaussians = get_gaussians(fitted_params, n_gauss)

    if save_dir is not None:
        save_all(save_dir, f_pre, n_gauss, ind_gaussians, m)

    return m, c, ind_gaussians, mask
def configure_model(model: keras.Model,
                    gradient_accumulation_count: int,
                    pipeline_splits: list,
                    device_mapping: list,
                    pipeline_schedule: str,
                    available_memory_proportion: list,
                    optimizer_state_offloading: bool = True):
    """Apply pipelining or gradient accumulation options to a model.

    With pipeline splits, the model is pipelined through
    `ModelFactory.pipeline_model` and configured with
    `set_pipelining_options`; otherwise only
    `set_gradient_accumulation_options` is called.

    Args:
        model: Model to configure.
        gradient_accumulation_count: Gradient accumulation steps per replica.
        pipeline_splits: Pipeline split points; empty disables pipelining.
        device_mapping: Optional device index per pipeline stage.
        pipeline_schedule: Name of a `pipelining_ops.PipelineSchedule`
            member.
        available_memory_proportion: Either a single global value (not
            applied here) or two values per pipeline stage, used for the
            convolution and matmul options of that stage respectively.
            Values are divided by 100 before being passed on.
        optimizer_state_offloading: Forwarded as
            `offload_weight_update_variables`.

    Returns:
        The configured (and possibly pipelined) model.

    Raises:
        ValueError: If `pipeline_schedule` names no known schedule.
        DimensionError: If `device_mapping` or `available_memory_proportion`
            do not match the number of pipeline stages.
    """
    if not pipeline_splits:
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=
            gradient_accumulation_count,
            offload_weight_update_variables=optimizer_state_offloading)
        return model

    model = ModelFactory.pipeline_model(model, pipeline_splits)
    num_stages = len(pipeline_splits) + 1

    # Resolve the schedule by member name. An unknown name gets an explicit
    # error instead of a bare StopIteration escaping from next().
    schedule_by_name = {
        str(p).split(".")[-1]: p
        for p in list(pipelining_ops.PipelineSchedule)
    }
    if pipeline_schedule not in schedule_by_name:
        raise ValueError(
            f'Unknown pipeline schedule {pipeline_schedule}. '
            f'Supported schedules: {list(schedule_by_name)}')
    pipeline_schedule = schedule_by_name[pipeline_schedule]

    if device_mapping:
        if len(device_mapping) != num_stages:
            raise DimensionError(
                f'The number of device assignments {len(device_mapping)} is not equal to the number of pipeline splits + 1: {len(pipeline_splits) + 1}.'
            )

        # Device indices must be contiguous starting from 0
        if len(set(device_mapping)) != max(device_mapping) + 1:
            raise DimensionError(
                f'The model is pipelined over {len(set(device_mapping))} different IPUs, but one or more stages are being assigned to IPU {max(device_mapping) + 1}'
            )

    if len(available_memory_proportion) > 1:

        if len(available_memory_proportion) != 2 * num_stages:
            raise DimensionError(
                'Define a single global value of available memory proportion or two values per pipeline stage. '
                f'There are {len(pipeline_splits) + 1} pipeline stages defined and {len(available_memory_proportion)} values of '
                'available memory proportion')

        # Values come in (convolution, matmul) pairs, one pair per stage
        options = [
            pipelining_ops.PipelineStageOptions(
                convolution_options={
                    "availableMemoryProportion":
                    str(available_memory_proportion[2 * idx] / 100.0)
                },
                matmul_options={
                    "availableMemoryProportion":
                    str(available_memory_proportion[2 * idx + 1] / 100.0)
                })
            for idx in range(len(available_memory_proportion) // 2)
        ]
        kwargs = {
            'forward_propagation_stages_poplar_options': options,
            'backward_propagation_stages_poplar_options': options
        }
    else:
        kwargs = {}

    model.set_pipelining_options(
        gradient_accumulation_steps_per_replica=gradient_accumulation_count,
        pipeline_schedule=pipeline_schedule,
        device_mapping=device_mapping,
        offload_weight_update_variables=optimizer_state_offloading,
        **kwargs)

    return model