Esempio n. 1
0
def doubleMAD(vector, threshold=3.5):
    '''
    Flags outliers in a 1D vector using the double-MAD Modified Z-Score (MZS).

    The median absolute deviation (MAD) is computed separately for the points
    at or below the median and for the points at or above it. Each point is
    then scaled by the MAD of its own side, so skewed distributions are not
    judged against a single symmetric spread.

    Only works with 1D arrays (vectors) but can be iterated over for multiple
    distributions.

    Parameters
    ----------
    vector    : np.ndarray
        1D data vector. The medians are taken with np.ma.median, so masked
        vectors are allowed.
    threshold : int, float
        MZS value above which a point is reported as an outlier.

    Returns
    -------
    Boolean array with one entry per element of `vector`; True implies an
    outlying data point.

    Raises
    ------
    DimensionError
        If `vector` is not one dimensional.
    '''

    # Compare by value: `is not 1` tests object identity, which only works by
    # accident of integer caching and is a SyntaxWarning on modern Python.
    if vector.ndim != 1:
        raise DimensionError("Input must be a 1D vector.")

    # Calculate the overall median (allows for masked vectors)
    m = np.ma.median(vector)

    # Calculate the absolute deviation from the true median
    absDev = np.abs(vector - m)

    # Calculate the median absolute deviation for both the left and right splits.
    # Points equal to the median contribute to both sides.
    leftMAD = np.ma.median(absDev[vector <= m])
    rightMAD = np.ma.median(absDev[vector >= m])

    # Per-element MAD: left value by default, right value above the median
    vectorMAD = leftMAD * np.ones(len(vector))
    vectorMAD[vector > m] = rightMAD

    # Calculate the modified Z score.
    # NOTE: if one side's MAD is 0 this divides by zero for that side.
    MZS = 0.6745 * absDev / vectorMAD

    # If the value of the vector equals the median, set the MZS to 0
    # (this also overwrites any 0/0 result produced at the median itself)
    MZS[vector == m] = 0

    # Return true if the MZS is greater than the threshold
    return MZS > threshold
Esempio n. 2
0
def greyscale(array,
              ax=None,
              cbar=False,
              mask=None,
              show=True,
              filename=None,
              setnan=0.0,
              **kwargs):
    """
    Basic imshow of a 2D array.

    Parameters
    ----------
    array    : np.ndarray
        2D data array to display
    ax       : object
        Forwarded to imshow as `ax` and returned unchanged
    cbar     : bool
        Add a colorbar to the current figure
    mask     : array-like
        If given, the data is wrapped in a masked array using this mask
    show     : bool
        Show the plot; when False the current figure is closed instead
    filename : str
        Name of the file to save to (if None, the plot will not be saved)
    setnan   : float
        Accepted for interface compatibility; not used by this function
    **kwargs
        kwargs passed to imshow()

    Returns
    -------
    The `ax` argument exactly as it was passed in.

    Raises
    ------
    DimensionError
        If `array` is not two dimensional.
    """

    # Compare by value; `ndim is 2` relied on integer identity.
    if array.ndim != 2:
        raise DimensionError(
            "Invalid dimensions. Required: 2. (Actual: {})".format(array.ndim))

    if mask is not None:
        array = np.ma.masked_array(array, mask=mask)
    imshow(array, ax=ax, **kwargs)

    if cbar:
        plt.colorbar()
    if filename is not None:
        plt.savefig(filename)

    if show:
        plt.show()
    else:
        plt.close()

    return ax
Esempio n. 3
0
def rmsMatrix2D(array, mask=None, nanmask=False):
    '''
    Creates an array of RMS values given a 3D array of data with the first two
    dimensions forming the output matrix and the 3rd dimension containing the
    independent axis.

    Parameters
    ----------
    array   : np.ndarray
        3D data array
    mask    : np.ndarray
        Optional 1D array with one entry per sample of the 3rd axis. Only
        samples where the mask equals 0 enter the RMS.
    nanmask : bool
        If True, return a masked array with the NaN entries masked

    Returns
    -------
    2D array (masked array if `nanmask`) of RMS values. An entry is NaN when
    every sample of the corresponding series is 0; that test looks at the
    whole series, not only the samples selected by `mask`.

    Raises
    ------
    DimensionError
        If `array` is not 3D or `mask` is not 1D.
    TypeError
        If `mask` is given but is not a numpy array.
    ValueError
        If `mask` does not have the length of the 3rd axis.
    '''

    # Value comparisons: `is not 3` / `is not 1` tested integer identity.
    if array.ndim != 3:
        raise DimensionError("Input array must be 3 dimensional.")

    width, height, depth = array.shape

    if mask is not None:
        if not isinstance(mask, np.ndarray):
            raise TypeError("Mask must be an array.")
        elif mask.ndim != 1:
            raise DimensionError("Mask must be an array of dimension 1.")
        elif depth != len(mask):
            raise ValueError(
                "Independent dimension and mask must have same length.")

    # Initialize RMS table of zeros
    r = np.zeros((width, height), dtype=float)

    # Without a mask every sample along the 3rd axis is used
    m = np.zeros(depth, dtype=int) if mask is None else mask

    # The selection is the same for every (i, j), so build it once
    keep = (m == 0)

    # Loop over the other two dimensions
    for i in range(width):
        for j in range(height):
            series = array[i][j]

            # Calculate the RMS of the array everywhere the mask is 0
            r[i][j] = rootMeanSquare(series[keep])

            # An all-zero series carries no data: flag it rather than report 0
            if all(amp == 0 for amp in series):
                r[i][j] = np.nan

    # Mask the nan values in the array for potential plotting if needed
    if nanmask:
        r = np.ma.array(r, mask=np.isnan(r))

    # Returns the RMS matrix
    return r
Esempio n. 4
0
    def get_dataset_from_name(ds_name: str,
                              ds_path: str = None,
                              split: str = 'train',
                              seed=42):
        """Load a supported dataset by name and report its basic properties.

        'imagenet' is delegated to DataGenerator.get_imagenet; every other
        name is loaded through tfds.load in supervised (img, label) form.

        Args:
            ds_name: Dataset name; must be a member of AVAILABLE_DATASET.
            ds_path: Optional data directory. When given it must exist.
            split: Name of the split to load.
            seed: Not used by this function.

        Returns:
            Tuple (dataset, image shape, number of examples, number of
            classes). For 'imagenet', whatever get_imagenet returns.

        Raises:
            NameError: If ds_path does not exist or ds_name is unsupported.
            UnsupportedFormat: If the loaded object is not a tf.data.Dataset
                or the dataset is not of the (features, labels) kind.
            DimensionError: If the first input feature is not rank 3.
        """
        path_is_missing = ds_path is not None and not os.path.exists(ds_path)
        if path_is_missing:
            DataGenerator.logger.error(f'{ds_path} does not exist')
            raise NameError(f'Directory {ds_path} does not exist')

        if ds_name not in AVAILABLE_DATASET:
            unsupported_msg = f'Dataset name {ds_name} is not supported. Supported dataset: {list(AVAILABLE_DATASET)}'
            DataGenerator.logger.info(unsupported_msg)
            raise NameError(unsupported_msg)

        # ImageNet has a dedicated loader
        if ds_name == 'imagenet':
            return DataGenerator.get_imagenet(ds_path, split)

        # as_supervised=True yields `(img, label)` tuples instead of dicts
        load_options = dict(data_dir=ds_path,
                            split=split,
                            as_supervised=True,
                            with_info=True)
        ds, info_ds = tfds.load(ds_name, **load_options)

        if not isinstance(ds, tf.data.Dataset):
            raise UnsupportedFormat(
                f'Type of ds is not the one expected (tf.data.Dataset) {type(ds)}'
            )

        num_examples = info_ds.splits[split].num_examples

        # Peek at one element to learn the image shape
        first_elem = iter(ds).get_next()
        img_shape = first_elem[0].shape

        if len(img_shape) != 3:
            raise DimensionError(
                f'Dataset input feature should have at least 3 dimensions (h,w,c) but it has {len(first_elem[0].shape)}'
            )

        if len(info_ds.supervised_keys) != 2:
            raise UnsupportedFormat(
                f'This function only handle datasets like (features, labels) not {info_ds.supervised_keys}'
            )

        label = info_ds.supervised_keys[1]
        num_classes = info_ds.features[label].num_classes

        return ds, img_shape, num_examples, num_classes
Esempio n. 5
0
def waterfall(array,
              ax=None,
              offset=None,
              border=0,
              labels=True,
              bins=None,
              show=True,
              **kwargs):
    """
    Waterfall plot of an array. Requires an array of 2 dimensions.

    Each row of `array` is drawn as one line, with row `i` shifted upwards by
    `i * offset`.

    Parameters
    ----------
    array  : np.ndarray
        2D data array; one plotted line per row
    ax     : object
        Ignored on input: a new figure and axes are always created
    offset : int, float
        Vertical shift between consecutive rows. Defaults to the maximum of
        the column-wise average of `array`.
    border : int, float
        Fraction of the x and y ranges added as padding on each side
    labels : bool
        If False, the tick labels are removed from both axes
    bins   : array-like
        Column indices to plot (default: every column)
    show   : bool
        Show the plot; when False the current figure is closed instead
    **kwargs
        kwargs passed to fig.add_subplot()

    Returns
    -------
    matplotlib Axes : the axes created by this function

    Raises
    ------
    DimensionError
        If `array` is not two dimensional.
    """

    # Compare by value; `ndim is 2` relied on integer identity.
    if array.ndim != 2:
        raise DimensionError(
            "Invalid dimensions. Required: 2. (Actual: {})".format(array.ndim))

    if offset is None:
        offset = np.max(np.average(array, axis=0))

    fig = plt.figure(figsize=(6, 6))
    bgcolor = 'w'
    ax = fig.add_subplot(111, facecolor=bgcolor, **kwargs)
    color = 'k'

    if bins is None:
        bins = np.arange(array.shape[1])

    # Data extent: one offset of room below the first row and above the last
    x_min = 0
    x_max = len(bins) - 1
    y_min = 0 - offset
    y_max = (1 + len(array)) * offset

    # Pad the extent on every side by `border` times the range
    x_pad = (x_max - x_min) * border
    y_pad = (y_max - y_min) * border

    for i, row in enumerate(array):
        ax.plot(row[bins] + offset * i, color)

    ax.set_xlim(x_min - x_pad, x_max + x_pad)
    ax.set_ylim(y_min - y_pad, y_max + y_pad)

    if not labels:
        ax.set_xticklabels([])
        ax.set_yticklabels([])

    if show:
        plt.show()
    else:
        plt.close()

    return ax
Esempio n. 6
0
    def get_dataset_from_directory(ds_path: str, split: str, seed=42):
        """Build a supervised dataset from an image folder on disk.

        The folder is read with tfds.folder_dataset.ImageFolder. The dataset
        is sharded across instances when popdist reports more than one.

        Args:
            ds_path: Directory holding the image folder; must exist.
            split: Name of the split to load.
            seed: Not used by this function.

        Returns:
            Tuple (dataset, image shape, number of examples, number of
            classes). The example count is evaluated before sharding.

        Raises:
            NameError: If ds_path does not exist.
            UnsupportedFormat: If the built object is not a tf.data.Dataset
                or the dataset is not of the (features, labels) kind.
            DimensionError: If the first input feature is not rank 3.
        """
        directory_found = os.path.exists(ds_path)
        if not directory_found:
            DataGenerator.logger.error(f'{ds_path} does not exist')
            raise NameError(f'Directory {ds_path} does not exist')

        builder = tfds.folder_dataset.ImageFolder(ds_path)
        info_ds = builder.info
        ds = builder.as_dataset(split=split, as_supervised=True)

        if not isinstance(ds, tf.data.Dataset):
            raise UnsupportedFormat(
                f'Type of ds is not the one expected (tf.data.Dataset) {type(ds)}'
            )

        num_examples = DataGenerator.evaluate_size_dataset(ds)

        # Peek at one element to learn the image shape
        first_elem = iter(ds).get_next()
        img_shape = first_elem[0].shape

        if len(img_shape) != 3:
            raise DimensionError(
                f'Dataset input feature should have at least 3 dimensions (h,w,c) but it has {len(first_elem[0].shape)}'
            )

        if len(info_ds.supervised_keys) != 2:
            raise UnsupportedFormat(
                f'This function only handle datasets like (features, labels) not {info_ds.supervised_keys}'
            )

        label = info_ds.supervised_keys[1]
        num_classes = info_ds.features[label].num_classes

        summary = f'img shape {img_shape} number of examples {num_examples} number of classes {num_classes}'
        print(summary)

        # Give every instance its own slice of the data
        if popdist.getNumInstances() > 1:
            ds = ds.shard(index=popdist.getInstanceIndex(),
                          num_shards=popdist.getNumInstances())

        return ds, img_shape, num_examples, num_classes
Esempio n. 7
0
def get_1D_OPW_mask(vector, **kwargs):
    """
    Builds a mask for a 1D vector from the `opw` attribute of a SinglePulse.

    Parameters
    ----------
    vector   : np.ndarray
        1D data vector
    **kwargs
        kwargs passed to SinglePulse()

    Returns
    -------
    np.ndarray with one entry per element of `vector`: False where the
    element's index is contained in `SinglePulse(vector, **kwargs).opw`,
    True everywhere else.

    Raises
    ------
    DimensionError
        If `vector` is not one dimensional.
    """

    # Compare by value; `ndim is not 1` relied on integer identity.
    if vector.ndim != 1:
        raise DimensionError(
            "Input data must be 1 dimensional to create an OPW mask.")

    sp_dat = SinglePulse(vector, **kwargs)
    opw = sp_dat.opw

    return np.asarray([index not in opw for index in range(len(vector))])
Esempio n. 8
0
def histogram_and_curves(array,
                         mean=0.0,
                         std_dev=1.0,
                         bins=None,
                         x_lims=None,
                         y_lims=None,
                         x_axis='X',
                         y_axis='Y',
                         title='Title',
                         show=True,
                         filename=None,
                         curve_list=None,
                         labels=None,
                         **kwargs):
    """
    Histogram plotter for 1 or 2D data. Can compare PDFs in 1D

    Parameters
    ----------
    array      : np.ndarray
        1D data array, or 2D data array with exactly two rows (x and y).
        Anything that is not an ndarray is converted with np.array().
    mean       : int, float, [int, int], [float, float]
        Calculated mean of data
    std_dev    : int, float, [int, int], [float, float]
        Calculated standard deviation of data
    bins       : int or bin edges
        Bins of the histogram, forwarded to the matplotlib histogram call.
        If None, bin edges are derived from the data range.
    x_lims, y_lims : [int, int], [float, float]
        x and y limits of the plot. Default to mean +/- 4 std_dev.
        y_lims is only used for 2D data.
    x_axis, y_axis, title : str
        x, y and title names
    show       : bool
        Show plots (default is True)
    filename   : str
        Name of the file to save to (if None, the plot will not be saved)
    curve_list : list of callables
        List of curves to fit to the data as defined by the user (1D only)
    labels     : [str, str, ...]
        List of legend labels for the curve list. Only applied when there is
        exactly one label per curve.
    **kwargs
        kwargs passed to ax.hist() (1D) or ax.hist2d() (2D)

    Returns
    -------
    matplotlib Axes : ax

    Raises
    ------
    DimensionError
        If the array is neither 1D nor 2D with exactly two rows.
    """

    color = 'k'
    bgcolor = 'w'
    style = 'stepfilled'

    # Convert any lists or dicts to numpy arrays
    if not isinstance(array, np.ndarray):
        array = np.array(array)

    # Validate before any figure exists so a bad input does not leave an
    # empty figure behind. Value comparisons replace the original `is` tests.
    if array.ndim == 2 and array.shape[0] != 2:
        raise DimensionError(
            "Invalid array shape. Number of rows required: 2. (Actual: {})".
            format(array.shape[0]))
    if array.ndim not in (1, 2):
        raise DimensionError(
            "Invalid dimensions. Required: 2. (Actual: {})".format(array.ndim))

    # Set up figure and axes
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111, facecolor=bgcolor)
    ax.set_xlabel(x_axis)
    ax.set_ylabel(y_axis)
    ax.set_title(title)

    if array.ndim == 1:

        # Number of histogram bins to use (if not supplied by user)
        # NOTE: the step divides by |ceil(max)|, so data whose maximum rounds
        # up to 0 raises ZeroDivisionError here.
        if bins is None:
            step = (math.ceil(np.amax(array)) - math.floor(np.amin(array))) / (
                20 * abs(math.ceil(np.amax(array))))
            bins = np.arange(math.floor(np.amin(array)),
                             math.ceil(np.amax(array)), step)

        if not x_lims:
            x_min = mean - (4 * std_dev)
            x_max = mean + (4 * std_dev)
        else:
            x_min, x_max = x_lims

        # Linespace for curve plotting
        t = np.arange(x_min, x_max, 0.01)

        # Plot the 1D histogram
        n, bins, patches = ax.hist(array,
                                   bins=bins,
                                   density=True,
                                   color=color,
                                   histtype=style,
                                   linewidth=2,
                                   **kwargs)

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(0, 1.2 * np.amax(n))

        # Plot distribution curves
        if curve_list:
            # Labels are only meaningful with exactly one label per curve
            use_labels = bool(labels) and len(labels) == len(curve_list)
            any_labelled = False

            for i, curve in enumerate(curve_list):

                # Selects color from list
                color_index = i % len(color_list)
                color = color_list[color_index]

                # Find the number of fitting arguments in the desired curve
                p0_len = u.get_unique_fitting_parameter_length(curve)

                if not p0_len:
                    p0 = [mean, std_dev]
                else:
                    p0 = np.ones(p0_len)

                # Try fitting and plotting. If the fit does not converge the
                # curve is skipped; on a TypeError the curve is plotted as a
                # plain function of t.
                try:
                    try:
                        params = opt.curve_fit(curve, bins[1:], n, p0=p0)
                    except RuntimeError:
                        continue

                    line, = ax.plot(t,
                                    curve(t, *params[0]),
                                    color=color,
                                    linewidth=2)
                except TypeError:
                    line, = ax.plot(t, curve(t), color=color, linewidth=2)

                # Label each curve as it is drawn. set_label() returns None,
                # so its result must not be stored.
                if use_labels:
                    line.set_label(labels[i])
                    any_labelled = True

            if any_labelled:
                ax.legend()

        plt.grid(True)

        # Save file
        if filename is not None:
            plt.savefig(filename)
        if show:
            plt.show()

    else:
        # 2D data with exactly two rows (guaranteed by the validation above)

        # Basically determines whether mean given was [float, float] or float
        if not x_lims:
            try:
                x_min = mean[0] - (4 * std_dev[0])
                x_max = mean[0] + (4 * std_dev[0])
            except TypeError:
                x_min = mean - (4 * std_dev)
                x_max = mean + (4 * std_dev)
        else:
            x_min, x_max = x_lims

        if not y_lims:
            try:
                y_min = mean[1] - (4 * std_dev[1])
                y_max = mean[1] + (4 * std_dev[1])
            except TypeError:
                # Scalar mean/std_dev: reuse the x limits for y
                y_min = x_min
                y_max = x_max
        else:
            y_min, y_max = y_lims

        # Initialize bin size if not parsed in
        # NOTE: same division by |ceil(max)| as in the 1D case.
        if bins is None:
            step = (math.ceil(np.amax(array)) - math.floor(np.amin(array))) / (
                100 * abs(math.ceil(np.amax(array))))
            bins = [
                np.arange(math.floor(np.amin(array[0])),
                          math.ceil(np.amax(array[0])), step),
                np.arange(math.floor(np.amin(array[1])),
                          math.ceil(np.amax(array[1])), step)
            ]

        # Plot the 2D histogram
        ax.hist2d(array[0], array[1], bins=bins, **kwargs)

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        if filename is not None:
            plt.savefig(filename)
        if show:
            plt.show()
        else:
            plt.close()

    return ax
Esempio n. 9
0
    def get_imagenet(path: str,
                     split: str,
                     cycle_length: int = 4,
                     block_length: int = 4):
        """Build a dataset of ImageNet records from a directory of TFRecord files.

        Args:
            path: Directory holding the TFRecord files; must exist.
            split: 'train' selects the 'train*' files (1024 expected) and
                shuffles the file order. Any other value selects the
                'validation*' files (128 expected). The value is also used
                as the key into IMAGENET_DS_SIZE.
            cycle_length: Passed to Dataset.interleave as cycle_length and
                as num_parallel_calls.
            block_length: Passed to Dataset.interleave as block_length.

        Returns:
            Tuple (dataset, feature shape, number of examples, number of
            classes). The feature shape comes from parsing the first record;
            the number of classes is fixed at 1000.

        Raises:
            NameError: If path does not exist.
            ValueError: If the number of matching files is not the expected
                count for the split.
            DimensionError: If the parsed feature is not rank 3.
        """

        # The path is the one of dataset under TFRecord format
        if not os.path.exists(path):
            DataGenerator.logger.error(f'{path} does not exist')
            raise NameError(f'Directory {path} does not exist')

        if split == 'train':
            filenames = glob.glob1(path, 'train*')
            if len(filenames) != 1024:
                DataGenerator.logger.error(
                    f'train directory should contain 1024 tf-record files but it contains {len(filenames)} instead'
                )
                raise ValueError(
                    f'train directory should contain 1024 files but it contains {len(filenames)} instead'
                )

        else:
            filenames = glob.glob1(path, 'validation*')
            if len(filenames) != 128:
                DataGenerator.logger.error(
                    f'validation directory should contain 128 tf-record files but it contains {len(filenames)} instead'
                )
                raise ValueError(
                    f'validation directory should contain 128 tf-record files but it contains {len(filenames)} instead'
                )

        num_files = len(filenames)

        filenames = [os.path.join(path, filename) for filename in filenames]
        DataGenerator.logger.debug(f'filenames = {filenames}')
        ds = tf.data.Dataset.from_tensor_slices(filenames)

        if split == 'train':
            # Shuffle the input files
            ds = ds.shuffle(buffer_size=num_files)

        # Give every instance its own subset of the files
        if popdist.getNumInstances() > 1:
            ds = ds.shard(num_shards=popdist.getNumInstances(),
                          index=popdist.getInstanceIndex())

        ds = ds.interleave(tf.data.TFRecordDataset,
                           cycle_length=cycle_length,
                           block_length=block_length,
                           num_parallel_calls=cycle_length)

        DataGenerator.logger.info(f'dataset = {ds}')

        num_examples = IMAGENET_DS_SIZE[split]

        DataGenerator.logger.info(f'number of examples {num_examples}')

        # Parse one record to learn the feature shape
        iterator = iter(ds)
        first_elem = iterator.get_next()

        feature, _ = imagenet_processing.parse_record(first_elem, True,
                                                      tf.float32)

        if len(feature.shape) != 3:
            # Report the rank that was actually checked. The message used to
            # index into the raw record (first_elem[0]) rather than the
            # parsed feature.
            raise DimensionError(
                f'Dataset input feature should have at least 3 dimensions (h,w,c) but it has {len(feature.shape)}'
            )

        num_classes = 1000
        ds = ds.cache()

        return ds, feature.shape, num_examples, num_classes
Esempio n. 10
0
            f'--logs-per-epoch should be non-negative (>=0), it is {logs_per_epoch}'
        )

    # check for partial logs, example --logs-per-epoch 0.5 and --epochs 5
    if (logs_per_epoch > 0) and (logs_per_epoch < 1) and (
            num_epochs % (1 / logs_per_epoch) != 0):
        raise ValueError(
            f'It is not possible to log {1/logs_per_epoch} epochs a time for {num_epochs} epochs'
        )

    num_pipeline_stages = len(pipeline_splits) + 1

    if device_mapping:
        if len(device_mapping) != num_pipeline_stages:
            raise DimensionError(
                f'The number of device assignments {len(device_mapping)} is not equal to the number of pipeline splits + 1: {num_pipeline_stages}.'
            )

        if len(set(device_mapping)) != max(device_mapping) + 1:
            raise DimensionError(
                f'The model is pipelined over {len(set(device_mapping))} different IPUs, but one or more stages are being assigned to IPU {max(device_mapping) + 1}'
            )

    if eight_bit_transfer and not accelerator_side_preprocess:
        raise UnallowedConfigurationError(
            f'When eight bit transfer is enabled the normalisation must be done on the device. '
            f'If you want to keep 8bit io, set --accelerator-side-preprocess to True.'
        )

    if (eight_bit_transfer
            or accelerator_side_preprocess) and 'cifar' in model_name:
Esempio n. 11
0
def get_best_gaussian_fit(x,
                          y,
                          remove_base=True,
                          m_gauss=15,
                          bp=15,
                          p_wid=150,
                          guess=(1024, 20, 6000),
                          plot_chisq=True,
                          save_dir=None,
                          f_pre="",
                          verbose=True):
    """
    Fits an increasing number of Gaussian components to (x, y) with
    scipy.optimize.curve_fit and records a chi-squared value for every
    successful fit.

    One component (three parameters) is added per iteration. Iteration stops
    once `m_gauss` chi-squared values have been collected or `bp` components
    have been tried.

    Parameters
    ----------
    x, y        : np.ndarray
        Independent and dependent data to fit
    remove_base : bool
        If True, y is first passed through removeBase(y, 0.05)
    m_gauss     : int
        Maximum number of successful fits
    bp          : int
        Maximum number of components to try
    p_wid       : int
        The mask window size is len(model) - p_wid
    guess       : sequence of 3 values, or Nx3 array
        Initial parameters for each new component. With a 2D guess, row k is
        used for component k and row 0 once the rows are exhausted.
    plot_chisq  : bool
        Plot the chi-squared values (all but the first) when done
    save_dir    : str
        If not None, results are handed to save_all()
    f_pre       : str
        Prefix handed to save_all()
    verbose     : bool
        Print progress and the final parameters

    Returns
    -------
    m, c, ind_gaussians, mask : the model and mask of the last successful
    fit, the list of chi-squared values and the result of get_gaussians().

    Raises
    ------
    DimensionError
        If `guess` is neither 1D nor 2D.
    RuntimeError
        If not a single fit converged, so there is no model to return.
    """

    if remove_base:
        y = removeBase(y, 0.05)

    guess_shape = np.asarray(guess).shape

    if len(guess_shape) not in (1, 2):
        raise DimensionError(
            "Initial guess parameters must be a Nx3 array. Current shape of array is: {}"
            .format(guess_shape))

    n_gauss = 0
    params, c = [], []

    # Start empty so that a failure on the very first fit can still append
    # the guess instead of hitting an unbound name.
    fitted_params = np.array([])
    m, mask = None, None

    while (len(c) < m_gauss) and (n_gauss < bp):

        # Initial parameters of the component added in this iteration
        if len(guess_shape) == 1:
            component = guess
        else:
            try:
                component = guess[n_gauss]
            except IndexError:
                component = guess[0]

        params.extend(component)
        n_gauss = len(params) // 3

        try:
            fitted_params, _ = scipy.optimize.curve_fit(multi_norm,
                                                        x,
                                                        y,
                                                        p0=params)
        except RuntimeError:
            # No convergence: keep fitted_params the same length as params by
            # appending the very guess that was just added to params.
            fitted_params = np.append(fitted_params, component)

            if verbose:
                print("No fit for {} gaussians".format(n_gauss))
                if n_gauss == bp:
                    print(
                        "Maximum number of tries reached ({})".format(n_gauss))
            continue

        if (n_gauss == bp) and verbose:
            print("Maximum number of tries reached ({})".format(n_gauss))

        m = multi_norm(x, *fitted_params)
        mask = get_1D_OPW_mask(m, windowsize=(len(m) - p_wid))

        # Zero the model wherever the mask is False
        m[np.logical_not(mask)] = 0

        # Compare data and model only where the mask is set
        selected = (mask == 1)
        chi2, p = scipy.stats.chisquare(y[selected], f_exp=m[selected])
        if verbose:
            print("Chi-sq for {} gaussians: ".format(n_gauss), chi2)
        c.append(chi2)

    if m is None:
        raise RuntimeError(
            "No Gaussian fit converged; there is no model to return.")

    if verbose:
        print_parameters(params, fitted_params, n_gauss)

    if plot_chisq:
        plt.plot(c[1:])
        plt.show()
        plt.close()

    ind_gaussians = get_gaussians(fitted_params, n_gauss)

    if save_dir is not None:
        save_all(save_dir, f_pre, n_gauss, ind_gaussians, m)

    return m, c, ind_gaussians, mask
Esempio n. 12
0
    def configure_model(model: keras.Model,
                        gradient_accumulation_count: int,
                        pipeline_splits: list,
                        device_mapping: list,
                        pipeline_schedule: str,
                        available_memory_proportion: list,
                        optimizer_state_offloading: bool = True):
        """Apply gradient accumulation, and pipelining if requested, to a model.

        Without pipeline splits only the gradient accumulation options are
        set. With splits the model is pipelined through
        ModelFactory.pipeline_model and the pipelining options are set.

        Args:
            model: Model to configure.
            gradient_accumulation_count: Gradient accumulation steps per
                replica.
            pipeline_splits: Pipeline split points; empty or None disables
                pipelining.
            device_mapping: Optional device index per pipeline stage.
            pipeline_schedule: Name of a pipelining_ops.PipelineSchedule
                member.
            available_memory_proportion: Either a single global value, or
                two values per pipeline stage given in percent (convolution
                value first, then matmul value).
            optimizer_state_offloading: Passed on as
                offload_weight_update_variables.

        Returns:
            The configured model.

        Raises:
            DimensionError: If device_mapping or available_memory_proportion
                does not match the number of pipeline stages.
        """
        # Non-pipelined model: gradient accumulation only
        if not pipeline_splits:
            model.set_gradient_accumulation_options(
                gradient_accumulation_steps_per_replica=
                gradient_accumulation_count,
                offload_weight_update_variables=optimizer_state_offloading)
            return model

        model = ModelFactory.pipeline_model(model, pipeline_splits)

        # Resolve the schedule name to the matching enum member
        schedule_name = pipeline_schedule
        pipeline_schedule = next(
            candidate for candidate in list(pipelining_ops.PipelineSchedule)
            if schedule_name == str(candidate).split(".")[-1])

        stage_count = len(pipeline_splits) + 1

        if device_mapping:
            if len(device_mapping) != stage_count:
                raise DimensionError(
                    f'The number of device assignments {len(device_mapping)} is not equal to the number of pipeline splits + 1: {len(pipeline_splits) + 1}.'
                )

            # The used device indices must be exactly 0..max
            if len(set(device_mapping)) != max(device_mapping) + 1:
                raise DimensionError(
                    f'The model is pipelined over {len(set(device_mapping))} different IPUs, but one or more stages are being assigned to IPU {max(device_mapping) + 1}'
                )

        stage_kwargs = {}
        if len(available_memory_proportion) > 1:

            if len(available_memory_proportion) != 2 * stage_count:
                raise DimensionError(
                    'Define a single global value of available memory proportion or two values per pipeline stage. '
                    f'There are {len(pipeline_splits) + 1} pipeline stages defined and {len(available_memory_proportion)} values of '
                    'available memory proportion')

            # Values come in (convolution, matmul) pairs, one pair per stage
            conv_values = available_memory_proportion[0::2]
            matmul_values = available_memory_proportion[1::2]

            options = []
            for conv_percent, matmul_percent in zip(conv_values,
                                                    matmul_values):
                options.append(
                    pipelining_ops.PipelineStageOptions(
                        convolution_options={
                            "availableMemoryProportion":
                            str(conv_percent / 100.0)
                        },
                        matmul_options={
                            "availableMemoryProportion":
                            str(matmul_percent / 100.0)
                        }))

            stage_kwargs['forward_propagation_stages_poplar_options'] = options
            stage_kwargs['backward_propagation_stages_poplar_options'] = options

        model.set_pipelining_options(
            gradient_accumulation_steps_per_replica=
            gradient_accumulation_count,
            pipeline_schedule=pipeline_schedule,
            device_mapping=device_mapping,
            offload_weight_update_variables=optimizer_state_offloading,
            **stage_kwargs)

        return model