Example No. 1
def mad_scaling(data):
    """Return robust per-column bounds: mean -/+ MAD."""
    L, N = data.shape

    offset = np.mean(data, axis=0) - mad(data, axis=0, c=.6)
    top = np.mean(data, axis=0) + mad(data, axis=0, c=.6)
    return offset, top
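A minimal usage sketch of the function above (the `signals` array is hypothetical; `numpy` and statsmodels' `mad` are assumed to be imported as in the snippet): the returned bounds give a robust per-column range that the data can be clipped to.

import numpy as np
from statsmodels.robust.scale import mad

# hypothetical data: 1000 samples x 4 channels, with a few extreme values
rng = np.random.default_rng(0)
signals = rng.normal(size=(1000, 4))
signals[::100] += 25.0

offset, top = mad_scaling(signals)       # per-column lower / upper bounds
clipped = np.clip(signals, offset, top)  # squash values outside the robust range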
Example No. 2
 def find_bad_by_hf_noise(self, hf_noise_thresh=3.29053):
     """Detect channels that contain high frequency (hf) noise.
     Use a robust estimate of the ratio of the power of the high frequency
     components to the power of the low frequency components. This function
     depends on the `low_cut` and `high_cut` parameters given at
     initialization, as they determine the bandpass.
     This function works on robust z-scores. You might want to
     select the thresholds according to how much of the data is expected
     to fall within the absolute bounds:
     95.0% --> 1.95996
     97.0% --> 2.17009
     99.0% --> 2.57583
     99.9% --> 3.29053
     Parameters
     ----------
     hf_noise_thresh : float
         The threshold for z-scores, when exceeded: classify as bad.
     """
     # Determine z-scored level of estimated signal-to-noise
     # ratio for each channel
     noisiness = (mad(self.x - self.x_bp, c=1, axis=1) /
                  mad(self.x_bp, c=1, axis=1))
     noisiness_median = np.median(noisiness)
     # robust estimate of STD
     noisiness_sd = mad(noisiness, c=1, axis=0) * 1.4826
     hf_noise_z = (noisiness - noisiness_median) / noisiness_sd
     bad_idxs_bool = hf_noise_z > hf_noise_thresh
     bad_idxs = np.argwhere(bad_idxs_bool)
     bads = self.ch_names[bad_idxs.astype(int)]
     bads = [i[0] for i in bads]
     bads.sort()
     self.bad_by_hf_noise = bads
     self._channel_hf_noise = hf_noise_z
     return None
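The percentage table in the docstring is just two-sided standard-normal coverage; a small sketch (on a hypothetical 1-D sample) reproduces those quantiles and the robust z-score logic used above:

import numpy as np
from scipy.stats import norm
from statsmodels.robust.scale import mad

# 99.9% two-sided coverage -> norm.ppf(1 - (1 - 0.999) / 2) ~= 3.29053
for coverage in (0.95, 0.97, 0.99, 0.999):
    print(coverage, norm.ppf(1 - (1 - coverage) / 2))

# robust z-scores of a 1-D sample, mirroring the channel-wise logic above
x = np.random.default_rng(1).normal(size=500)
x[:5] += 8.0                          # a few artificial outliers
robust_sd = mad(x, c=1) * 1.4826      # raw MAD rescaled to be std-consistent
z = (x - np.median(x)) / robust_sd
print(np.where(z > 3.29053)[0])       # indices flagged as bad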
Example No. 3
def cnr(img, seg, lbl=None):
    r"""
    Calculate the :abbr:`CNR (Contrast-to-Noise Ratio)` [Magnota2006]_.
    Higher values are better.

    .. math::

        \text{CNR} = \frac{|\mu_\text{GM} - \mu_\text{WM} |}{\sigma_B},

    where :math:`\sigma_B` is the standard deviation of the noise distribution within
    the air (background) mask.


    :param numpy.ndarray img: input data
    :param numpy.ndarray seg: input segmentation
    :return: the computed CNR

    """
    if lbl is None:
        lbl = FSL_FAST_LABELS

    noise_std = mad(img[seg == lbl['bg']])
    if noise_std < 1.0:
        noise_std = np.average(mad(img[seg == lbl['gm']]) +
                               mad(img[seg == lbl['wm']]) +
                               mad(img[seg == lbl['csf']]))

    return float(np.abs(np.median(img[seg == lbl['gm']]) - np.median(img[seg == lbl['wm']])) / \
                 noise_std)
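A hedged usage sketch with synthetic volumes; the label dictionary below is hypothetical and simply mirrors the keys the function looks up (in the original project the default comes from FSL_FAST_LABELS), and `numpy` plus statsmodels' `mad` are assumed to be available as in the original module.

import numpy as np

rng = np.random.default_rng(2)
shape = (32, 32, 32)
seg = rng.integers(0, 4, size=shape)             # 0=bg, 1=csf, 2=gm, 3=wm
img = np.where(seg == 3, 110.0, np.where(seg == 2, 70.0, 10.0))
img = img + rng.normal(scale=5.0, size=shape)    # add noise everywhere

labels = {'bg': 0, 'csf': 1, 'gm': 2, 'wm': 3}   # hypothetical label mapping
print(cnr(img, seg, lbl=labels))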
 def test_mad_empty(self):
     empty = np.empty(0)
     assert np.isnan(scale.mad(empty))
     empty = np.empty((10, 100, 0))
     assert_equal(scale.mad(empty, axis=1), np.empty((10, 0)))
     empty = np.empty((100, 100, 0, 0))
     assert_equal(scale.mad(empty, axis=-1), np.empty((100, 100, 0)))
 def test_mad_center(self):
     n = scale.mad(self.X, center=0)
     assert_equal(n.shape, (10, ))
     with pytest.raises(TypeError):
         scale.mad(self.X, center=None)
     assert_almost_equal(
         scale.mad(self.X, center=1),
         np.median(np.abs(self.X - 1), axis=0) / Gaussian.ppf(3 / 4.),
         DECIMAL)
Example No. 6
def test_mad_axis_none():
    # GH 7027
    a = np.array([[0, 1, 2], [2, 3, 2]])

    def m(x):
        return np.median(x)

    direct = mad(a=a, axis=None)
    custom = mad(a=a, axis=None, center=m)
    axis0 = mad(a=a.ravel(), axis=0)

    np.testing.assert_allclose(direct, custom)
    np.testing.assert_allclose(direct, axis0)
Example No. 7
 def _estimate_scale(self, resid):
     """
     Estimates the scale based on the option provided to the fit method.
     """
     if isinstance(self.scale_est, str):
         if self.scale_est.lower() == "mad":
             return scale.mad(resid, center=0)
         if self.scale_est.lower() == "stand_mad":
             return scale.mad(resid)
         else:
             raise ValueError("Option %s for scale_est not understood" % self.scale_est)
     elif isinstance(self.scale_est, scale.HuberScale):
         return self.scale_est(self.df_resid, self.nobs, resid)
     else:
         return scale.scale_est(self, resid) ** 2
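For context, this method is driven by the `scale_est` option of `RLM.fit`; a minimal sketch of how the two branches are typically exercised (robust regression on hypothetical heavy-tailed data), assuming the standard statsmodels API, might look like:

import numpy as np
import statsmodels.api as sm
from statsmodels.robust import scale

rng = np.random.default_rng(3)
X = sm.add_constant(rng.normal(size=(200, 2)))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.standard_t(df=3, size=200)

fit_mad = sm.RLM(y, X).fit(scale_est='mad')                 # MAD-based scale
fit_huber = sm.RLM(y, X).fit(scale_est=scale.HuberScale())  # Huber's proposal 2
print(fit_mad.scale, fit_huber.scale)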
Example No. 8
    def denoise(self):
        """denoise the data using the 2stage kurtosis denoising"""

        #make sure the data has a len divisible by 2^2
        self.len_swt = self.len
        while not (self.len_swt / 4).is_integer():
            self.len_swt -= 1

        inp = self.input_nobase[:self.len_swt]
        self.wave = pywt.Wavelet(self.wave_type)
        nLevel = pywt.swt_max_level(self.len_swt)
        self.coeffs = pywt.swt(inp, self.wave, level=2)

        print(" \t Denoise STW coefficients \t %1.2f %1.2f" %
              (self.TK, self.TT))
        (cA2, cD2), (cA1, cD1) = self.coeffs

        # rolling kurtosis
        k2 = self._rolling_kts(cD2, self.nwin)
        k1 = self._rolling_kts(cD1, self.nwin)

        # thresholding
        cD2[k2 < self.TK] = 0
        cD1[k1 < self.TK] = 0

        cA2[k2 < self.TK] = 0
        cA1[k1 < self.TK] = 0

        # universal threshold
        sigma_roll_1 = mad(cD1[cD1 != 0]) * np.ones(self.len_swt)
        uthresh_roll_1 = self.TT * sigma_roll_1 * np.sqrt(
            2 * np.log(self.len_swt))
        cD1[abs(cD1) < uthresh_roll_1] = 0

        # universal threshold
        sigma_roll_2 = mad(cD2[cD2 != 0]) * np.ones(self.len_swt)
        uthresh_roll_2 = self.TT * sigma_roll_2 * np.sqrt(
            2 * np.log(self.len_swt))
        cD2[abs(cD2) < uthresh_roll_2] = 0

        # final threshold
        cA1[cD1 == 0] = 0
        cA2[cD2 == 0] = 0
        self.denoised_coeffs = [(cA1, cD1), (cA2, cD2)]

        # denoise the data
        #self.input_denoised = self._iswt(self.denoised_coeffs,self.wave)
        self.input_denoised = pywt.iswt(self.denoised_coeffs, self.wave)
Example No. 9
def compute_sensitivity_map(model_params, method, xcenters, ycenters, residuals, knots, nearIndices, xBinSize, yBinSize, ind_kdtree, gw_kdtree, pld_intensities, model):
    if 'bliss' in method.lower():
        normFactor = (1/xBinSize) * (1/yBinSize)
        sensitivity_map = bliss.BLISS(xcenters, ycenters, residuals, knots, nearIndices, xBinSize=xBinSize, yBinSize=yBinSize, normFactor=normFactor)
    elif 'krdata' in method.lower():
        sensitivity_map  = np.sum(residuals[ind_kdtree]  * gw_kdtree, axis=1)
    elif 'pld' in method.lower():
        PLDcoeffs = [val.value for val in model_params.values() if 'pld' in val.name.lower()]
        sensitivity_map = np.dot(PLDcoeffs, pld_intensities)
    else:
        raise ValueError('Invalid method: {}'.format(method))
    
    nSig = 10
    vbad_sm = np.where(abs(sensitivity_map - np.median(sensitivity_map)) > nSig*scale.mad(sensitivity_map))[0]
    if len(sensitivity_map)-1 in vbad_sm:
        vbad_sm = list(set(vbad_sm) - set([len(sensitivity_map)-1]))
        end_corner_case = True
    else:
        end_corner_case = False
    if 0 in vbad_sm:
        vbad_sm = list(set(vbad_sm) - set([0]))
        start_corner_case = True
    else:
        start_corner_case = False
    
    vbad_sm = np.array(vbad_sm)
    sensitivity_map[vbad_sm] = 0.5*(sensitivity_map[vbad_sm-1] + sensitivity_map[vbad_sm+1])
    
    if end_corner_case: sensitivity_map[-1] = sensitivity_map[-2]
    if start_corner_case: sensitivity_map[0] = sensitivity_map[1]

    return sensitivity_map
Example No. 10
def mad_outliers(data, genes, threshold, percentile=95, as_json=True):
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    if as_json:
        yield ("{\"outliers\":[")
    for g in genes:
        row_values = data.loc[g, :]
        cut_row_values = row_values
        med = cut_row_values.median()
        row_mad = mad(cut_row_values)

        if row_mad != 0.0:
            filtered = (cut_row_values - med) / row_mad
            support = len(filtered[filtered > threshold])

            if scoreatpercentile(filtered, percentile) > threshold:

                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename, "type": info.genetype, "samples": str(support),
                                  "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
    if len(outliers_id) > 0:
        pr.save_outliers(1, outliers_id)

    if as_json:
        yield ("]}")
Example No. 11
def MT_cutoff(ax, df, key, mads, max_value, mt_perc):
    if key not in df.columns:
        print(f'Error: {key} not in the barcode data columns')
        sys.exit()
    mols = df['molecules'].values
    pmt = 100.0 * df[key].values / mols
    med = NP.median(pmt)
    if mt_perc < 0.0:
        md = mad(pmt)
        co = min(mads * md + med, max_value)
    else:
        co = mt_perc

    ax.axhline(med, color=plt.cm.Set1.colors[0])
    ax.axhline(co, color=plt.cm.Set1.colors[0], ls='--')
    density_scatter(ax, mols, pmt, logx=True)
    ax.minorticks_on()
    ax.set_axisbelow(True)
    ax.grid(color='0.5', lw=0.5, ls='--')
    ax.set_xlabel('Molecules')
    ax.set_ylabel('MT (%)')
    passed = pmt <= co
    df['passed'] = df['passed'] & passed
    ax.set_title(
        f'Cutoff = {co:.2f}%\nPassed = {passed.sum():,} / {len(df):,}')
Example No. 12
def summary_stats(x, save_table=False, print_table=True):
    """Descriptive stats for input np array or pd series x"""

    mean = x.mean()  # using array method (Numpy) / Series method (Pandas)

    median = x.median()  # using array method (Numpy) / Series method (Pandas)

    std = x.std()  # using array method (Numpy) / Series method (Pandas)

    iqr = ss.iqr(x)  # using Scipy Stats

    # c is a normalization constant
    # that we only need/want if we are relating MAD to the standard deviation
    #   https://en.wikipedia.org/wiki/Median_absolute_deviation#Relation_to_standard_deviation
    mad_sm = mad(
        x, c=1)  # using StatsModels (Pandas only has *mean* absolute dev)

    skew = ss.skew(x)  # using Scipy Stats; Series also have skew() method

    skew_yk = calc_YK_pd(x)

    names = ['mean', 'median', 'std', 'IQR', 'MAD', 'skewness', 'Y-K']
    varz = [mean, median, std, iqr, mad_sm, skew,
            skew_yk]  # `vars` is a built-in
    d = OrderedDict([(n, v) for n, v in zip(names, varz)])

    return d
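A short sketch of the `c` behaviour the comment above refers to: with the statsmodels default (`c` roughly 0.6745) `mad(x)` is already rescaled to estimate the standard deviation of Gaussian data, while `mad(x, c=1)` returns the raw median absolute deviation (the data below is synthetic).

import numpy as np
from statsmodels.robust.scale import mad

x = np.random.default_rng(4).normal(loc=0.0, scale=2.0, size=100_000)

raw_mad = mad(x, c=1)   # plain median(|x - median(x)|)
sigma_hat = mad(x)      # raw MAD / 0.6745, comparable to the std
print(raw_mad, raw_mad * 1.4826, sigma_hat, np.std(x))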
Example No. 13
 def _filter_peaks_without_replicates(self, df):
     # calculate mad for original data frame
     median_abs_dev_from_zero = mad(df.loc[:, self._exp_lib_list].mean(
         axis=1), center=0.0)
     # minimum expression cutoff based on mean over experiment libraries
     print("Removing peaks based on mad cutoff from DataFrame "
           "with {} rows...".format(len(df)), flush=True)
     t_start = time()
     min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
     print("Minimal peak expression based on mean over RIP/CLIP "
           "libraries:" "{} (MAD from zero: {})".format(
               min_expr, median_abs_dev_from_zero), flush=True)
     df = df.loc[df.loc[:, self._exp_lib_list].mean(axis=1) >= min_expr, :]
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     if df.empty:
         return df
     # minimum fold change
     print("Removing windows based on minimum fold change from DataFrame "
           "with {} rows...".format(len(df)), flush=True)
     t_start = time()
     df = df.query('fold_change >= @self._fc_cutoff')
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     return df
Example No. 14
    def find_bad_by_flat(self, flat_thresh=1, std_thresh=1):
        """Detect channels containing constant or very small values.

        Use the median absolute deviation and the standard deviation
        to find channels that have consistently low values.

        Parameters
        ----------
        flat_thresh : float
            Channels with a median absolute deviation below `flat_thresh`
            will be considered bad_by_flat.

        std_thresh : float
            Channels with a standard deviation below `std_thresh`
            will be considered bad_by_flat.

        """
        bad_by_mad = mad(self.x, c=1, axis=1) < flat_thresh
        bad_by_std = np.std(self.x, axis=1) < std_thresh
        bad_idxs = np.argwhere(np.logical_or(bad_by_mad, bad_by_std))
        bads = self.ch_names[bad_idxs.astype(int)]
        bads = [i[0] for i in bads]
        bads.sort()
        self.bad_by_flat = bads
        return None
Example No. 15
def artifact_mask(imdata, airdata, distance, zscore=10.):
    """Computes a mask of artifacts found in the air region"""
    from statsmodels.robust.scale import mad

    if not np.issubdtype(airdata.dtype, np.integer):
        airdata[airdata < .95] = 0
        airdata[airdata > 0.] = 1

    bg_img = imdata * airdata
    if np.sum((bg_img > 0).astype(np.uint8)) < 100:
        return np.zeros_like(airdata)

    # Find the background threshold (the most frequently occurring value
    # excluding 0)
    bg_location = np.median(bg_img[bg_img > 0])
    bg_spread = mad(bg_img[bg_img > 0])
    bg_img[bg_img > 0] -= bg_location
    bg_img[bg_img > 0] /= bg_spread

    # Apply this threshold to the background voxels to identify voxels
    # contributing artifacts.
    qi1_img = np.zeros_like(bg_img)
    qi1_img[bg_img > zscore] = 1
    qi1_img[distance < .10] = 0

    # Create a structural element to be used in an opening operation.
    struc = nd.generate_binary_structure(3, 1)
    qi1_img = nd.binary_opening(qi1_img, struc).astype(np.uint8)
    qi1_img[airdata <= 0] = 0

    return qi1_img
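A quick sketch on synthetic volumes (all inputs are hypothetical): a small bright blob is planted in the air region and should be recovered by the returned mask.

import numpy as np
from scipy import ndimage as nd

rng = np.random.default_rng(8)
imdata = np.abs(rng.normal(scale=10.0, size=(40, 40, 40)))
airdata = np.ones_like(imdata)
airdata[10:30, 10:30, 10:30] = 0      # pretend the centre block is head, the rest air
imdata[:3, :3, :3] = 1e4              # artificial bright blob in the air region
distance = nd.distance_transform_edt(airdata)

artifacts = artifact_mask(imdata, airdata, distance, zscore=10.)
print(int(artifacts.sum()))           # number of voxels flagged as artifact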
Example No. 16
 def _estimate_scale(self, resid):
     """
     Estimates the scale based on the option provided to the fit method.
     """
     if isinstance(self.scale_est, str):
         if self.scale_est.lower() == 'mad':
             return scale.mad(resid, center=0)
         if self.scale_est.lower() == 'stand_mad':
             return scale.mad(resid)
         else:
             raise ValueError("Option %s for scale_est not understood" %
                              self.scale_est)
     elif isinstance(self.scale_est, scale.HuberScale):
         return self.scale_est(self.df_resid, self.nobs, resid)
     else:
         return scale.scale_est(self, resid)**2
Example No. 17
 def _prefilter_windows_deseq(self, df):
     print("Removing windows where not all experiment libs show "
           "expression from DataFrame with {} rows...".format(len(df)),
           flush=True)
     t_start = time()
     for exp_lib in self._exp_lib_list:
         exp_lib_zero_count = 0.0
         df = df.loc[(df.loc[:, exp_lib] > exp_lib_zero_count), :]
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     if df.empty:
         return df
     initial_window_df = df.copy()
     # normalize counts on initial windows
     initial_window_df[self._lib_names_list] = initial_window_df[
         self._lib_names_list].div(self._size_factors, axis='columns')
     # minimum expression cutoff based on mean over experiment libraries
     print("Removing windows based on mad cutoff from DataFrame "
           "with {} rows...".format(len(df)), flush=True)
     t_start = time()
     median_abs_dev_from_zero = mad(initial_window_df.loc[
         :, self._exp_lib_list].mean(axis=1), center=0.0)
     min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
     print("Minimal window expression based on mean over RIP/CLIP "
           "libraries: {} (MAD from zero: {})".format(
               min_expr, median_abs_dev_from_zero), flush=True)
     df = df.loc[initial_window_df.loc[:, self._exp_lib_list].mean(
         axis=1) >= min_expr, :]
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     return df
Example No. 18
def summary_stats(x, save_table=False):
    """Create descriptive stats table for input np array or pd series x
    save table if desired
    """

    mean = x.mean()  # using array method (Numpy) / Series method (Pandas)
    assert (np.isclose(mean, x.sum() / x.size))

    median = x.median()  # using array method (Numpy) / Series method (Pandas)
    assert (np.isclose(median, calc_median(x)))

    std = x.std()  # using array method (Numpy) / Series method (Pandas)
    assert (np.isclose(std, calc_std(x)))

    iqr = ss.iqr(x)  # using Scipy Stats
    assert (np.isclose(iqr, calc_iqr(x)))
    assert (np.isclose(iqr, calc_iqr_pd(x)))

    # c is a normalization constant
    # that we only need if we are relating MAD to the standard deviation
    # https://en.wikipedia.org/wiki/Median_absolute_deviation#Relation_to_standard_deviation
    mad_sm = mad(x,
                 c=1)  # using StatsModels (Pandas only has *mean* absolute dev)
    assert (np.isclose(mad_sm, calc_mad(x)))

    skew = ss.skew(x)  # using Scipy Stats; Series also have skew() method
    assert (np.isclose(skew, calc_skew(x)))
Example No. 19
def artifact_mask(imdata, airdata, distance, zscore=10.):
    """Computes a mask of artifacts found in the air region"""
    from statsmodels.robust.scale import mad

    if not np.issubdtype(airdata.dtype, np.integer):
        airdata[airdata < .95] = 0
        airdata[airdata > 0.] = 1

    bg_img = imdata * airdata
    if np.sum((bg_img > 0).astype(np.uint8)) < 100:
        return np.zeros_like(airdata)

    # Find the background threshold (the most frequently occurring value
    # excluding 0)
    bg_location = np.median(bg_img[bg_img > 0])
    bg_spread = mad(bg_img[bg_img > 0])
    bg_img[bg_img > 0] -= bg_location
    bg_img[bg_img > 0] /= bg_spread

    # Apply this threshold to the background voxels to identify voxels
    # contributing artifacts.
    qi1_img = np.zeros_like(bg_img)
    qi1_img[bg_img > zscore] = 1
    qi1_img[distance < .10] = 0

    # Create a structural element to be used in an opening operation.
    struc = nd.generate_binary_structure(3, 1)
    qi1_img = nd.binary_opening(qi1_img, struc).astype(np.uint8)
    qi1_img[airdata <= 0] = 0

    return qi1_img
Example No. 20
def gcor(func, mask=None):
    """
    Compute the :abbr:`GCOR (global correlation)` [Saad2013]_.

    :param numpy.ndarray func: input fMRI dataset, after motion correction
    :param numpy.ndarray mask: 3D brain mask
    :return: the computed GCOR value

    """
    import numpy as np
    from statsmodels.robust.scale import mad

    # Reshape to N voxels x T timepoints
    func_v = func.reshape(-1, func.shape[-1])

    if mask is not None:
        func_v = np.squeeze(func_v.take(np.where(mask.reshape(-1) > 0), axis=0))

    func_sigma = mad(func_v, axis=1)
    mask = np.zeros_like(func_sigma)
    mask[func_sigma > 1.e-5] = 1

    # Remove zero-variance voxels across time axis
    func_v = np.squeeze(func_v.take(np.where(mask > 0), axis=0))
    func_sigma = func_sigma[mask > 0]
    func_mean = np.median(func_v, axis=1)

    zscored = func_v - func_mean[..., np.newaxis]
    zscored /= func_sigma[..., np.newaxis]

    # avg_ts is an N timepoints x 1 vector
    avg_ts = zscored.mean(axis=0)
    return float(avg_ts.T.dot(avg_ts) / len(avg_ts))
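A minimal sketch on random data (the 4D array and mask are hypothetical); for pure noise the global correlation should come out close to zero.

import numpy as np

rng = np.random.default_rng(9)
func = rng.normal(size=(10, 10, 10, 50))     # x, y, z, time
mask = np.ones((10, 10, 10), dtype=np.uint8)
print(gcor(func, mask))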
Example No. 21
File: misc.py  Project: kaitj/mriqc
def reorient_and_discard_non_steady(in_file, float32=False):
    import nibabel as nb
    import os
    import numpy as np
    from statsmodels.robust.scale import mad

    _, outfile = os.path.split(in_file)

    nii = nb.as_closest_canonical(nb.load(in_file))
    in_data = nii.get_data()

    # downcast to reduce space consumption and improve performance
    if float32 and np.dtype(in_data.dtype).itemsize > 4:
        in_data = in_data.astype(np.float32)

    data = in_data[:, :, :, :50]
    timeseries = data.max(axis=0).max(axis=0).max(axis=0)
    outlier_timecourse = (timeseries - np.median(timeseries)) / mad(
        timeseries)
    exclude_index = 0
    for i in range(10):
        if outlier_timecourse[i] > 10:
            exclude_index += 1
        else:
            break

    nb.Nifti1Image(in_data[:, :, :, exclude_index:], nii.affine, nii.header).to_filename(outfile)
    nii.uncache()
    return exclude_index, os.path.abspath(outfile)
Example No. 22
def med_mad(x, cut_point=3.5, cut_off=True):
    medmad = np.abs((x - np.median(x)) / mad(x))

    if cut_off:
        return medmad < cut_point
    else:
        return medmad
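A small usage sketch (the 1-D array is hypothetical) showing both modes: with cut_off=True the function returns a boolean inlier mask, otherwise the robust z-scores themselves.

import numpy as np

x = np.concatenate([np.random.default_rng(5).normal(size=200), [15.0, -20.0]])

inliers = med_mad(x)                 # boolean mask, True where the score < 3.5
scores = med_mad(x, cut_off=False)   # |x - median| / MAD for every element
print(x[~inliers], scores.max())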
Example No. 23
    def __getitem__(self, index):
        selection_index = self._selection_order[index]
        x_data = self._x_data[selection_index:selection_index +
                              self._sequence_length]
        local_median = np.median(x_data)
        local_median_absolute_deviation = scale.mad(x_data, c=1)
        local_mean = np.mean(x_data)
        # x_data_last = x_data[self._actual_node]

        data = [
            self._median, self._median_absolute_deviation, self._mean,
            local_median, local_median_absolute_deviation, local_mean
        ]
        for item in x_data:
            data.append(item)
            data.append(item - self._mean)
            data.append(item - self._median)
            data.append(item - self._median_absolute_deviation)
            data.append(item - local_mean)
            data.append(item - local_median)
            data.append(item - local_median_absolute_deviation)

        # return np.reshape(x_data, (NUMBER_CHANNELS, -1)), values, self._y_data[selection_index + self._actual_node]
        return np.array(data), self._y_data[selection_index +
                                            self._actual_node]
Example No. 24
def get_saccade_bounds(chunks_eye):
    ubs = np.zeros(chunks_eye.shape[1])
    lbs = np.zeros(chunks_eye.shape[1])
    mean_sub_chunks_eye = chunks_eye - chunks_eye.mean(axis=1, keepdims=True)

    # Desaccading Procedure
    clipped_mask = np.ones(chunks_eye.shape[0], dtype=bool)
    for t in range(chunks_eye.shape[1]):
        # First, take a column of the chunks array to get a distribution for
        # a single time point
        clipped_distribution = mean_sub_chunks_eye[:, t]

        clipped_mask[(clipped_distribution > 40) |
                     (clipped_distribution < -40)] = False
        if sum(clipped_mask) > 5:
            clipped_distribution = clipped_distribution[clipped_mask]
        # Then, clip that distribution to only contain feasible velocities. For this
        # data, feasible values were between -40 and 40
        # clipped_distribution = [val for val in tp_distribution if val > -40 and val < 40]

        # Check to see that the clipped distribution is non-empty. If it is, reset it to be the
        # unprocessed timepoint distribution (may need to revise this later)
        # if len(clipped_distribution) == 0:
        #   clipped_distribution = tp_distribution

        # Compute the Median Absolute Deviation statistic, and use it to set the desaccading bounds
        # with relation to the median. (Also may want to change this later, possibly to a mode-based
        # stat or asymmetric trimmed mean)
        MAD = mad(clipped_distribution)
        median = np.median(clipped_distribution)
        ubs[t] = median + 3.5 * MAD
        lbs[t] = median - 3.5 * MAD
    saccade_bounds_dict = dict(zip(["ubs", "lbs"], [ubs, lbs]))
    return saccade_bounds_dict
Example No. 25
def robust_stats(xs):
    """
    https://en.wikipedia.org/wiki/
    Median_absolute_deviation#Relation_to_standard_deviation
    """
    scale = mad(xs) * 1.4826
    loc = np.median(xs)
    return loc, scale
def clipOutlier2D(arr2D, nSig=10):
    arr2D     = arr2D.copy()
    medArr2D  = median(arr2D,axis=0)
    sclArr2D  = np.sqrt(((scale.mad(arr2D)**2.).sum()))
    outliers  = abs(arr2D - medArr2D) >  nSig*sclArr2D
    inliers   = abs(arr2D - medArr2D) <= nSig*sclArr2D
    arr2D[outliers] = median(arr2D[inliers],axis=0)
    return arr2D
    def transform_into_feature_row(self, rr, hrdata=None):
        hr = hrdata
        if hr is None:
            hr = 60 / rr
        hrv_attrs = self.calculate_hrv_attrs(rr)
        mean_hr = np.mean(hr)
        mean_rr = np.mean(rr)
        std_hr = np.std(hr)
        std_rr = hrv_attrs['sdnn']

        hr_above_mean_plus_std = hr[np.where(hr > mean_hr + std_hr)]
        hr_below_mean_minus_std = hr[np.where(hr < mean_hr - std_hr)]
        rr_above_mean_plus_std = rr[np.where(rr > mean_rr + std_rr)]
        rr_below_mean_minus_std = rr[np.where(rr < mean_rr - std_rr)]

        attrs = {
            'mean_hr': mean_hr,
            'min_hr': hr.min(),
            'max_hr': hr.max(),
            'std_hr': std_hr,
            'kurtosis_hr': kurtosis(hr),
            'skewness_hr': skew(hr),
            'hr_above_mean_plus_std': len(hr_above_mean_plus_std) / len(hr),
            'hr_below_mean_minus_std': len(hr_below_mean_minus_std) / len(hr),
            'mad_rr': mad(rr),
            'mean_rr': mean_rr,
            'kurtosis_rr': kurtosis(rr),
            'skewness_rr': skew(rr),
            'rr_above_mean_plus_std': len(rr_above_mean_plus_std) / len(rr),
            'rr_below_mean_minus_std': len(rr_below_mean_minus_std) / len(rr),
            'sdnn': hrv_attrs['sdnn'],
            'rmssd': hrv_attrs['rmssd'],
            'sdsd': hrv_attrs['sdsd'],
            'pnn20': hrv_attrs['pnn20'],
            'pnn50': hrv_attrs['pnn50'],
            'sd1': hrv_attrs['sd1'],
            'sd2': hrv_attrs['sd2'],
            'sd2_sd1_ratio': hrv_attrs['sd2_sd1_ratio'],
            'lf': hrv_attrs['lf'],
            'hf': hrv_attrs['hf'],
            'lfhf': hrv_attrs['lfhf']
        }

        attrs_as_row = [
            attrs['mean_hr'], attrs['min_hr'], attrs['max_hr'],
            attrs['std_hr'], attrs['kurtosis_hr'], attrs['skewness_hr'],
            attrs['hr_above_mean_plus_std'], attrs['hr_below_mean_minus_std'],
            attrs['mad_rr'], attrs['mean_rr'], attrs['kurtosis_rr'],
            attrs['skewness_rr'], attrs['rr_above_mean_plus_std'],
            attrs['rr_below_mean_minus_std'], attrs['sdnn'], attrs['rmssd'],
            attrs['sdsd'], attrs['pnn20'], attrs['pnn50'], attrs['sd1'],
            attrs['sd2'], attrs['sd2_sd1_ratio'], attrs['lf'], attrs['hf'],
            attrs['lfhf']
        ]

        return attrs, attrs_as_row
Example No. 28
def rejecttrials(x, thresh=5.0, bipolar=True):
    """Simple function to reject trials from numpy array data

    Parameters
    ----------
    x : ndarray, shape (n_trials, n_time)
        Data as numpy array
    thresh : float, optional, default 5.0
        Threshold in number of median absolute deviations
    bipolar : boolean, optional, default True
        If odd (even) epoch is bad, also remove next even (previous odd) trial

    Returns
    -------
    ndarray, length n_good_trials
        indices of the good (retained) trials

    """

    n_trials, n_times = x.shape
    x_max = mad(x, axis=1)
    x_med = np.median(x_max)
    x_mad = mad(x_max)
    bads = []
    for k in range(n_trials):
        if np.abs(x_max[k] - x_med) > thresh * x_mad:
            bads += [
                k,
            ]
            if bipolar is True:
                if np.mod(k, 2) == 0:
                    bads += [
                        k + 1,
                    ]
                else:
                    bads += [
                        k - 1,
                    ]
        else:
            pass
    goods = np.setdiff1d(range(n_trials), np.unique(bads))
    print('%d Good trials Found' % len(goods))
    return goods
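A quick sketch on synthetic epochs (one trial deliberately corrupted); the function returns the indices of the retained trials.

import numpy as np
from statsmodels.robust.scale import mad

rng = np.random.default_rng(6)
epochs = rng.normal(size=(20, 500))
epochs[3] *= 30.0                 # one very noisy trial; its pair (trial 2) is dropped too
good_idx = rejecttrials(epochs, thresh=5.0, bipolar=True)
print(good_idx)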
Example No. 29
def cut_modified(x,q, use_mad_for_std=True):
    try:
        quantiles_in_sigmas = np.asarray([normal.ppf(qi) for qi in q])
        x_clean = x.dropna()
        mean = np.mean(x_clean)
        std = np.std(x_clean) if not use_mad_for_std else mad(x_clean)
        bins = mean + quantiles_in_sigmas*std
        bins = np.sort(np.append(bins, (x_clean.min()-1E-6, x_clean.max()+1E-6)))
        return pd.cut(x, bins, labels=range(len(bins)-1))
    except ValueError:
        return [np.nan] * len(x)
Example No. 30
 def __init__(self, selection_order, x_data, y_data, sequence_length):
     self._x_data = x_data
     self._y_data = y_data
     self._selection_order = selection_order
     self._length = len(selection_order)
     self._sequence_length = sequence_length
     self._actual_node = self._sequence_length // 2
     self._median = np.median(x_data)
     self._median_absolute_deviation = scale.mad(x_data, c=1)
     self._mean = np.mean(x_data)
     LOGGER.debug('Length: {}'.format(self._length))
Example No. 31
def independent_variable_model_collapse(model, independent_column_name="Frequency", **options):
    """Returns a model with a single set of independent variables. Default is to average values together
    but geometric mean, std, variance, rss, mad and median are options.
    The geometric mean fails for an odd number of negative values."""
    if isinstance(model, pandas.DataFrame):
        model = DataFrame_to_AsciiDataTable(model)
    defaults = {"method": "mean"}
    # load other options from model
    for option, value in model.options.items():
        if not re.search('begin_line|end_line', option):
            defaults[option] = value
    for element in model.elements:
        if model.__dict__[element]:
            if re.search("meta", element, re.IGNORECASE):
                defaults["metadata"] = model.metadata.copy()
            else:
                defaults[element] = model.__dict__[element][:]
    # We need to preserve the frequency column some how
    collapse_options = {}
    for key, value in defaults.items():
        collapse_options[key] = value
    for key, value in options.items():
        collapse_options[key] = value
    unique_independent_variable_list = sorted(list(set(model[independent_column_name])))
    independent_variable_selector = model.column_names.index(independent_column_name)
    out_data = []
    for index, independent_variable in enumerate(unique_independent_variable_list):
        data_row = [x for x in model.data[:] if x[independent_variable_selector] == independent_variable]
        if re.search('mean|av', collapse_options["method"], re.IGNORECASE):
            new_row = np.mean(np.array(data_row), axis=0).tolist()
        elif re.search('median', collapse_options["method"], re.IGNORECASE):
            new_row = np.median(np.array(data_row), axis=0).tolist()
        elif re.search('geometric', collapse_options["method"], re.IGNORECASE):
            new_row = gmean(np.array(data_row), axis=0).tolist()
        elif re.search('st', collapse_options["method"], re.IGNORECASE):
            new_row = np.std(np.array(data_row), axis=0).tolist()
        elif re.search('var', collapse_options["method"], re.IGNORECASE):
            new_row = np.var(np.array(data_row), axis=0, dtype=np.float64).tolist()
        elif re.search('rms', collapse_options["method"], re.IGNORECASE):
            new_row = np.sqrt(np.mean(np.square(np.array(data_row)), axis=0, dtype=np.float64)).tolist()
        elif re.search('rss', collapse_options["method"], re.IGNORECASE):
            new_row = np.sqrt(np.sum(np.square(np.array(data_row)), axis=0, dtype=np.float64)).tolist()
        elif re.search('mad', collapse_options["method"], re.IGNORECASE):
            new_row = mad(np.array(data_row), axis=0).tolist()
        new_row[independent_variable_selector]=independent_variable
        out_data.append(new_row)

    collapse_options["data"] = out_data

    if collapse_options["specific_descriptor"]:
        collapse_options["specific_descriptor"] = collapse_options["method"] + "_" + \
                                                  collapse_options["specific_descriptor"]
    resulting_model = AsciiDataTable(None, **collapse_options)
    return resulting_model
Example No. 32
def correct_counts(counts_file_name):
    """Calculate corrected reads depth values over bins

    Args:
        counts_file_name (str): name of tab delimited file with chromosome, start, stop, GC, counts
                   for bins that have the same size

    Returns:
        data (pandas.DataFrame): data frame with data from input file and corrected counts values in column 'updated'

    """
    if not os.path.isfile(counts_file_name):
        raise IOError("File %s does not exists" % counts_file_name)

    data = pd.read_table(counts_file_name,
                         names=['chrom', 'start', 'stop', 'gc', 'counts'],
                         dtype={
                             'chrom': str,
                             'start': np.int32,
                             'stop': np.int32,
                             'gc': np.int32,
                             'counts': np.int32
                         })

    logging.debug("Input file %s loaded" % counts_file_name)

    if data.empty:
        logging.debug("Input file is empty!")
        return data

    bin_length = data.apply(lambda row: row['stop'] - row['start'], axis=1)
    if len(bin_length.unique()) != 1:
        logging.debug("Warning: Bins of different sizes!")

    median_all = np.median(data['counts'])
    mad_all = mad(data['counts'])

    median_gc_bins = {
    }  # gc count in bin: median over all bins with the same gc count
    for gc in data.gc.unique():
        median_gc_bins[gc] = float(
            np.median(data.loc[data['gc'] == gc]['counts']))

    logging.debug("Median of counts over all bins: %s" % median_all)
    logging.debug("MAD of counts over all bins: %s" % mad_all)
    logging.debug("Calculating corrected reads depth values over bins")

    data['updated'] = data.apply(
        lambda row: row['counts'] * median_all / median_gc_bins[row['gc']]
        if median_gc_bins[row['gc']] != 0 else 1.0,
        axis=1)

    return data
Example No. 33
def qqplot(x, loc='mean', scale='std'):
    if loc == 'mean':
        mu_hat = np.mean(x)
    elif loc == 'median':
        mu_hat = np.median(x)

    if scale == 'std':
        sigma_hat = np.std(x)
    elif scale == 'mad':
        sigma_hat = mad(x)

    sm.qqplot(np.array(x), loc=mu_hat, scale=sigma_hat, line='s')
Example No. 34
def remove_outliers(t, delta, mad_factor=3):
    """
    :param t: an instance of pd.Series
    :param delta: parameter for l1tf function
    """
    filtered_t = l1tf(t, delta)

    diff = t.values - np.asarray(filtered_t).squeeze()
    t = t.copy()
    t[np.abs(diff - np.median(diff)) > mad_factor * mad(diff)] = np.nan

    t = t.fillna(method='ffill').fillna(method='bfill')
    return t
Example No. 35
def remove_outliers(t, delta, mad_factor=3):
    """
    :param t: an instance of pd.Series
    :param delta: parameter for l1tf function
    """
    filtered_t = l1tf(t, delta)

    diff = t.values - np.asarray(filtered_t).squeeze()
    t = t.copy()
    t[np.abs(diff - np.median(diff)) > mad_factor * mad(diff)] = np.nan

    t = t.fillna(method='ffill').fillna(method='bfill')
    return t
Example No. 36
 def _estimate_scale(self, resid):
     """
     Estimates the scale based on the option provided to the fit method.
     """
     if isinstance(self.scale_est, str):
         if self.scale_est.lower() == 'mad':
             return scale.mad(resid)
         if self.scale_est.lower() == 'stand_mad':
             return scale.stand_mad(resid)
     elif isinstance(self.scale_est, scale.HuberScale):
         return scale.hubers_scale(self.df_resid, self.nobs, resid)
     else:
         return scale.scale_est(self, resid)**2
Example No. 37
def cjv(img, seg=None, wmmask=None, gmmask=None, wmlabel='wm', gmlabel='gm'):
    r"""
    Calculate the :abbr:`CJV (coefficient of joint variation)`, a measure
    related to :abbr:`SNR (Signal-to-Noise Ratio)` and
    :abbr:`CNR (Contrast-to-Noise Ratio)` that is presented as a proxy for
    the :abbr:`INU (intensity non-uniformity)` artifact [Ganzetti2016]_.
    Lower is better.

    .. math::

        \text{CJV} = \frac{\sigma_\text{WM} + \sigma_\text{GM}}{|\mu_\text{WM} - \mu_\text{GM}|}.

    :param numpy.ndarray img: the input data
    :param numpy.ndarray wmmask: the white matter mask
    :param numpy.ndarray gmmask: the gray matter mask
    :return: the computed CJV


    """

    if seg is None and (wmmask is None or gmmask is None):
        raise RuntimeError('Masks or segmentation should be provided')

    if seg is not None:
        if isinstance(wmlabel, string_types):
            wmlabel = FSL_FAST_LABELS[wmlabel]
        if isinstance(gmlabel, string_types):
            gmlabel = FSL_FAST_LABELS[gmlabel]

        wmmask = np.zeros_like(seg)
        wmmask[seg == wmlabel] = 1
        gmmask = np.zeros_like(seg)
        gmmask[seg == gmlabel] = 1

    mu_wm = np.median(img[wmmask > .5])
    mu_gm = np.median(img[gmmask > .5])
    sigma_wm = mad(img[wmmask > .5])
    sigma_gm = mad(img[gmmask > .5])
    return float((sigma_wm + sigma_gm) / abs(mu_wm - mu_gm))
Example No. 38
 def _estimate_scale(self, resid):
     """
     Estimates the scale based on the option provided to the fit method.
     """
     if isinstance(self.scale_est, str):
         if self.scale_est.lower() == 'mad':
             return scale.mad(resid)
         if self.scale_est.lower() == 'stand_mad':
             return scale.stand_mad(resid)
     elif isinstance(self.scale_est, scale.HuberScale):
         return self.scale_est(self.df_resid, self.nobs, resid)
     else:
         return scale.scale_est(self, resid)**2
Example No. 39
File: tf.py  Project: bugra/l1
def strip_outliers(original_signal, delta, mad_coef=3):
    """
    Based on l1 trend filtering, this function provides an endpoint
    """
    filtered_t = l1(original_signal, delta)

    diff = original_signal - filtered_t.squeeze()
    median_of_difference = np.median(diff)
    mad_of_difference = mad(diff)
    filtered_signal = original_signal.copy()
    threshold = mad_coef * mad_of_difference
    filtered_signal[np.abs(diff - median_of_difference) > threshold] = np.nan
    #filtered_signal = pd.Series(filtered_signal).fillna(method='ffill').fillna(method='bfill')

    return filtered_signal
Example No. 40
def measure_one_background(image, center, aperRad, metric, apMethod='exact', bgMethod='circle'):
    """Estimate the background level of `image` outside a circular aperture (or inside an annulus).

    Args:
        image: 2D image array.
        center: (x, y) centre of the aperture.
        aperRad: single radius (exclude a disc) or a pair (innerRad, outerRad) defining an annulus.
        metric: callable applied to the selected background pixels (e.g. np.median).
        apMethod: aperture-mask method passed to photutils.
        bgMethod: 'circle' (default), 'median' (sigma-clip around the median) or 'kde' (KDE mode).

    Returns:
        The background estimate as a scalar.

    """
    
    if np.ndim(aperRad) == 0:
        aperture  = CircularAperture(center, aperRad)
        aperture  = aperture.to_mask(method=apMethod)[0]    # list of ApertureMask objects (one for each position)
        aperture  = ~aperture.to_image(image).astype(bool) # inverse to keep 'outside' aperture
    else:
        innerRad, outerRad = aperRad
        
        innerAperture   = CircularAperture(center, innerRad)
        outerAperture   = CircularAperture(center, outerRad)
        
        inner_aper_mask = innerAperture.to_mask(method=apMethod)[0]
        inner_aper_mask = inner_aper_mask.to_image(image.shape).astype(bool)
    
        outer_aper_mask = outerAperture.to_mask(method=apMethod)[0]
        outer_aper_mask = outer_aper_mask.to_image(image.shape).astype(bool)     
        
        aperture        = (~inner_aper_mask)*outer_aper_mask
    
    if bgMethod == 'median':
        medFrame  = median(image[aperture])
        madFrame  = scale.mad(image[aperture])
        
        medianMask = abs(image - medFrame) < nSig*madFrame  # nSig is assumed to be defined at module level
        
        aperture  = medianMask*aperture
    
    if bgMethod == 'kde':
        kdeFrame = kde.KDEUnivariate(image[aperture].ravel())
        kdeFrame.fit()
        
        return kdeFrame.support[kdeFrame.density.argmax()]
    
    return metric(image[aperture])
Example No. 41
def normalize_data(data, out_file, mad=False,
                   mad_file=os.path.join('tables', 'full_mad_genes.tsv'),
                   output=True,
                   method='minmax'):
    """
    Filters unidentified genes and normalizes each input gene expression matrix

    Arguments:
    :param data: pandas DataFrame genes as rows and sample IDs as columns
    :param out_file: the file name to write normalized matrix
    :param mad: boolean indicating if MAD genes should be output to file
    :param mad_file: the file name to write mad genes
    :param method: the type of scaling to perform (defaults to minmax)

    Output:
    Writes normalized matrix (if output=True) and mad genes to file
    (if mad=True); returns the normalized matrix if output=False
    """

    # Drop all row names with unidentified gene
    data = data[~data.index.str.contains('?', regex=False)]

    # Sort data by gene name
    data = data.sort_index()

    # Zero-one normalize
    if method == 'minmax':
        min_max_scaler = preprocessing.MinMaxScaler()
        data_normalize = min_max_scaler.fit_transform(data.T)

    elif method == 'zscore':
        data_normalize = preprocessing.scale(data.T, axis=0)

    data_normalize = pd.DataFrame(data_normalize, index=data.columns,
                                  columns=data.index).T
    # Write to file
    if output:
        data_normalize.to_csv(out_file, sep='\t', header=True, index=True)
    else:
        return data_normalize

    # Write out MAD genes
    if mad:
        all_mad_genes = scale.mad(data_normalize, c=1, axis=1)
        all_mad_genes = pd.Series(all_mad_genes,
                                  index=data_normalize.index.values)
        all_mad_genes = all_mad_genes.sort_values(ascending=False)
        all_mad_genes.to_csv(mad_file, sep='\t', header=False)
Example No. 42
def calc_robust_median_diff(in4d):
    """Calculates the robust median fo slice to slice diffs"""
    img = ni.load(in4d)
    dat = img.get_data()
    shape = dat.shape
    tdat = dat.T
    tdat.shape = (shape[-1], np.prod(shape[:-1]))
    dat_diff = tdat[1:,:] - tdat[:-1,:]
    mad = scale.mad(dat_diff, axis=1)
    mad_std = (mad - mad.mean())/ mad.std()
    plt.plot(mad_std, 'ro-')
    plt.title('Robust Frame difference median')
    plt.grid()
    outfile = fname_presuffix(in4d, prefix='Robust_framediff_median',
                              suffix = '.png', use_ext=False)
    plt.savefig(outfile)
    print('Saved ', outfile)
    plt.close()
Example No. 43
 def test_mad(self):
     m = scale.mad(self.X)
     assert_equal(m.shape, (10,))
Example No. 44
        if spline[0] == ul:

            rt_avg.append(int(spline[5]))

            if (len(rt_avg)) > 0:
                average = np.mean(rt_avg)
                corr_factor = int(spline[3]) / average
                break


    ################
    # remove outliers
    ################

    rt_mad = mad(retweet_counts)
    rt_median = np.median(retweet_counts)
    rt_top_thresh = round((rt_median + (50 * rt_mad)), 2)  # 50 is just an arbitrary multiplier we chose

    print("MAD for rt is " + str(rt_mad))
    print("Median for rt is " + str(rt_median))
    print("Top threshold for rt is " + str(rt_top_thresh))

    rt_outliers = []
    retweets = []
    index = []

    for i, rc in enumerate(retweet_counts):

        if rc <= rt_top_thresh:
            retweets.append(rc)
Example No. 45
    def plot_histogram_anew(self):

        categories = self.create_category_lists_anew()
        val = categories[0]
        arou = categories[1]
        dom = categories[2]

        print ()
        print ("Max value for valence is "+str(max(val)))
        print ("Max value for arousal is "+str(max(arou)))
        print ("Max value for dominance is "+str(max(dom)))
        print ()

        ###############
        # Valence: get median absolute deviation to remove outliers
        ###############

        val_mad = mad(val)
        val_median = np.median(val)
        val_top_thresh = round((val_median + (5 * val_mad)),2) # 5 is just an arbitrary number we choose

        print ("MAD for valence is "+str(val_mad))
        print ("Median for valence is "+str(val_median))
        print ("Top threshold for valence is "+str(val_top_thresh))

        val_outliers = []
        valence = []

        for v in val:

            if v <= val_top_thresh:
                valence.append(v)

            else:
                val_outliers.append(v)

        print ("Number of valence outliers is "+str(len(val_outliers)))

        plt.hist(valence,bins=30)
        plt.xlabel("Valence score")
        plt.ylabel("Number of posts")
        plt.show()

        ###############
        # Arousal: get median absolute deviation to remove outliers
        ###############

        arou_mad = mad(arou)
        arou_median = np.median(arou)
        arou_top_thresh = round((arou_median + (5 * arou_mad)),2) # 5 is just an arbitrary number we choose

        print ()
        print ("MAD for arousal is "+str(arou_mad))
        print ("Median for arousal is "+str(arou_median))
        print ("Top threshold for arousal is "+str(arou_top_thresh))

        arou_outliers = []
        arousal = []

        for a in arou:

            if a <= arou_top_thresh:
                arousal.append(a)

            else:
                arou_outliers.append(a)

        print ("Number of arousal outliers is "+str(len(arou_outliers)))

        plt.hist(arousal,bins=30)
        plt.xlabel("Arousal score")
        plt.ylabel("Number of posts")
        plt.show()

        ###############
        # Dominance: get median absolute deviation to remove outliers
        ###############

        dom_mad = mad(dom)
        dom_median = np.median(dom)
        dom_top_thresh = round((dom_median + (5 * dom_mad)),2) # 5 is just an arbitrary number we choose

        print ()
        print ("MAD for dominance is "+str(dom_mad))
        print ("Median for dominance is "+str(dom_median))
        print ("Top threshold for dominance is "+str(dom_top_thresh))

        dom_outliers = []
        dominance = []

        for d in dom:

            if d <= dom_top_thresh:
                dominance.append(d)

            else:
                dom_outliers.append(d)

        print ("Number of dominance outliers is "+str(len(dom_outliers)))

        plt.hist(dominance,bins=30)
        plt.xlabel("Dominance score")
        plt.ylabel("Number of posts")
        plt.show()
Example No. 46
 def _prefilter_windows_gtest(self, df):
     ''' This function filters the windows in a data frame by minimum
         expression based on a MAD cutoff and requires higher expression
         in the experiment libs than in the controls
     '''
     # remove windows where not all experiment libs show expression:
     #   expression = 1/size_factor ( = pseudocount)
     print("Removing windows where not all experiment libs show "
           "expression from DataFrame with {} rows...".format(len(df)),
           flush=True)
     t_start = time()
     for exp_lib in self._exp_lib_list:
         exp_lib_zero_count = 1/self._size_factors[exp_lib]
         df = df.loc[(df.loc[:, exp_lib] > exp_lib_zero_count), :]
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     if df.empty:
         return df
     # minimum expression cutoff based on mean over experiment libraries
     print("Removing windows based on mad cutoff from DataFrame "
           "with {} rows...".format(len(df)), flush=True)
     t_start = time()
     median_abs_dev_from_zero = mad(df.loc[:, self._exp_lib_list].mean(
         axis=1), center=0.0)
     min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
     print("Minimal window expression based on mean over RIP/CLIP "
           "libraries: {} (MAD from zero: {})".format(
               min_expr, median_abs_dev_from_zero), flush=True)
     df = df.loc[df.loc[:, self._exp_lib_list].mean(axis=1) >= min_expr, :]
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     if df.empty:
         return df
     print("Removing windows where experiment expression is lower than "
           "control expression from DataFrame with {} rows...".format(
               len(df)), flush=True)
     t_start = time()
     if self._pairwise_replicates:
         # experiment expression must be larger than respective control
         # for each library pair
         for exp_lib, ctr_lib in zip(
                 self._exp_lib_list, self._ctr_lib_list):
             df = df.loc[(df.loc[:, exp_lib] > df.loc[:, ctr_lib]), :]
     else:
         # minimum experiment expression larger than maximum
         # control expression
         df = df.loc[df.loc[:, self._exp_lib_list].min(
             axis=1) > df.loc[:, self._ctr_lib_list].max(axis=1), :]
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     if df.empty:
         return df
     # minimum fold change
     print("Removing windows based on minimum fold change from DataFrame "
           "with {} rows...".format(len(df)), flush=True)
     t_start = time()
     df = df.query('fold_change >= @self._fc_cutoff')
     t_end = time()
     print("Removal took {} seconds. DataFrame contains now {} rows.".
           format((t_end-t_start), len(df)), flush=True)
     return df
Example No. 47
File: lime.py  Project: jklynch/lime
    def do(self):
        print('testing SNP {} {}'.format(self.snp_with_rsq_df.GENE[0], self.snp_with_rsq_df.ID[0]))

        if self.permutation_method == 'no_permutation':
            y_labels = self.aligned_snp_df.values.flatten()
        elif self.permutation_method == 'uniform_permutation':
            y_labels = self.uniform_snp_permutation()
        elif self.permutation_method == 'group_permutation':
            y_labels = self.group_snp_permutation()
        else:
            raise Exception('unknown permutation_method {}'.format(self.permutation_method))

        self.cv_score_list = self.score_cv(y_labels)
        if len(self.cv_score_list) < self.cv_count:
            print('{} {} {} {} has fewer scores than expected: {}'.format(
                self.snp_with_rsq_df.CHROM[0],
                self.snp_with_rsq_df.POS[0],
                self.snp_with_rsq_df.GENE[0],
                self.snp_with_rsq_df.ID[0],
                len(self.cv_score_list)
            ))
        validation_score_array = np.asarray(self.cv_score_list)

        (rsq_mean_pibs95ci_lo, rsq_mean_pibs95ci_hi) = bootstrap_ci_lo_hi(
            validation_score_array, alpha=0.05, method='pi'
        )
        (rsq_mean_pibs99ci_lo, rsq_mean_pibs99ci_hi) = bootstrap_ci_lo_hi(
            validation_score_array, alpha=0.01, method='pi'
        )

        rsq_median = np.median(validation_score_array)
        (rsq_median_pibs95ci_lo, rsq_median_pibs95ci_hi) = bootstrap_ci_lo_hi(
            validation_score_array,
            alpha=0.05,
            statistic=np.median,
            method='pi'
        )
        (rsq_median_pibs99ci_lo, rsq_median_pibs99ci_hi) = bootstrap_ci_lo_hi(
            validation_score_array,
            alpha=0.01,
            statistic=np.median,
            method='pi'
        )

        rsq_mean = np.mean(validation_score_array)

        self.snp_with_rsq_df.loc[0, 'rsq_mean'] = rsq_mean
        self.snp_with_rsq_df.loc[0, 'rsq_std'] = np.std(validation_score_array)
        self.snp_with_rsq_df.loc[0, 'rsq_sem'] = scipy.stats.sem(validation_score_array)
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_mean_95ci_lo'] = rsq_mean_pibs95ci_lo
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_mean_95ci_hi'] = rsq_mean_pibs95ci_hi
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_mean_99ci_lo'] = rsq_mean_pibs99ci_lo
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_mean_99ci_hi'] = rsq_mean_pibs99ci_hi
        self.snp_with_rsq_df.loc[0, 'rsq_median'] = rsq_median
        self.snp_with_rsq_df.loc[0, 'rsq_mad'] = mad(validation_score_array)
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_median_95ci_lo'] = rsq_median_pibs95ci_lo
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_median_95ci_hi'] = rsq_median_pibs95ci_hi
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_median_99ci_lo'] = rsq_median_pibs99ci_lo
        self.snp_with_rsq_df.loc[0, 'rsq_pibsp_median_99ci_hi'] = rsq_median_pibs99ci_hi
        self.snp_with_rsq_df.loc[0, 'cv_skewness'] = scipy.stats.skew(validation_score_array)
        self.snp_with_rsq_df.loc[0, 'cv_kurtosis'] = scipy.stats.kurtosis(validation_score_array)
        print('{} {} rsq_mean 95% (pi)  : {:6.4f} <-- {:6.4f} --> {:6.4f}'.format(
            self.snp_with_rsq_df.GENE[0], self.snp_with_rsq_df.ID[0],
            rsq_mean_pibs95ci_lo, rsq_mean, rsq_mean_pibs95ci_hi
        ))
        print('{} {} rsq_median 95% (pi): {:6.4f} <-- {:6.4f} --> {:6.4f}'.format(
            self.snp_with_rsq_df.GENE[0], self.snp_with_rsq_df.ID[0],
            rsq_median_pibs95ci_lo, rsq_median, rsq_median_pibs95ci_hi
        ))
        print('{} {} rsq_mean 99% (pi)  : {:6.4f} <-- {:6.4f} --> {:6.4f}'.format(
            self.snp_with_rsq_df.GENE[0], self.snp_with_rsq_df.ID[0],
            rsq_mean_pibs99ci_lo, rsq_mean, rsq_mean_pibs99ci_hi
        ))
        print('{} {} rsq_median 99% (pi): {:6.4f} <-- {:6.4f} --> {:6.4f}'.format(
            self.snp_with_rsq_df.GENE[0], self.snp_with_rsq_df.ID[0],
            rsq_median_pibs99ci_lo, rsq_median, rsq_median_pibs99ci_hi
        ))
Example No. 48
def fs_mad(x, y):
    """    
    Get the median absolute deviation (MAD) for each column of x
    """
    scores = mad(x) 
    return scores, np.array([np.NaN]*len(scores))
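A brief sketch of using fs_mad as a feature scorer on a hypothetical feature matrix; it returns one score per column (and NaN placeholders where p-values would go), so its output shape matches scikit-learn-style score functions.

import numpy as np
from statsmodels.robust.scale import mad

rng = np.random.default_rng(7)
X = rng.normal(size=(100, 5)) * np.array([0.1, 1.0, 5.0, 0.5, 2.0])

scores, pvals = fs_mad(X, None)       # y is unused by this scorer
print(np.argsort(scores)[::-1])       # columns ranked by robust spread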
Example No. 49
 def test_axis0(self):
     m = scale.mad(self.X, axis=0)
     assert_equal(m.shape, (10,30))
Example No. 50
 def test_mad(self):
     assert_almost_equal(scale.mad(self.chem), 0.52632, DECIMAL)
Example No. 51
 def test_axis1(self):
     m = scale.mad(self.X, axis=1)
     assert_equal(m.shape, (40,30))
Example No. 52
    def _run_interface(self, runtime):
        from scipy import ndimage as sim

        fmap_nii = nb.load(self.inputs.in_file)
        data = np.squeeze(fmap_nii.get_data().astype(np.float32))

        # Despike / denoise (no-mask)
        if self.inputs.despike:
            data = _despike2d(data, self.inputs.despike_threshold)

        mask = None
        if isdefined(self.inputs.in_mask):
            masknii = nb.load(self.inputs.in_mask)
            mask = masknii.get_data().astype(np.uint8)

            # Erode mask
            if self.inputs.mask_erode > 0:
                struc = sim.iterate_structure(sim.generate_binary_structure(3, 2), 1)
                mask = sim.binary_erosion(
                    mask, struc,
                    iterations=self.inputs.mask_erode
                    ).astype(np.uint8)  # pylint: disable=no-member

        self._results['out_file'] = genfname(self.inputs.in_file, suffix='enh')
        datanii = nb.Nifti1Image(data, fmap_nii.affine, fmap_nii.header)

        if self.inputs.unwrap:
            data = _unwrap(data, self.inputs.in_magnitude, mask)
            self._results['out_unwrapped'] = genfname(self.inputs.in_file, suffix='unwrap')
            nb.Nifti1Image(data, fmap_nii.affine, fmap_nii.header).to_filename(
                self._results['out_unwrapped'])

        if not self.inputs.bspline_smooth:
            datanii.to_filename(self._results['out_file'])
            return runtime
        else:
            from fmriprep.utils import bspline as fbsp
            from statsmodels.robust.scale import mad

            # Fit BSplines (coarse)
            bspobj = fbsp.BSplineFieldmap(datanii, weights=mask,
                                          njobs=self.inputs.njobs)
            bspobj.fit()
            smoothed1 = bspobj.get_smoothed()

            # Manipulate the difference map
            diffmap = data - smoothed1.get_data()
            sderror = mad(diffmap[mask > 0])
            LOGGER.info('SD of error after B-Spline fitting is %f', sderror)
            errormask = np.zeros_like(diffmap)
            errormask[np.abs(diffmap) > (10 * sderror)] = 1
            errormask *= mask

            nslices = 0
            try:
                errorslice = np.squeeze(np.argwhere(errormask.sum(0).sum(0) > 0))
                nslices = errorslice[-1] - errorslice[0]
            except IndexError:  # mask is empty, do not refine
                pass

            if nslices > 1:
                diffmapmsk = mask[..., errorslice[0]:errorslice[-1]]
                diffmapnii = nb.Nifti1Image(
                    diffmap[..., errorslice[0]:errorslice[-1]] * diffmapmsk,
                    datanii.affine, datanii.header)

                bspobj2 = fbsp.BSplineFieldmap(diffmapnii, knots_zooms=[24., 24., 4.],
                                               njobs=self.inputs.njobs)
                bspobj2.fit()
                smoothed2 = bspobj2.get_smoothed().get_data()

                final = smoothed1.get_data().copy()
                final[..., errorslice[0]:errorslice[-1]] += smoothed2
            else:
                final = smoothed1.get_data()

            nb.Nifti1Image(final, datanii.affine, datanii.header).to_filename(
                self._results['out_file'])

        return runtime
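The refinement step above uses the MAD of the residual between the fieldmap and the coarse B-spline fit as a robust spread estimate, and flags slices where the residual exceeds ten times that spread. A toy sketch of just that thresholding logic, on synthetic arrays, might look like this:

# Toy sketch of the 10 x MAD error mask; arrays here are synthetic stand-ins.
import numpy as np
from statsmodels.robust.scale import mad

diffmap = np.random.randn(64, 64, 30)            # residual map (synthetic)
diffmap[30:34, 30:34, 12] += 50.0                 # a patch of large fitting error
mask = np.ones_like(diffmap, dtype=np.uint8)      # brain mask (synthetic, all ones)

sderror = mad(diffmap[mask > 0])                  # robust estimate of the residual spread
errormask = (np.abs(diffmap) > 10 * sderror).astype(np.uint8) * mask
errorslice = np.squeeze(np.argwhere(errormask.sum(0).sum(0) > 0))
# slices listed in `errorslice` would receive the finer, second B-spline fit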
Ejemplo n.º 53
0
rc("font", **font)

fig_hist = plt.figure(figsize=(18, 12))

colours = ["#AE70ED", "#FFB60B", "#62A9FF", "#59DF00"]

##Sometimes if the table contains single sources, there is no SI fit, so the SI column contains NaNs,
##which atpy reads as '--', so we need to skip these
sources_SIs = [source for source in sources if source.SI != "--"]
SIs = [float(source.SI) for source in sources_SIs]

##Plot all of the SIs together
##-----------------------------------------------------------------------------------------------------------------------
ax1 = fig_hist.add_subplot(221)
plot_by_kde(ax1, SIs, "k", 3.0, "All fits (%d sources)" % len(SIs), "-")
mad_all = mad(np.array(SIs))
med_all = np.median(np.array(SIs))
ax1.axvline(med_all, color="k", linestyle="--", linewidth=2.0, label="Median %.2f$\\pm$%.2f" % (med_all, mad_all))

##Compare the good fits to the bad fits
##-----------------------------------------------------------------------------------------------------------------------
ax2 = fig_hist.add_subplot(222)
good_fit_SIs = [float(source.SI) for source in sources_SIs if float(source.low_resid) == 0]
bad_fit_SIs = [float(source.SI) for source in sources_SIs if float(source.low_resid) == 1]

plot_by_kde(ax2, good_fit_SIs, colours[0], 3.0, "$\\chi^2_{red}\\leq2.0$\n(%d sources)" % len(good_fit_SIs), "-")
plot_by_kde(ax2, bad_fit_SIs, colours[3], 3.0, "$\\chi^2_{red}>2.0$\n(%d sources)" % len(bad_fit_SIs), "--")

##Compare the matches with just one matched frequency to the base catalogue, to those
# with multiple frequencies matched
##-----------------------------------------------------------------------------------------------------------------------
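The helper plot_by_kde is not defined in this snippet. A minimal sketch of what such a helper plausibly does (draw a Gaussian KDE curve of the values on the given axes) is given below; treating the fourth argument as a line width is an assumption, not the original implementation.

# Hedged sketch of the missing helper; not the original implementation.
import numpy as np
from scipy.stats import gaussian_kde

def plot_by_kde(ax, values, colour, linewidth, label, linestyle):
    values = np.asarray(values, dtype=float)
    kde = gaussian_kde(values)                          # Gaussian kernel density estimate
    grid = np.linspace(values.min(), values.max(), 200)
    ax.plot(grid, kde(grid), color=colour, linewidth=linewidth,
            label=label, linestyle=linestyle)
    ax.legend(loc="best")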
Ejemplo n.º 54
0
 def test_axis2(self):
     m = scale.mad(self.X, axis=2)
     assert_equal(m.shape, (40, 10))
Ejemplo n.º 55
0
 def test_axisneg1(self):
     m = scale.mad(self.X, axis=-1)
     assert_equal(m.shape, (40, 10))
Ejemplo n.º 56
0
def detect_anoms(data, k=0.49, alpha=0.05, num_obs_per_period=None,
                 use_decomp=True, one_tail=True,
                 upper_tail=True, verbose=False):
    """
    # Detects anomalies in a time series using S-H-ESD.
    #
    # Args:
    #	 data: Time series to perform anomaly detection on.
    #	 k: Maximum number of anomalies that S-H-ESD will detect as a percentage of the data.
    #	 alpha: The level of statistical significance with which to accept or reject anomalies.
    #	 num_obs_per_period: Defines the number of observations in a single period, and used during seasonal decomposition.
    #	 use_decomp: Use seasonal decomposition during anomaly detection.
    #	 one_tail: If TRUE only positive or negative going anomalies are detected depending on if upper_tail is TRUE or FALSE.
    #	 upper_tail: If TRUE and one_tail is also TRUE, detect only positive going (right-tailed) anomalies. If FALSE and one_tail is TRUE, only detect negative (left-tailed) anomalies.
    #	 verbose: Additionally printing for debugging.
    # Returns:
    #   A dictionary containing the anomalies (anoms) and decomposition components (stl).
    """
    if num_obs_per_period is None:
        raise ValueError("must supply period length for time series decomposition")

    if list(data.columns.values) != ["timestamp", "value"]:
        data.columns = ["timestamp", "value"]

    num_obs = len(data)

    # Check to make sure we have at least two periods worth of data for anomaly context
    if num_obs < num_obs_per_period * 2:
        raise ValueError("Anom detection needs at least 2 periods worth of data")

    # Check if our timestamps are posix
    posix_timestamp = data.dtypes[0].type is np.datetime64

    # run length encode result of isnull, check for internal nulls
    if (len(list(map(lambda x: x[0], groupby(ps.isnull(
            ps.concat([ps.Series([np.nan]),
                       data.value,
                       ps.Series([np.nan])])))))) > 3):
        raise ValueError("Data contains non-leading NAs. We suggest replacing NAs with interpolated values (see na.approx in Zoo package).")
    else:
        data = data.dropna()

    # -- Step 1: Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.

    data = data.set_index('timestamp')

    if not isinstance(data.index, ps.Int64Index):
        resample_period = {
            1440: 'T',
            24: 'H',
            7: 'D'
        }
        resample_period = resample_period.get(num_obs_per_period)
        if not resample_period:
            raise ValueError('Unsupported period length: %d' % num_obs_per_period)
        data = data.resample(resample_period).mean()


    decomp = stl(data.value, "periodic", np=num_obs_per_period)

    # Remove the seasonal component, and the median of the data to create the univariate remainder
    d = {
        'timestamp': data.index,
        'value': data.value - decomp['seasonal'] - data.value.median()
    }
    data = ps.DataFrame(d)

    p = {
        'timestamp': decomp.index,
        'value': ps.to_numeric((decomp['trend'] + decomp['seasonal']).truncate(), errors='coerce')
    }
    data_decomp = ps.DataFrame(p)

    # Maximum number of outliers that S-H-ESD can detect (e.g. 49% of data)
    max_outliers = int(num_obs * k)

    if max_outliers == 0:
        raise ValueError("With longterm=TRUE, AnomalyDetection splits the data into 2 week periods by default. You have %d observations in a period, which is too few. Set a higher piecewise_median_period_weeks." % num_obs)

    ## Define values and vectors.
    n = len(data.timestamp)
    R_idx = list(range(max_outliers))  # list, so indices can be assigned in the loop below

    num_anoms = 0

    # Compute test statistic until r=max_outliers values have been
    # removed from the sample.
    for i in range(1, max_outliers + 1):
        if one_tail:
            if upper_tail:
                ares = data.value - data.value.median()
            else:
                ares = data.value.median() - data.value
        else:
            ares = (data.value - data.value.median()).abs()

        # protect against constant time series
        data_sigma = mad(data.value)
        if data_sigma == 0:
            break

        ares = ares / float(data_sigma)

        R = ares.max()

        temp_max_idx = ares[ares == R].index.tolist()[0]

        R_idx[i - 1] = temp_max_idx

        data = data[data.index != R_idx[i - 1]]

        if one_tail:
            p = 1 - alpha / float(n - i + 1)
        else:
            p = 1 - alpha / float(2 * (n - i + 1))

        t = student_t.ppf(p, (n - i - 1))
        lam = t * (n - i) / float(sqrt((n - i - 1 + t**2) * (n - i + 1)))

        if R > lam:
            num_anoms = i

    if num_anoms > 0:
        R_idx = R_idx[:num_anoms]
    else:
        R_idx = None

    return {
        'anoms': R_idx,
        'stl': data_decomp
    }
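In the loop above, each scaled extreme deviate R is compared against the generalized ESD critical value lam = t * (n - i) / sqrt((n - i - 1 + t**2) * (n - i + 1)), where t is the Student-t quantile at level p. A hedged usage sketch follows; it assumes the enclosing module's helpers (stl, student_t, mad, etc.) are importable, and the column names, period, and numbers are illustrative only.

# Illustrative call only; requires the module that defines detect_anoms and its
# helpers (stl, student_t, mad). Data are synthetic hourly observations.
import numpy as np
import pandas as ps

ts = ps.date_range('2024-01-01', periods=24 * 14, freq='H')   # two weeks, hourly
values = np.sin(np.arange(len(ts)) * 2 * np.pi / 24) + np.random.normal(0, 0.1, len(ts))
values[100] += 5.0                                            # inject an obvious anomaly
df = ps.DataFrame({'timestamp': ts, 'value': values})

result = detect_anoms(df, k=0.02, alpha=0.05, num_obs_per_period=24)
print(result['anoms'])   # flagged indices, or None if nothing exceeded the critical value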
Ejemplo n.º 57
0
 def test_mad_center(self):
     n = scale.mad(self.X, center=0)
     assert_equal(n.shape, (10,))