def unimodality_dip_test(path: str, plot_show=False) -> bool:
    '''
    Run Hartigan's dip test on an image's grayscale intensity distribution
    and return True if it is unimodally distributed.

    References:
    http://www.nicprice.net/diptest/Hartigan_1985_AnnalStat.pdf
    https://github.com/BenjaminDoran/unidip

    @path: image path
    @plot_show: whether to plot the histogram
    '''
    img = cv2.imread(path, 0)  # read as grayscale
    img_array = img.ravel()  # diptst expects a 1-d array
    data = np.sort(img_array)  # np.msort is deprecated; sort ascending instead
    # p-value of the dip test (null hypothesis: the data are unimodal)
    uni_prob = dip.diptst(data)[1]
    if uni_prob > 0.5:
        # print(f'This image is unimodally distributed (p-value {uni_prob*100:.2f} %)')
        unimodality = True
    else:
        # print(f'This image is at least bimodally distributed (p-value {(1-uni_prob)*100:.2f} %)')
        unimodality = False
    if plot_show:
        plt.figure()
        # sns.distplot is deprecated; histplot draws the histogram with a KDE overlay
        sns.histplot(img.ravel(), bins=256, kde=True)
        plt.title('Histogram of the image')
        plt.show()
    return unimodality
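# A minimal usage sketch for unimodality_dip_test, assuming cv2, numpy as np,
# matplotlib.pyplot as plt, seaborn as sns, and unidip.dip as dip are imported,
# and that 'sample.png' is a hypothetical image file on disk:
if unimodality_dip_test('sample.png', plot_show=True):
    print('unimodal histogram')
else:
    print('multimodal histogram')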
def test_dip(data, alpha=0.05, verbose=True) -> bool:
    import unidip.dip as dip
    # sort data
    data = np.sort(data)  # np.msort is deprecated
    # test
    stat, p, _ = dip.diptst(data)
    # display
    if verbose:
        print('stat=%.3f, p=%.3f' % (stat, p))
        if p > alpha:  # use the alpha parameter instead of a hard-coded 0.05
            print('Probably unimodal')
        else:
            print('Probably not unimodal.')
    # the signature promises a bool: True if unimodality is not rejected
    return p > alpha
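# test_dip can be exercised on synthetic samples; a brief sketch, assuming
# numpy as np is imported:
rng = np.random.default_rng(42)
unimodal = rng.normal(0, 1, 500)
bimodal = np.concatenate([rng.normal(-3, 1, 250), rng.normal(3, 1, 250)])
test_dip(unimodal)  # expected: 'Probably unimodal'
test_dip(bimodal)   # expected: 'Probably not unimodal.'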
def _get_full_interval(self, mod_int):
    """ Expands discovered intervals

        When looking at unimodal data, the dip test tends to return a very
        narrow interval, which can lead to conflicts later. This tends to
        happen after recursing left or right. Our solution, taken from the
        original unidip, is to mirror the data so that it is bimodal. We are
        then able to fully capture the mode, and return the full mode from
        the original data.
    """
    dat = self.dat[mod_int[0]:mod_int[1]]
    ldat = self._mirror_data(dat, left=True)
    ldip = diptst(ldat, self.is_hist, self.ntrials)
    rdat = self._mirror_data(dat, left=False)
    rdip = diptst(rdat, self.is_hist, self.ntrials)
    # keep whichever mirroring produced the stronger dip statistic
    if ldip[0] > rdip[0]:
        full_indxs = self._un_mirror_idxs(ldip[2], len(dat), mod_int, True)
    else:
        full_indxs = self._un_mirror_idxs(rdip[2], len(dat), mod_int, False)
    return tuple(full_indxs)
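# The mirroring trick used by _get_full_interval can be illustrated standalone:
# reflecting a sorted unimodal sample about its left edge yields a bimodal
# sample whose dip interval spans the full original mode. A minimal sketch,
# assuming the unidip package is installed; mirror_left is a hypothetical
# helper written here for illustration, not the library's _mirror_data:
import numpy as np
from unidip.dip import diptst

def mirror_left(dat):
    # reflect sorted data about its left edge, turning one mode into two
    return np.concatenate((2 * dat[0] - dat[::-1], dat[1:]))

rng = np.random.default_rng(0)
dat = np.sort(rng.normal(0, 1, 500))   # unimodal sample
mirrored = np.sort(mirror_left(dat))
_, pval, modidx = diptst(mirrored)
print(pval, modidx)  # low p-value: the mirrored sample is bimodal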
def dip_test(properties, cluster_members, feature=3):
    data = []
    for ii in range(cluster_members):
        data.append(float(properties[ii][feature]))
    data = np.array(data)
    data = np.sort(data)  # np.msort is deprecated
    intervals = dip.diptst(data)
    t_range = np.linspace(0, 0.15, 200)
    kde = gaussian_kde(data)
    plt.plot(t_range, kde(t_range) / 100)
    plt.grid()
    plt.title('Pick PDF' + '\n p_value = ' + str(intervals[1]))
    plt.show()
def dip_test(properties, cluster_members, feature=3):
    data = []
    outliers = [u'121010', u'091110', u'091109', u'091106']
    for ii in range(cluster_members):
        if not properties[ii][0] in outliers:
            data.append(float(properties[ii][feature]))
    data = np.array(data)
    data = np.sort(data)  # np.msort is deprecated
    intervals = dip.diptst(data)
    t_range = np.linspace(0, 0.2, 200)
    kde = gaussian_kde(data)
    plt.plot(t_range, kde(t_range) / 100)
    plt.grid()
    plt.title('Distance PDF' + '\n p_value = ' + str(intervals[1]))
    plt.show()
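# Both dip_test variants expect properties as a sequence of records whose
# first field is an identifier and whose remaining fields are numeric strings.
# A hypothetical call, assuming numpy as np, matplotlib.pyplot as plt,
# scipy.stats.gaussian_kde, and unidip.dip as dip are imported; the record
# layout below is made up for illustration:
properties = [('091201', '1.0', '2.0', str(d))
              for d in np.random.uniform(0.01, 0.18, 300)]
dip_test(properties, cluster_members=len(properties), feature=3)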
def MDplot(Data, Names=None, Ordering='Default', Scaling=None,
           Fill='darkblue', RobustGaussian=True, GaussianColor='magenta',
           Gaussian_lwd=1.5, BoxPlot=False, BoxColor='darkred',
           MDscaling='width', LineColor='black', LineSize=0.01,
           QuantityThreshold=40, UniqueValuesThreshold=12,
           SampleSize=500000, SizeOfJitteredPoints=1, OnlyPlotOutput=True,
           ValueColumn=None, ClassColumn=None):
    """
    Plots a mirrored density plot for each numeric column

    Args:
        Data (dataframe): dataframe containing data. Each column is one
            variable (wide table format; for long table format see
            ValueColumn and ClassColumn)
        Names (list): list of column names (used if Data is not a dataframe)
        Ordering (str): 'Default', 'Columnwise', 'Alphabetical' or 'Statistics'
        Scaling (str): scaling method, one of: Percentalize, CompleteRobust,
            Robust, Log
        Fill (str): color of the MD-Plot
        RobustGaussian (bool): draw a Gaussian distribution if a column is
            Gaussian
        GaussianColor (str): color for the Gaussian distribution
        Gaussian_lwd (float): line width of the Gaussian distribution
        BoxPlot (bool): draw a box plot
        BoxColor (str): color for box plots
        MDscaling (str): scale of the ggplot violin
        LineColor (str): line color of the ggplot violin
        LineSize (float): line width of the ggplot violin
        QuantityThreshold (int): minimal number of rows
        UniqueValuesThreshold (int): minimal number of unique values per column
        SampleSize (int): number of samples used if the number of rows is
            larger than SampleSize
        SizeOfJitteredPoints (float): point size of the jitter-plot fallback
        OnlyPlotOutput (bool): if True, return only the ggplot object; if
            False, return a dictionary containing the ggplot object and
            additional information
        ValueColumn (str): name of the column of values to be plotted
            (data in long table format)
        ClassColumn (str): name of the column with class identifiers for the
            value column (data in long table format)

    Returns:
        ggplot object, or dictionary containing the ggplot object and
        additional information
    """
    if not isinstance(Data, pd.DataFrame):
        try:
            if Names is not None:
                Data = pd.DataFrame(Data, columns=Names)
            else:
                Data = pd.DataFrame(Data)
                lstCols = list(Data.columns)
                dctCols = {}
                for strCol in lstCols:
                    dctCols[strCol] = "C_" + str(strCol)
                Data = Data.rename(columns=dctCols)
        except:
            raise Exception("Data cannot be converted into pandas dataframe")
    else:
        Data = Data.reset_index(drop=True)

    if ValueColumn is not None and ClassColumn is not None:
        lstCols = list(Data.columns)
        if ValueColumn not in lstCols:
            raise Exception("ValueColumn not contained in dataframe")
        if ClassColumn not in lstCols:
            raise Exception("ClassColumn not contained in dataframe")
        lstClasses = list(Data[ClassColumn].unique())
        DataWide = pd.DataFrame()
        for strClass in lstClasses:
            if len(DataWide) == 0:
                DataWide = Data[Data[ClassColumn] == strClass].copy()\
                    .reset_index(drop=True)
                DataWide = DataWide.rename(columns={ValueColumn: strClass})
                DataWide = DataWide[[strClass]]
            else:
                dfTemp = Data[Data[ClassColumn] == strClass].copy()\
                    .reset_index(drop=True)
                dfTemp = dfTemp.rename(columns={ValueColumn: strClass})
                dfTemp = dfTemp[[strClass]]
                DataWide = DataWide.join(dfTemp, how='outer')
        Data = DataWide.copy()

    lstCols = list(Data.columns)
    for strCol in lstCols:
        if not is_numeric_dtype(Data[strCol]):
            print("Deleting non numeric column: " + strCol)
            Data = Data.drop([strCol], axis=1)
        else:
            if abs(Data[strCol].sum()) == np.inf:
                print("Deleting infinite column: " + strCol)
                Data = Data.drop([strCol], axis=1)

    Data = Data.rename_axis("index", axis="index")\
        .rename_axis("variable", axis="columns")
    dvariables = Data.shape[1]
    nCases = Data.shape[0]

    if nCases > SampleSize:
        print('Data has more cases than "SampleSize". Drawing a sample for '
              'faster computation. You can omit this by setting '
              '"SampleSize=len(data)".')
        sampledIndex = np.sort(
            np.random.choice(list(Data.index), size=SampleSize,
                             replace=False))
        Data = Data.loc[sampledIndex]

    nPerVar = Data.apply(lambda x: len(x.dropna()))
    nUniquePerVar = Data.apply(lambda x: len(list(x.dropna().unique())))

    # rename purely numeric column names so they are valid identifiers
    lstCols = list(Data.columns)
    dctCols = {}
    for strCol in lstCols:
        try:
            a = float(strCol)
            dctCols[strCol] = "C_" + str(strCol)
        except:
            dctCols[strCol] = str(strCol)
    Data = Data.rename(columns=dctCols)

    if Scaling == "Percentalize":
        Data = Data.apply(lambda x: 100 * (x - x.min()) / (x.max() - x.min()))
    if Scaling == "CompleteRobust":
        Data = robust_normalization(Data, centered=True, capped=True)
    if Scaling == "Robust":
        Data = robust_normalization(Data, centered=False, capped=False)
    if Scaling == "Log":
        Data = signed_log(Data, base="Ten")
        if RobustGaussian == True:
            RobustGaussian = False
            print("log with robust gaussian does not work, because mean and "
                  "variance are not a valid description for log-normal data")

    #______________________________________________Robust Gaussian and Statistics
    if RobustGaussian == True or Ordering == "Statistics":
        Data = Data.applymap(lambda x: np.nan if abs(x) == np.inf else x)

        if nCases < 50:
            warnings.warn("Sample is maybe too small for statistical testing")

        factor = pd.Series([0.25, 0.75]).apply(lambda x: abs(norm.ppf(x)))\
            .sum()
        std = Data.std()
        dfQuartile = Data.apply(
            lambda x: mquantiles(x, [0.25, 0.75], alphap=0.5, betap=0.5))
        dfQuartile = dfQuartile.append(dfQuartile.loc[1] - dfQuartile.loc[0],
                                       ignore_index=True)
        dfQuartile.index = ["low", "hi", "iqr"]
        dfMinMax = Data.apply(
            lambda x: mquantiles(x, [0.001, 0.999], alphap=0.5, betap=0.5))
        dfMinMax.index = ["min", "max"]

        shat = pd.Series()
        mhat = pd.Series()
        nonunimodal = pd.Series()
        skewed = pd.Series()
        bimodalprob = pd.Series()
        isuniformdist = pd.Series()
        nSample = max([10000, nCases])
        normaldist = np.empty((nSample, dvariables))
        normaldist[:] = np.nan
        normaldist = pd.DataFrame(normaldist, columns=lstCols)

        for strCol in lstCols:
            shat[strCol] = min(
                [std[strCol], dfQuartile[strCol].loc["iqr"] / factor])
            mhat[strCol] = trim_mean(Data[strCol].dropna(), 0.1)

            if nCases > 45000 and nPerVar[strCol] > 8:
                # statistical testing does not work with too many cases
                sampledIndex = np.sort(
                    np.random.choice(list(Data.index), size=45000,
                                     replace=False))
                vec = Data[strCol].loc[sampledIndex]
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(vec.dropna(),
                                                     numt=100)[1]
                    skewed[strCol] = skewtest(vec)[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"]
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(vec, "uniform", args)[1]
                    bimodalprob[strCol] = bimodal(vec)["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0
            elif nPerVar[strCol] < 8:
                warnings.warn("Sample of finite values too small to calculate "
                              "agostino.test or dip.test for " + strCol)
                nonunimodal[strCol] = 1
                skewed[strCol] = 1
                isuniformdist[strCol] = 0
                bimodalprob[strCol] = 0
            else:
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(Data[strCol].dropna(),
                                                     numt=100)[1]
                    skewed[strCol] = skewtest(Data[strCol])[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"]
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(Data[strCol], "uniform",
                                                   args)[1]
                    bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0

            if isuniformdist[strCol] < 0.05 and nonunimodal[strCol] > 0.05 \
               and skewed[strCol] > 0.05 and bimodalprob[strCol] < 0.05 \
               and nPerVar[strCol] > QuantityThreshold \
               and nUniquePerVar[strCol] > UniqueValuesThreshold:
                normaldist[strCol] = np.random.normal(mhat[strCol],
                                                      shat[strCol], nSample)
                normaldist[strCol] = normaldist[strCol]\
                    .apply(lambda x: np.nan if x < Data[strCol].min()
                           or x > Data[strCol].max() else x)

        nonunimodal[nonunimodal == 0] = 0.0000000001
        skewed[skewed == 0] = 0.0000000001
        effectStrength = (-10 * np.log(skewed) - 10 * np.log(nonunimodal)) / 2

    #______________________________________________________________________Ordering
    if Ordering == "Default":
        bimodalprob = pd.Series()
        for strCol in lstCols:
            if nCases > 45000 and nPerVar[strCol] > 8:
                sampledIndex = np.sort(
                    np.random.choice(list(Data.index), size=45000,
                                     replace=False))
                vec = Data[strCol].loc[sampledIndex]
                bimodalprob[strCol] = bimodal(vec)["Bimodal"]
            elif nPerVar[strCol] < 8:
                bimodalprob[strCol] = 0
            else:
                bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
        if len(list(bimodalprob.unique())) < 2 and dvariables > 1 \
           and RobustGaussian == True:
            rangfolge = list(effectStrength.sort_values(ascending=False).index)
            print("Using statistics for ordering instead of default")
        else:
            rangfolge = list(bimodalprob.sort_values(ascending=False).index)

    if Ordering == "Columnwise":
        rangfolge = lstCols

    if Ordering == "Alphabetical":
        rangfolge = lstCols.copy()
        rangfolge.sort()

    if Ordering == "Statistics":
        rangfolge = list(effectStrength.sort_values(ascending=False).index)

    #________________________________________________________________Data Reshaping
    if nPerVar.min() < QuantityThreshold \
       or nUniquePerVar.min() < UniqueValuesThreshold:
        warnings.warn("Some columns have less than " + str(QuantityThreshold)
                      + " data points or less than "
                      + str(UniqueValuesThreshold) + " unique values. "
                      "Changing from MD-plot to Jitter-Plot for these columns.")
        dataDensity = Data.copy()
        mm = Data.median()
        for strCol in lstCols:
            if nPerVar[strCol] < QuantityThreshold \
               or nUniquePerVar[strCol] < UniqueValuesThreshold:
                if mm[strCol] != 0:
                    dataDensity[strCol] = mm[strCol] \
                        * np.random.uniform(-0.001, 0.001, nCases) \
                        + mm[strCol]
                else:
                    dataDensity[strCol] = np.random.uniform(
                        -0.001, 0.001, nCases)
        # generates a scatter plot for the cases where a pdf cannot be estimated
        dataJitter = dataDensity.copy()
        # delete all scatters for features where distributions can be estimated
        for strCol in lstCols:
            if nPerVar[strCol] >= QuantityThreshold \
               and nUniquePerVar[strCol] >= UniqueValuesThreshold:
                dataJitter[strCol] = np.nan
        # apply ordering
        dataframe = dataDensity[rangfolge].reset_index()\
            .melt(id_vars=["index"])
    else:
        dataframe = Data[rangfolge].reset_index().melt(id_vars=["index"])

    dctCols = {"index": "ID", "variable": "Variables", "value": "Values"}
    dataframe = dataframe.rename(columns=dctCols)

    #______________________________________________________________________Plotting
    plot = p9.ggplot(dataframe,
                     p9.aes(x="Variables", group="Variables", y="Values")) \
        + p9.scale_x_discrete(limits=rangfolge)
    plot = plot + p9.geom_violin(stat=stat_pde_density(scale=MDscaling),
                                 fill=Fill, colour=LineColor,
                                 size=LineSize, trim=True) \
        + p9.theme(axis_text_x=p9.element_text(rotation=90))

    if nPerVar.min() < QuantityThreshold \
       or nUniquePerVar.min() < UniqueValuesThreshold:
        dataframejitter = dataJitter[rangfolge].reset_index()\
            .melt(id_vars=["index"])
        dataframejitter = dataframejitter.rename(columns=dctCols)
        plot = plot + p9.geom_jitter(
            size=SizeOfJitteredPoints, data=dataframejitter,
            colour=LineColor,
            mapping=p9.aes(x="Variables", group="Variables", y="Values"),
            position=p9.position_jitter(0.15))

    if RobustGaussian == True:
        dfTemp = normaldist[rangfolge].reset_index().melt(id_vars=["index"])
        dfTemp = dfTemp.rename(columns=dctCols)
        if dfTemp["Values"].isnull().all() == False:
            plot = plot + p9.geom_violin(
                data=dfTemp,
                mapping=p9.aes(x="Variables", group="Variables", y="Values"),
                colour=GaussianColor, alpha=0, scale=MDscaling,
                size=Gaussian_lwd, na_rm=True, trim=True, fill=None,
                position="identity", width=1)

    if BoxPlot == True:
        plot = plot + p9.stat_boxplot(geom="errorbar", width=0.5,
                                      color=BoxColor) \
            + p9.geom_boxplot(width=1, outlier_colour=None, alpha=0,
                              fill='#ffffff', color=BoxColor,
                              position="identity")

    if OnlyPlotOutput == True:
        return plot
    else:
        print(plot)
        return {"Ordering": rangfolge,
                "DataOrdered": Data[rangfolge],
                "ggplotObj": plot}
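# A short usage sketch for MDplot, assuming pandas as pd, numpy as np, and
# plotnine as p9 are imported and the helpers MDplot depends on (diptst,
# bimodal, robust_normalization, signed_log, stat_pde_density) are available
# in the surrounding module:
df = pd.DataFrame({
    'gaussian': np.random.normal(0, 1, 1000),
    'bimodal': np.concatenate([np.random.normal(-2, 0.5, 500),
                               np.random.normal(2, 0.5, 500)]),
    'uniform': np.random.uniform(-3, 3, 1000),
})
plot = MDplot(df)
print(plot)  # plotnine objects render when printed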
def _unidip(self, start, end, is_model, debug):
    """ Perform the unidip algorithm on a 1d array

    INPUT:
        start:    0, idx of first point in current slice
        end:      len(data), idx of last number in current slice
        is_model: True, always starts as true
        debug:    False, determines whether to plot the data at each
                  recursion level
        (the number of dip test trials is taken from self.ntrials)
    RETURNS:
        list of tuples: each tuple containing the start and end indices
        on the x axis.
    """
    dat = self.dat[start:end]
    interval_idxs = list()

    _, pval, modidx = diptst(dat, self.is_hist, self.ntrials)

    if debug:  # if plotting -> show intervals
        self.plot((start, end), [(start + modidx[0], start + modidx[1])])

    # not enough data to count it as significant
    if pval is None:
        return []
    # is unimodal, return interval
    elif pval > self.alpha:
        if is_model:
            interval_idxs.append((start, end - 1))
        else:
            wideidx = self._get_full_interval((start, end))
            interval_idxs.append((start + wideidx[0], start + wideidx[1]))
        return interval_idxs

    # recurse into the modal interval
    rmidx = self._unidip(start + modidx[0], start + modidx[1], True, debug)
    # add returned intervals to our collection
    interval_idxs += rmidx

    # undo offset to get correct indices to data in recursion layer
    subd = list(
        map(lambda t: (t[0] - start, t[1] - (start - 1)), interval_idxs))
    # upper and lower bounds
    l_idx = min(subd + [modidx], key=lambda y: y[1])
    h_idx = max(subd + [modidx])

    # recurse low
    pval = diptst(dat[:l_idx[1]], self.is_hist, self.ntrials)[1]
    if pval is not None and pval < self.alpha:
        rlidx = self._unidip(start, start + l_idx[0], False, debug)
        interval_idxs += rlidx

    # recurse high
    pval = diptst(dat[h_idx[0]:], self.is_hist, self.ntrials)[1]
    if pval is not None and pval < self.alpha:
        rhidx = self._unidip(start + h_idx[1], end, False, debug)
        interval_idxs += rhidx

    # return all intervals
    return interval_idxs
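# _unidip is the recursive core of the UniDip class from
# https://github.com/BenjaminDoran/unidip; the public entry point is
# UniDip(...).run(). A minimal sketch following the package README (keyword
# names per the package source; treat them as an assumption):
import numpy as np
from unidip import UniDip

dat = np.concatenate([np.random.randn(200) - 3, np.random.randn(200) + 3])
dat = np.sort(dat)  # UniDip expects sorted 1-d data
intervals = UniDip(dat, alpha=0.05, ntrials=100).run()
print(intervals)  # e.g. ((i0, j0), (i1, j1)): index bounds of each mode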