def test_position_from_geom(): geom = geom_point(position='jitter') assert isinstance(position.from_geom(geom), position_jitter) geom = geom_point(position='position_jitter') assert isinstance(position.from_geom(geom), position_jitter) geom = geom_point(position=position_jitter()) assert isinstance(position.from_geom(geom), position_jitter) geom = geom_point(position=position_jitter) assert isinstance(position.from_geom(geom), position_jitter)
def test_position_from_geom(): geom = geom_point(position='jitter') assert isinstance(position.from_geom(geom), position_jitter) geom = geom_point(position='position_jitter') assert isinstance(position.from_geom(geom), position_jitter) geom = geom_point(position=position_jitter()) assert isinstance(position.from_geom(geom), position_jitter) geom = geom_point(position=position_jitter) assert isinstance(position.from_geom(geom), position_jitter)
def test_jitter(): df1 = pd.DataFrame({'x': [1, 2, 1, 2], 'y': [1, 1, 2, 2]}) p = (ggplot(df1, aes('x', 'y')) + geom_point(size=10) + geom_jitter(size=10, color='red', random_state=random_state) + geom_jitter(size=10, color='blue', width=0.1, height=0.1, random_state=random_state)) assert p + _theme == 'jitter' with pytest.raises(PlotnineError): geom_jitter(position=position_jitter(), width=0.1)
def test_jitter(): df1 = pd.DataFrame({'x': [1, 2, 1, 2], 'y': [1, 1, 2, 2]}) p = (ggplot(df1, aes('x', 'y')) + geom_point(size=10) + geom_jitter(size=10, color='red', random_state=random_state) + geom_jitter(size=10, color='blue', width=0.1, height=0.1, random_state=random_state)) assert p + _theme == 'jitter' with pytest.raises(PlotnineError): geom_jitter(position=position_jitter(), width=0.1)
def plot_portfolio_stock_performance(ld: LazyDictionary, figure_width: int = 12, date_text_size=7) -> p9.ggplot: df = ld["df"] df = df[df["stock_cost"] > 0.0] # latest_date = df.iloc[-1, 6] # latest_profit = df[df["date"] == latest_date] # print(df) pivoted_df = df.pivot(index="stock", columns="date", values="stock_profit") latest_date = pivoted_df.columns[-1] # print(latest_date) mean_profit = pivoted_df.mean(axis=1) n_stocks = len(mean_profit) # if we want ~4 stocks per facet plot, then we need to specify the appropriate calculation for df.qcut() bins = pd.qcut(mean_profit, int(100 / n_stocks) + 1) # print(bins) df = df.merge(bins.to_frame(name="bins"), left_on="stock", right_index=True) # print(df) textual_df = df[df["date"] == latest_date] # print(textual_df) # melted_df = make_portfolio_dataframe(df, melt=True) plot = (p9.ggplot( df, p9.aes("date", "stock_profit", group="stock", colour="stock")) + p9.geom_smooth(size=1.0, span=0.3, se=False) + p9.facet_wrap("~bins", ncol=1, nrow=len(bins), scales="free_y") + p9.geom_text( p9.aes(x="date", y="stock_profit", label="stock"), color="black", size=9, data=textual_df, position=p9.position_jitter(width=10, height=10), )) return user_theme( plot, y_axis_label="$ AUD", figure_size=(figure_width, int(len(bins) * 1.2)), axis_text_x=p9.element_text(angle=30, size=date_text_size), )
def MDplot(Data, Names=None, Ordering='Default', Scaling=None, Fill='darkblue', RobustGaussian=True, GaussianColor='magenta', Gaussian_lwd=1.5, BoxPlot=False, BoxColor='darkred', MDscaling='width', LineColor='black', LineSize=0.01, QuantityThreshold=40, UniqueValuesThreshold=12, SampleSize=500000, SizeOfJitteredPoints=1, OnlyPlotOutput=True, ValueColumn=None, ClassColumn=None): """ Plots a mirrored density plot for each numeric column Args: Data (dataframe): dataframe containing data. Each column is one variable (wide table format, for long table format see ValueColumn and ClassColumn) Names (list): list of column names (will be used if data is not a dataframe) Ordering (str): 'Default', 'Columnwise', 'Alphabetical' or 'Statistics' Scaling (str): scaling method, one of: Percentalize, CompleteRobust, Robust, Log Fill (str): color of MD-Plot RobustGaussian (bool): draw a gaussian distribution if column is gaussian GaussianColor (str): color for gaussian distribution Gaussian_lwd (float): line width of gaussian distribution BoxPlot (bool): draw box-plot BoxColor (str): color for box-plots MDscaling (str): scale of ggplot violin LineSize (float): line width of ggplot violin QuantityThreshold (int): minimal number of rows UniqueValuesThreshold (int): minimal number of unique values per column SampleSize (int): number of samples used if number of rows is larger than SampleSize OnlyPlotOutput (bool): if True than returning only ggplot object, if False than returning dictionary containing ggplot object and additional infos ValueColumn (str): name of the column of values to be plotted (data in long table format) ClassColumn (str): name of the column with class identifiers for the value column (data in long table format) Returns: ggplot object or dictionary containing ggplot object and additional infos """ if not isinstance(Data, pd.DataFrame): try: if Names is not None: Data = pd.DataFrame(Data, columns=Names) else: Data = pd.DataFrame(Data) lstCols = list(Data.columns) dctCols = {} for strCol in lstCols: dctCols[strCol] = "C_" + str(strCol) Data = Data.rename(columns=dctCols) except: raise Exception("Data cannot be converted into pandas dataframe") else: Data = Data.reset_index(drop=True) if ValueColumn is not None and ClassColumn is not None: lstCols = list(Data.columns) if ValueColumn not in lstCols: raise Exception("ValueColumn not contained in dataframe") if ClassColumn not in lstCols: raise Exception("ClassColumn not contained in dataframe") lstClasses = list(Data[ClassColumn].unique()) DataWide = pd.DataFrame() for strClass in lstClasses: if len(DataWide) == 0: DataWide = Data[Data[ClassColumn] == strClass].copy()\ .reset_index(drop=True) DataWide = DataWide.rename(columns={ValueColumn: strClass}) DataWide = DataWide[[strClass]] else: dfTemp = Data[Data[ClassColumn] == strClass].copy()\ .reset_index(drop=True) dfTemp = dfTemp.rename(columns={ValueColumn: strClass}) dfTemp = dfTemp[[strClass]] DataWide = DataWide.join(dfTemp, how='outer') Data = DataWide.copy() lstCols = list(Data.columns) for strCol in lstCols: if not is_numeric_dtype(Data[strCol]): print("Deleting non numeric column: " + strCol) Data = Data.drop([strCol], axis=1) else: if abs(Data[strCol].sum()) == np.inf: print("Deleting infinite column: " + strCol) Data = Data.drop([strCol], axis=1) Data = Data.rename_axis("index", axis="index")\ .rename_axis("variable", axis="columns") dvariables = Data.shape[1] nCases = Data.shape[0] if nCases > SampleSize: print('Data has more cases than "SampleSize". Drawing a sample for ' 'faster computation. You can omit this by setting ' '"SampleSize=len(data)".') sampledIndex = np.sort( np.random.choice(list(Data.index), size=SampleSize, replace=False)) Data = Data.loc[sampledIndex] nPerVar = Data.apply(lambda x: len(x.dropna())) nUniquePerVar = Data.apply(lambda x: len(list(x.dropna().unique()))) # renaming columns to nonumeric names lstCols = list(Data.columns) dctCols = {} for strCol in lstCols: try: a = float(strCol) dctCols[strCol] = "C_" + str(strCol) except: dctCols[strCol] = str(strCol) Data = Data.rename(columns=dctCols) if Scaling == "Percentalize": Data = Data.apply(lambda x: 100 * (x - x.min()) / (x.max() - x.min())) if Scaling == "CompleteRobust": Data = robust_normalization(Data, centered=True, capped=True) if Scaling == "Robust": Data = robust_normalization(Data, centered=False, capped=False) if Scaling == "Log": Data = signed_log(Data, base="Ten") if RobustGaussian == True: RobustGaussian = False print("log with robust gaussian does not work, because mean and " "variance is not valid description for log normal data") #_______________________________________________Roboust Gaussian and Statistics if RobustGaussian == True or Ordering == "Statistics": Data = Data.applymap(lambda x: np.nan if abs(x) == np.inf else x) if nCases < 50: warnings.warn("Sample is maybe too small for statistical testing") factor = pd.Series([0.25, 0.75]).apply(lambda x: abs(norm.ppf(x)))\ .sum() std = Data.std() dfQuartile = Data.apply( lambda x: mquantiles(x, [0.25, 0.75], alphap=0.5, betap=0.5)) dfQuartile = dfQuartile.append(dfQuartile.loc[1] - dfQuartile.loc[0], ignore_index=True) dfQuartile.index = ["low", "hi", "iqr"] dfMinMax = Data.apply( lambda x: mquantiles(x, [0.001, 0.999], alphap=0.5, betap=0.5)) dfMinMax.index = ["min", "max"] shat = pd.Series() mhat = pd.Series() nonunimodal = pd.Series() skewed = pd.Series() bimodalprob = pd.Series() isuniformdist = pd.Series() nSample = max([10000, nCases]) normaldist = np.empty((nSample, dvariables)) normaldist[:] = np.nan normaldist = pd.DataFrame(normaldist, columns=lstCols) for strCol in lstCols: shat[strCol] = min( [std[strCol], dfQuartile[strCol].loc["iqr"] / factor]) mhat[strCol] = trim_mean(Data[strCol].dropna(), 0.1) if nCases > 45000 and nPerVar[strCol] > 8: # statistical testing does not work with to many cases sampledIndex = np.sort( np.random.choice(list(Data.index), size=45000, replace=False)) vec = Data[strCol].loc[sampledIndex] if nUniquePerVar[strCol] > UniqueValuesThreshold: nonunimodal[strCol] = dip.diptst(vec.dropna(), numt=100)[1] skewed[strCol] = skewtest(vec)[1] args = (dfMinMax[strCol].loc["min"], dfMinMax[strCol].loc["max"] \ - dfMinMax[strCol].loc["min"]) isuniformdist[strCol] = kstest(vec, "uniform", args)[1] bimodalprob[strCol] = bimodal(vec)["Bimodal"] else: print("Not enough unique values for statistical testing, " "thus output of testing is ignored.") nonunimodal[strCol] = 1 skewed[strCol] = 1 isuniformdist[strCol] = 0 bimodalprob[strCol] = 0 elif nPerVar[strCol] < 8: warnings.warn("Sample of finite values to small to calculate " "agostino.test or dip.test for " + strCol) nonunimodal[strCol] = 1 skewed[strCol] = 1 isuniformdist[strCol] = 0 bimodalprob[strCol] = 0 else: if nUniquePerVar[strCol] > UniqueValuesThreshold: nonunimodal[strCol] = dip.diptst(Data[strCol].dropna(), numt=100)[1] skewed[strCol] = skewtest(Data[strCol])[1] args = (dfMinMax[strCol].loc["min"], dfMinMax[strCol].loc["max"] \ - dfMinMax[strCol].loc["min"]) isuniformdist[strCol] = kstest(Data[strCol], "uniform", args)[1] bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"] else: print("Not enough unique values for statistical testing, " "thus output of testing is ignored.") nonunimodal[strCol] = 1 skewed[strCol] = 1 isuniformdist[strCol] = 0 bimodalprob[strCol] = 0 if isuniformdist[strCol] < 0.05 and nonunimodal[strCol] > 0.05 \ and skewed[strCol] > 0.05 and bimodalprob[strCol] < 0.05 \ and nPerVar[strCol] > QuantityThreshold \ and nUniquePerVar[strCol] > UniqueValuesThreshold: normaldist[strCol] = np.random.normal(mhat[strCol], shat[strCol], nSample) normaldist[strCol] = normaldist[strCol]\ .apply(lambda x: np.nan if x < Data[strCol].min() \ or x > Data[strCol].max() else x) nonunimodal[nonunimodal == 0] = 0.0000000001 skewed[skewed == 0] = 0.0000000001 effectStrength = (-10 * np.log(skewed) - 10 * np.log(nonunimodal)) / 2 #______________________________________________________________________Ordering if Ordering == "Default": bimodalprob = pd.Series() for strCol in lstCols: if nCases > 45000 and nPerVar[strCol] > 8: sampledIndex = np.sort( np.random.choice(list(Data.index), size=45000, replace=False)) vec = Data[strCol].loc[sampledIndex] bimodalprob[strCol] = bimodal(vec)["Bimodal"] elif nPerVar[strCol] < 8: bimodalprob[strCol] = 0 else: bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"] if len(list(bimodalprob.unique())) < 2 and dvariables > 1 \ and RobustGaussian == True: rangfolge = list(effectStrength.sort_values(ascending=False).index) print("Using statistics for ordering instead of default") else: rangfolge = list(bimodalprob.sort_values(ascending=False).index) if Ordering == "Columnwise": rangfolge = lstCols if Ordering == "Alphabetical": rangfolge = lstCols.copy() rangfolge.sort() if Ordering == "Statistics": rangfolge = list(effectStrength.sort_values(ascending=False).index) #________________________________________________________________Data Reshaping if nPerVar.min() < QuantityThreshold \ or nUniquePerVar.min() < UniqueValuesThreshold: warnings.warn("Some columns have less than " + str(QuantityThreshold) + " data points or less than " + str(UniqueValuesThreshold) + " unique values. Changing from MD-plot to Jitter-Plot " "for these columns.") dataDensity = Data.copy() mm = Data.median() for strCol in lstCols: if nPerVar[strCol] < QuantityThreshold \ or nUniquePerVar[strCol] < UniqueValuesThreshold: if mm[strCol] != 0: dataDensity[strCol] = mm[strCol] \ * np.random.uniform(-0.001, 0.001, nCases) + mm[strCol] else: dataDensity[strCol] = np.random.uniform( -0.001, 0.001, nCases) # Generates in the cases where pdf cannot be estimated a scatter plot dataJitter = dataDensity.copy() # Delete all scatters for features where distributions can be estimated for strCol in lstCols: if nPerVar[strCol] >= QuantityThreshold \ and nUniquePerVar[strCol] >= UniqueValuesThreshold: dataJitter[strCol] = np.nan #apply ordering dataframe = dataDensity[rangfolge].reset_index()\ .melt(id_vars=["index"]) else: dataframe = Data[rangfolge].reset_index().melt(id_vars=["index"]) dctCols = {"index": "ID", "variable": "Variables", "value": "Values"} dataframe = dataframe.rename(columns=dctCols) #______________________________________________________________________Plotting plot = p9.ggplot(dataframe, p9.aes(x="Variables", group="Variables", y="Values")) \ + p9.scale_x_discrete(limits=rangfolge) plot = plot + p9.geom_violin(stat = stat_pde_density(scale=MDscaling), fill=Fill, colour=LineColor, size=LineSize, trim=True) \ + p9.theme(axis_text_x=p9.element_text(rotation=90)) if nPerVar.min() < QuantityThreshold \ or nUniquePerVar.min() < UniqueValuesThreshold: dataframejitter = dataJitter[rangfolge].reset_index()\ .melt(id_vars=["index"]) dataframejitter = dataframejitter.rename(columns=dctCols) plot = plot + p9.geom_jitter( size=SizeOfJitteredPoints, data=dataframejitter, colour=LineColor, mapping=p9.aes(x="Variables", group="Variables", y="Values"), position=p9.position_jitter(0.15)) if RobustGaussian == True: dfTemp = normaldist[rangfolge].reset_index().melt(id_vars=["index"]) dfTemp = dfTemp.rename(columns=dctCols) if dfTemp["Values"].isnull().all() == False: plot = plot + p9.geom_violin( data=dfTemp, mapping=p9.aes(x="Variables", group="Variables", y="Values"), colour=GaussianColor, alpha=0, scale=MDscaling, size=Gaussian_lwd, na_rm=True, trim=True, fill=None, position="identity", width=1) if BoxPlot == True: plot = plot + p9.stat_boxplot(geom = "errorbar", width = 0.5, color=BoxColor) \ + p9.geom_boxplot(width=1, outlier_colour = None, alpha=0, fill='#ffffff', color=BoxColor, position="identity") if OnlyPlotOutput == True: return plot else: print(plot) return { "Ordering": rangfolge, "DataOrdered": Data[rangfolge], "ggplotObj": plot }