def plot_doubleXY_Mean(X,
                       cols_h=None,
                       cols_v=None,
                       Y_cont=None,
                       Y_cate=None,
                       feature_cate=None,
                       backend='seaborn',
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Purpose:
        Heat maps of Y against pairs of X variables. For each pair the mean
        of every Y column is computed per (horizontal bin, vertical bin)
        cell; for a 0-1 Y that mean is the share of 1s.
    Inputs:
        X: raw data, DataFrame
        cols_h: columns used on the horizontal axis, list; defaults to all columns of X
        cols_v: columns used on the vertical axis, list; defaults to all columns of X
        Y_cont: continuous Y; Series, 1-D np.array or DataFrame
        Y_cate: 0-1 discrete Y; Series, 1-D np.array or DataFrame
        feature_cate: names of the discrete X columns, list; defaults to empty
        backend: plotting backend, one of {'seaborn', 'matplotlib'}
        figsize: figure size passed to plt.subplots
        close: whether to close the generated figures
        show_last: whether to try to show the last figure
        verbose: whether to print the pair being processed
    Returns:
        fig_dict: dict keyed by (horizontal column, vertical column) tuples,
        e.g. ('x1', 'x2'); each value is the figure holding one heat map per
        Y column.
    Raises:
        Exception: if neither or both of Y_cont / Y_cate are given.
    '''
    # Validate Y up front so bad input fails before any work is done.
    if (Y_cont is None) and (Y_cate is None):
        raise Exception('Y值未给定!')
    # Previously this test repeated the one above and could never fire.
    if (Y_cont is not None) and (Y_cate is not None):
        raise Exception('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    # Discretize the continuous variables first (deciles).
    feature_cont = [col for col in cols_v + cols_h if col not in feature_cate]
    clf = discretize.QuantileDiscretizer(
        feature_names=feature_cont,
        quantiles=[10 * i for i in range(1, 10)],
        return_numeric=False,
        fill_na='missing')
    data = clf.fit_transform(data)

    if Y_cate is not None:
        Y = pd.DataFrame(Y_cate)
    else:
        Y = pd.DataFrame(Y_cont)

    fig_dict = {}
    fig = None
    n = Y.shape[1]
    cols_Y = list(Y.columns)
    cols_Y.sort()
    # Regex handed to utils.sort together with converter=float to order labels.
    pattern = r'\((.*?),'
    # The seaborn branch orders the rows descending, the other backend ascending.
    rows_ascending = backend != 'seaborn'

    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Skip the diagonal and pairs whose mirror image is already drawn
            # (figures are stored under (hcol, vcol)).
            if (vcol == hcol) or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                # plt.subplots returns a bare Axes for a single subplot.
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = Y[col].groupby([data[hcol],
                                        data[vcol]]).mean().unstack(hcol)
                row_order = utils.sort(value.index.tolist(),
                                       ascending=rows_ascending,
                                       pattern=pattern,
                                       converter=float)
                col_order = utils.sort(value.columns.tolist(),
                                       ascending=True,
                                       pattern=pattern,
                                       converter=float)
                # DataFrame.reindex_axis was removed in pandas 1.0;
                # reindex(index=..., columns=...) is the equivalent call.
                value = value.reindex(index=row_order, columns=col_order)
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if backend == 'seaborn':
                    if Y_cate is not None:
                        sns.heatmap(value, ax=axes[i], annot=True, fmt='.2%')
                    else:
                        sns.heatmap(value, ax=axes[i], annot=True, fmt='g')
                    axes[i].set_title(title)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value,
                                    ax=axes[i],
                                    xlabel='',
                                    ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index,
                                    title=title)
            if backend != 'seaborn':
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best-effort: it may fail, e.g. once the figure has
            # been closed or when no interactive backend is available.
            pass
    return fig_dict
def plot_singleXY_PercentInY(X,
                             cols=None,
                             Y_cont=None,
                             Y_cate=None,
                             feature_cate=None,
                             figsize=(18, 8),
                             close=True,
                             show_last=True,
                             verbose=False):
    '''
    Purpose:
        Bar charts of a single X against a single Y. For a discrete Y the
        figure shows the sample count per X bin and Y class, and the share of
        each X bin within every Y class; for a continuous Y it shows the
        sample count and the mean of Y per X bin.
    Inputs:
        X: raw data, DataFrame
        cols: columns to plot, list; defaults to all columns of X
        Y_cont: continuous Y; Series or 1-D np.array
        Y_cate: 0-1 discrete Y; Series or 1-D np.array
        feature_cate: names of the discrete X columns, list; defaults to empty
        figsize: figure size passed to plt.subplots
        close: whether to close the generated figures
        show_last: whether to try to show the last figure
        verbose: whether to print the column being processed
    Returns:
        fig_dict: dict keyed by column name; each value is a figure with two
        stacked subplots (count on top; within-Y-class share or mean of Y
        below).
    Raises:
        Exception: if neither or both of Y_cont / Y_cate are given.
    '''
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    # Every requested column gets a key even before its figure exists.
    fig_dict = dict.fromkeys(cols)
    fig = None
    # Regex handed to utils.sort together with converter=float to order labels.
    pattern = r'\((.*?),'

    if (Y_cont is None) and (Y_cate is None):
        raise Exception('Y值未给定!')
    # Previously this test repeated the one above and could never fire.
    if (Y_cont is not None) and (Y_cate is not None):
        raise Exception('连续型和离散型Y值只能给定一种!')

    if Y_cate is not None:
        Y_cate = pd.Series(Y_cate)
        for column in cols:
            if verbose:
                print(column)
            if column not in feature_cate:
                # Continuous X: bin at the 20/40/60/80 percentiles.
                clf = discretize.QuantileDiscretizer(
                    quantiles=[20 * q for q in range(1, 5)],
                    return_numeric=False,
                    fill_na='missing')
                data[column] = clf.fit_transform(data[column])
            count = pd.crosstab(data[column], Y_cate)
            count.columns.name = ''
            count.index.name = column
            count = count.reindex(
                utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           converter=float))
            # Column-wise normalisation: share of each X bin within a Y class.
            ratio = count / count.sum()
            fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
            count.plot(kind='bar', ax=axes[0], rot=0)
            axes[0].set_xlabel('')
            axes[0].set_ylabel('Count of Samples')
            axes[0].set_title(column)
            axes[0].legend(loc='best')
            ratio.plot(kind='bar', ax=axes[1], rot=0)
            axes[1].set_xlabel('')
            axes[1].set_ylabel('Percent in Category of Y')
            axes[1].legend(loc='best')
            if close:
                plt.close('all')
            fig_dict[column] = fig
    else:
        Y_cont = pd.Series(Y_cont)
        for column in cols:
            if verbose:
                print(column)
            if column not in feature_cate:
                # NOTE(review): unlike the discrete-Y branch no fill_na is
                # passed here - confirm whether that difference is intended.
                clf = discretize.QuantileDiscretizer(
                    quantiles=[20 * q for q in range(1, 5)],
                    return_numeric=False)
                data[column] = clf.fit_transform(data[column])
            count = Y_cont.groupby(data[column]).count()
            count.name = ''
            count = count.reindex(
                utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           converter=float))
            ratio = Y_cont.groupby(data[column]).mean()
            ratio.name = ''
            ratio = ratio.reindex(
                utils.sort(ratio.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           converter=float))
            fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
            count.plot(kind='bar', ax=axes[0], rot=0)
            axes[0].set_xlabel('')
            axes[0].set_ylabel('Count of Samples')
            axes[0].set_title(column)
            axes[0].legend(loc='best')
            ratio.plot(kind='bar', ax=axes[1], rot=0)
            axes[1].set_xlabel('')
            axes[1].set_ylabel('Mean of Y')
            axes[1].legend(loc='best')
            if close:
                plt.close('all')
            fig_dict[column] = fig

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best-effort: it may fail, e.g. once the figure has
            # been closed or when no interactive backend is available.
            pass
    return fig_dict
def plot_singleXY_Mean(X,
                       Y,
                       cols=None,
                       feature_cate=None,
                       normalize=True,
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Purpose:
        Bar charts of a single X against one or more Y columns. For every X
        column the figure shows the sample count (or share) per X bin and the
        mean of each Y per X bin; for a 0-1 Y that mean is the share of 1s.
    Inputs:
        X: raw data, DataFrame
        Y: continuous or 0-1 discrete Y; Series, 1-D np.array or DataFrame
        cols: columns to plot, list; defaults to all columns of X
        feature_cate: names of the discrete X columns, list; defaults to empty
        normalize: whether to divide the counts by their total (sample share)
        figsize: figure size passed to plt.subplots
        close: whether to close the generated figures
        show_last: whether to try to show the last figure
        verbose: whether to print the column being processed
    Returns:
        fig_dict: dict keyed by column name; each value is a figure with two
        stacked subplots (count on top, mean of Y below).
    '''
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    fig_dict = {}
    fig = None
    Ynew = pd.DataFrame(Y)
    # A bare 1-D array carries no column name worth showing in a legend.
    legend = not (isinstance(Y, np.ndarray) and len(Y.shape) == 1)
    # Regex handed to utils.sort together with converter=float to order labels.
    pattern = r'\((.*?),'

    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            # Continuous X: bin at the 20/40/60/80 percentiles.
            clf = discretize.QuantileDiscretizer(
                quantiles=[20 * q for q in range(1, 5)],
                return_numeric=False,
                fill_na='Missing')
            data[col] = clf.fit_transform(data[col])
        value_count = Ynew.groupby(data[col]).count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        value_count = value_count.reindex(
            utils.sort(value_count.index.tolist(),
                       ascending=True,
                       pattern=pattern,
                       converter=float))
        value_mean = Ynew.groupby(data[col]).mean()
        value_mean = value_mean.reindex(
            utils.sort(value_mean.index.tolist(),
                       ascending=True,
                       pattern=pattern,
                       converter=float))
        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar', rot=30, ax=axes[0], legend=legend)
        value_mean.plot(kind='bar', rot=30, ax=axes[1], legend=legend)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(col)
        axes[1].set_xlabel('')
        axes[1].set_ylabel('Mean of Y')
        fig_dict[col] = fig
        if close:
            plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best-effort: it may fail, e.g. once the figure has
            # been closed or when no interactive backend is available.
            pass
    return fig_dict
def test():
    '''Smoke-test the BasicDataStruct feature-management calls on random data.

    Builds a 10-row frame with four continuous, two binary and two
    multi-class columns, then walks through add / replace / delete / rename /
    addFeature / delFeature, printing the state after every step.
    '''
    def report(struct):
        # Print the state reported by the structure after each operation.
        print(struct.current_state())

    # Build the test data.
    np.random.seed(13)
    cont_names = ['cont1', 'cont2', 'cont3', 'cont4']
    X = pd.DataFrame(np.random.randn(10, 4), columns=cont_names)
    binary_levels = [0, 1]
    multi_levels = [1, 2, 3, 4, 5]
    for name, levels in (('cate_two1', binary_levels),
                         ('cate_two2', binary_levels),
                         ('cate_mult1', multi_levels),
                         ('cate_mult2', multi_levels)):
        X[name] = np.random.choice(levels, 10)
    two_class = ['cate_two1', 'cate_two2']
    multi_class = ['cate_mult1', 'cate_mult2']

    # Create the object under test.
    model = BasicDataStruct(X,
                            Y=None,
                            feature_cate_two=two_class,
                            feature_cate_mult=multi_class,
                            model_type='classification')
    report(model)

    # Derive new variables by transformation, either added or replacing.
    # Add a new column.
    exp_feature = np.exp(model.X['cont1'])
    model.add(exp_feature, replace=False, ignore=False, suffix='_exp')
    report(model)

    # Replace existing columns.
    sin_features = np.sin(model.X[['cont2', 'cont3']])
    model.add(sin_features, replace=True)
    report(model)

    # Discretize a continuous variable and add the result.
    import src.discretize as discretize
    discretizer = discretize.QuantileDiscretizer()
    binned = discretizer.fit_transform(model.X['cont4'])
    model.add(binned,
              feature_cate_mult=['cont4'],
              replace=False,
              ignore=False,
              suffix='dis')
    report(model)

    # Discretize a continuous variable and replace the original.
    discretizer = discretize.QuantileDiscretizer()
    binned = discretizer.fit_transform(model.X[['cont1']])
    model.add(binned, feature_cate_mult=['cont1'], replace=True)
    report(model)

    # Delete existing variables.
    model.delete(features_todel=['cont1_exp', 'cate_mult1'])
    report(model)

    # Rename.
    model.rename({'cont4dis': 'cont4_dis'})
    report(model)

    # Change the discrete-variable lists (add).
    model.addFeature(feature_cate_two=['cate_two1'],
                     feature_cate_mult=['cont1'])
    report(model)

    # Change the discrete-variable lists (remove).
    model.delFeature(feature_cate_two=['cate_two1'],
                     feature_cate_mult=['cont1'])
    report(model)
for col in result: result[col].savefig(outfile+'%s.png'%col) #两变量X与Y的分析图 outfile=path+'两变量X与Y的分析图/' result=DataAnalysis.plot_doubleXY_Mean(model.X,Y_cate=model.Y, feature_cate=model.feature_cate_two+model.feature_cate_mult, backend='seaborn',close=True,show_last=True,verbose=True) for cols in result: result[cols].savefig(outfile+'%s-%s.png'%(cols[0],cols[1])) #%%提取离散化特征(缺失值会被当作一类) clf_dis=discretize.QuantileDiscretizer(quantiles=[10*i for i in range(1,10)],fill_na='Missing',return_numeric=False) Xnew_dis=clf_dis.fit_transform(model.X[model.feature_cont]) model_dis=BasicDataStruct.BasicDataStruct(X=Xnew_dis,Y=None, feature_cate_two=[], feature_cate_mult=Xnew_dis.columns.tolist(), model_type='classification') print('连续特征离散化:') print(model_dis.current_state()) #%%原始变量缺失填补 clf_imputer_cont=Imputer(strategy='mean') clf_imputer_cate=Imputer(strategy='most_frequent') if model.feature_cont!=[]: model.X[model.feature_cont]=clf_imputer_cont.fit_transform(model.X[model.feature_cont])
def plot_doubleXY_Mean(X,
                       cols_h=None,
                       cols_v=None,
                       Y_cont=None,
                       Y_cate=None,
                       feature_cate=None,
                       quantiles=None,
                       cuts=None,
                       pattern=r'\((.*?),',
                       str_nopattern=None,
                       fontsize=12,
                       backend='seaborn',
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Purpose:
        Heat maps of Y against pairs of X variables. For each pair the mean
        of every Y column is computed per (horizontal bin, vertical bin)
        cell; for a 0-1 Y that mean is the share of 1s.
    Inputs:
        X: raw data, DataFrame
        cols_h: columns used on the horizontal axis, list; defaults to all columns of X
        cols_v: columns used on the vertical axis, list; defaults to all columns of X
        Y_cont: continuous Y; Series, 1-D np.array or DataFrame
        Y_cate: 0-1 discrete Y; Series, 1-D np.array or DataFrame
        feature_cate: names of the discrete X columns, list; defaults to empty
        quantiles: dict mapping a column name to a list / 1-D array of
            percentiles used to discretize that continuous column; columns
            not listed use [10*i for i in range(1,10)]
        cuts: dict mapping a column name to a list / 1-D array of explicit
            cut points; takes precedence over quantiles
        pattern: regular expression passed to utils.sort (with
            converter=float) to order the axis labels
        str_nopattern: dict mapping a column name to a list that is passed
            to utils.sort as str_nopattern for that column
        fontsize: int, font size
        backend: plotting backend, one of {'seaborn', 'matplotlib'}
        figsize: figure size passed to plt.subplots
        close: whether to close the generated figures
        show_last: whether to try to show the last figure
        verbose: whether to print the pair being processed
    Returns:
        fig_dict: dict keyed by (horizontal column, vertical column) tuples,
        e.g. ('x1', 'x2'); each value is the figure holding one heat map per
        Y column.
    Raises:
        Exception: if neither or both of Y_cont / Y_cate are given.
    '''
    # Validate Y up front so bad input fails before any work is done.
    if (Y_cont is None) and (Y_cate is None):
        raise Exception('Y值未给定!')
    # Previously this test repeated the one above and could never fire.
    if (Y_cont is not None) and (Y_cate is not None):
        raise Exception('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    # Build sorted, de-duplicated copies instead of rewriting the caller's
    # dicts in place.
    if quantiles is None:
        quantiles = {}
    else:
        quantiles = {key: np.sort(np.unique(val)).tolist()
                     for key, val in quantiles.items()}
    if cuts is None:
        cuts = {}
    else:
        cuts = {key: np.sort(np.unique(val)) for key, val in cuts.items()}
    q_default = [10 * i for i in range(1, 10)]

    # Discretize the continuous variables first.
    feature_cont = set(
        [col for col in cols_v + cols_h if col not in feature_cate])
    for column in feature_cont:
        clf = discretize.QuantileDiscretizer(
            quantiles=quantiles.get(column, q_default),
            return_numeric=False,
            fill_na='Missing')
        if column in cuts:
            # Explicit cut points replace fitting on the data.
            clf.cuts = cuts[column]
        else:
            clf.fit(data[column])
        data[column] = clf.transform(data[column])
    # Remaining NaN values become their own 'Missing' group.
    data = data.fillna('Missing')

    if Y_cate is not None:
        Y = pd.DataFrame(Y_cate)
    else:
        Y = pd.DataFrame(Y_cont)

    fig_dict = {}
    fig = None
    n = Y.shape[1]
    cols_Y = list(Y.columns)
    cols_Y.sort()
    # The seaborn branch orders the rows descending, the other backend ascending.
    rows_ascending = backend != 'seaborn'

    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Skip the diagonal and pairs whose mirror image is already drawn
            # (figures are stored under (hcol, vcol)).
            if (vcol == hcol) or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                # plt.subplots returns a bare Axes for a single subplot.
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = Y[col].groupby([data[hcol],
                                        data[vcol]]).mean().unstack(hcol)
                row_order = utils.sort(
                    value.index.tolist(),
                    ascending=rows_ascending,
                    pattern=pattern,
                    str_nopattern=str_nopattern.get(vcol, None),
                    converter=float)
                col_order = utils.sort(
                    value.columns.tolist(),
                    ascending=True,
                    pattern=pattern,
                    str_nopattern=str_nopattern.get(hcol, None),
                    converter=float)
                # DataFrame.reindex_axis was removed in pandas 1.0;
                # reindex(index=..., columns=...) is the equivalent call.
                value = value.reindex(index=row_order, columns=col_order)
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if backend == 'seaborn':
                    if Y_cate is not None:
                        sns.heatmap(value, ax=axes[i], annot=True, fmt='.2%')
                    else:
                        sns.heatmap(value, ax=axes[i], annot=True, fmt='g')
                    axes[i].set_title(title, fontsize=fontsize)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value,
                                    ax=axes[i],
                                    xlabel='',
                                    ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index,
                                    title=title,
                                    fontsize=fontsize)
            if backend != 'seaborn':
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best-effort: it may fail, e.g. once the figure has
            # been closed or when no interactive backend is available.
            pass
    return fig_dict
def plot_singleXY_Mean(X,
                       Y,
                       cols=None,
                       feature_cate=None,
                       normalize=True,
                       quantiles=None,
                       cuts=None,
                       pattern=r'\((.*?),',
                       str_nopattern=None,
                       ylabel=None,
                       fontsize=12,
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Purpose:
        Bar charts of a single X against one or more Y columns. For every X
        column the figure shows the sample count (or share) per X bin and the
        mean of each Y per X bin; for a 0-1 Y that mean is the share of 1s.
    Inputs:
        X: raw data, DataFrame
        Y: continuous or 0-1 discrete Y; Series, 1-D np.array or DataFrame
        cols: columns to plot, list; defaults to all columns of X
        feature_cate: names of the discrete X columns, list; defaults to empty
        normalize: whether to divide the counts by their total (sample share)
        quantiles: dict mapping a column name to a list / 1-D array of
            percentiles used to discretize that continuous column; columns
            not listed use [20*i for i in range(1,5)]
        cuts: dict mapping a column name to a list / 1-D array of explicit
            cut points; takes precedence over quantiles
        pattern: regular expression passed to utils.sort (with
            converter=float) to order the x-axis labels
        str_nopattern: dict mapping a column name to a list that is passed
            to utils.sort as str_nopattern for that column
        ylabel: two-element list with the y-axis labels of the two subplots;
            defaults to ['Count of Samples', 'Mean of Y']
        fontsize: int, font size
        figsize: figure size passed to plt.subplots
        close: whether to close the generated figures
        show_last: whether to try to show the last figure
        verbose: whether to print the column being processed
    Returns:
        fig_dict: dict keyed by column name; each value is a figure with two
        stacked subplots (count on top, mean of Y below).
    '''
    data = X.copy()
    if str_nopattern is None:
        str_nopattern = {}
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    # Build sorted, de-duplicated copies instead of rewriting the caller's
    # dicts in place.
    if quantiles is None:
        quantiles = {}
    else:
        quantiles = {key: np.sort(np.unique(val)).tolist()
                     for key, val in quantiles.items()}
    if cuts is None:
        cuts = {}
    else:
        cuts = {key: np.sort(np.unique(val)) for key, val in cuts.items()}
    q_default = [20 * i for i in range(1, 5)]
    fig_dict = {}
    fig = None
    Ynew = pd.DataFrame(Y)
    # No legend for any ndarray or any 1-D input. np.ndim is used instead of
    # Y.shape so that plain lists are accepted as well.
    legend = not (isinstance(Y, np.ndarray) or np.ndim(Y) == 1)
    if ylabel is None:
        ylabel = ['Count of Samples', 'Mean of Y']

    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(col, q_default),
                return_numeric=False,
                fill_na='Missing')
            if col in cuts:
                # Explicit cut points replace fitting on the data.
                clf.cuts = cuts[col]
            else:
                clf.fit(data[col])
            data[col] = clf.transform(data[col])
        # Remaining NaN values become their own 'Missing' group.
        data[col] = data[col].fillna('Missing')
        order_kwargs = dict(ascending=True,
                            pattern=pattern,
                            str_nopattern=str_nopattern.get(col, None),
                            converter=float)
        value_count = Ynew.groupby(data[col]).count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        value_count = value_count.reindex(
            utils.sort(value_count.index.tolist(), **order_kwargs))
        value_mean = Ynew.groupby(data[col]).mean()
        value_mean = value_mean.reindex(
            utils.sort(value_mean.index.tolist(), **order_kwargs))
        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar',
                         rot=30,
                         ax=axes[0],
                         legend=legend,
                         fontsize=fontsize)
        value_mean.plot(kind='bar',
                        rot=30,
                        ax=axes[1],
                        legend=legend,
                        fontsize=fontsize)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(col, fontsize=fontsize)
        axes[1].set_xlabel('')
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
        fig_dict[col] = fig
        if close:
            plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best-effort: it may fail, e.g. once the figure has
            # been closed or when no interactive backend is available.
            pass
    return fig_dict
def plot_singleXY_PercentInY(X,
                             cols=None,
                             Y_cont=None,
                             Y_cate=None,
                             feature_cate=None,
                             quantiles=None,
                             cuts=None,
                             pattern=r'\((.*?),',
                             str_nopattern=None,
                             xlabel=None,
                             ylabel=None,
                             color_map=None,
                             legend_map=None,
                             fontsize=12,
                             figsize=(18, 8),
                             close=True,
                             show_last=True,
                             verbose=False):
    '''
    Purpose:
        Bar charts of a single X against a single Y. For a discrete Y the
        figure shows the sample count per X bin and Y class, and the share of
        each X bin within every Y class; for a continuous Y it shows the
        sample count and the mean of Y per X bin.
    Inputs:
        X: raw data, DataFrame
        cols: columns to plot, list; defaults to all columns of X
        Y_cont: continuous Y; Series or 1-D np.array
        Y_cate: 0-1 discrete Y; Series or 1-D np.array
        feature_cate: names of the discrete X columns, list; defaults to empty
        quantiles: dict mapping a column name to a list / 1-D array of
            percentiles used to discretize that continuous column; columns
            not listed use [20*i for i in range(1,5)]
        cuts: dict mapping a column name to a list / 1-D array of explicit
            cut points; takes precedence over quantiles
        pattern: regular expression passed to utils.sort (with
            converter=float) to order the x-axis labels
        str_nopattern: dict mapping a column name to a list that is passed
            to utils.sort as str_nopattern for that column
        xlabel: string, x-axis label of the lower subplot; defaults to ''
        ylabel: two-element list with the y-axis labels of the two subplots;
            defaults to ['Count of Samples', 'Percent in Category of Y'] for
            a discrete Y and ['Count of Samples', 'Mean of Y'] for a
            continuous Y
        color_map: dict mapping an original Y value to a bar colour, e.g.
            {1: 'red', 0: 'blue'}; discrete Y only
        legend_map: dict mapping an original Y value to its legend label,
            e.g. {1: 'bad', 0: 'good'}; discrete Y only
        fontsize: int, font size
        figsize: figure size passed to plt.subplots
        close: whether to close the generated figures
        show_last: whether to try to show the last figure
        verbose: whether to print the column being processed
    Returns:
        fig_dict: dict keyed by column name; each value is a figure with two
        stacked subplots (count on top; within-Y-class share or mean of Y
        below).
    Raises:
        Exception: if neither or both of Y_cont / Y_cate are given.
    '''
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    if xlabel is None:
        xlabel = ''
    if legend_map is None:
        legend_map = {}
    if color_map is None:
        color_map = {}
    # Build sorted, de-duplicated copies instead of rewriting the caller's
    # dicts in place.
    if quantiles is None:
        quantiles = {}
    else:
        quantiles = {key: np.sort(np.unique(val)).tolist()
                     for key, val in quantiles.items()}
    if cuts is None:
        cuts = {}
    else:
        cuts = {key: np.sort(np.unique(val)) for key, val in cuts.items()}
    q_default = [20 * i for i in range(1, 5)]
    # Every requested column gets a key even before its figure exists.
    fig_dict = dict.fromkeys(cols)
    fig = None

    if (Y_cont is None) and (Y_cate is None):
        raise Exception('Y值未给定!')
    # Previously this test repeated the one above and could never fire.
    if (Y_cont is not None) and (Y_cate is not None):
        raise Exception('连续型和离散型Y值只能给定一种!')

    if Y_cate is not None:
        Y_cate = pd.Series(Y_cate)
        if ylabel is None:
            ylabel = ['Count of Samples', 'Percent in Category of Y']
        for column in cols:
            if verbose:
                print(column)
            if column not in feature_cate:
                clf = discretize.QuantileDiscretizer(
                    quantiles=quantiles.get(column, q_default),
                    return_numeric=False,
                    fill_na='Missing')
                if column in cuts:
                    # Explicit cut points replace fitting on the data.
                    clf.cuts = cuts[column]
                else:
                    clf.fit(data[column])
                data[column] = clf.transform(data[column])
            # Remaining NaN values become their own 'Missing' group.
            data[column] = data[column].fillna('Missing')
            count = pd.crosstab(data[column], Y_cate)
            count.columns.name = ''
            count.index.name = column
            count = count.reindex(
                utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float))
            # Look colours up by the original Y values, before relabelling.
            color = count.columns.map(
                lambda xx: color_map.get(xx, None)).tolist()
            count.columns = count.columns.map(
                lambda xx: legend_map.get(xx, xx))
            # Column-wise normalisation: share of each X bin within a Y class.
            ratio = count / count.sum()
            fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
            count.plot(kind='bar',
                       ax=axes[0],
                       rot=0,
                       fontsize=fontsize,
                       color=color)
            axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
            axes[0].set_title(column, fontsize=fontsize)
            axes[0].legend(loc='best', fontsize=fontsize)
            # fontsize is now passed here too, matching the other plots.
            ratio.plot(kind='bar',
                       ax=axes[1],
                       rot=0,
                       fontsize=fontsize,
                       color=color)
            axes[1].set_xlabel(xlabel, fontsize=fontsize)
            axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
            axes[1].legend(loc='best', fontsize=fontsize)
            if close:
                plt.close('all')
            fig_dict[column] = fig
    else:
        Y_cont = pd.Series(Y_cont)
        if ylabel is None:
            ylabel = ['Count of Samples', 'Mean of Y']
        for column in cols:
            if verbose:
                print(column)
            if column not in feature_cate:
                clf = discretize.QuantileDiscretizer(
                    quantiles=quantiles.get(column, q_default),
                    return_numeric=False,
                    fill_na='Missing')
                if column in cuts:
                    # Explicit cut points replace fitting on the data.
                    clf.cuts = cuts[column]
                else:
                    clf.fit(data[column])
                data[column] = clf.transform(data[column])
            # Remaining NaN values become their own 'Missing' group.
            data[column] = data[column].fillna('Missing')
            count = Y_cont.groupby(data[column]).count()
            count.name = ''
            count = count.reindex(
                utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float))
            ratio = Y_cont.groupby(data[column]).mean()
            ratio.name = ''
            # Order the means exactly like the counts; the hard-coded pattern
            # used before ignored `pattern` / `str_nopattern`, so the two
            # subplots sharing one x axis could disagree on bar order.
            ratio = ratio.reindex(
                utils.sort(ratio.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float))
            fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
            count.plot(kind='bar', ax=axes[0], rot=0, fontsize=fontsize)
            axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
            axes[0].set_title(column, fontsize=fontsize)
            axes[0].legend(loc='best', fontsize=fontsize)
            ratio.plot(kind='bar', ax=axes[1], rot=0, fontsize=fontsize)
            axes[1].set_xlabel(xlabel, fontsize=fontsize)
            axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
            axes[1].legend(loc='best', fontsize=fontsize)
            if close:
                plt.close('all')
            fig_dict[column] = fig

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best-effort: it may fail, e.g. once the figure has
            # been closed or when no interactive backend is available.
            pass
    return fig_dict