def test2():
    """Load the mpg data set, echo the column/Chinese-label pairing, and draw a KDE of cty split by cyl."""
    df = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
    cn_labels = ["汽车制造商", "型号名称", "发动机排量(L)", "制造年份", "气缸数量", "手动/自动",
                 "驱动类型", "城市里程/加仑", "公路里程/加仑", "汽油种类", "车辆类型"]
    # Print each (english column, chinese label) pair as a sanity check.
    print(list(zip(df.columns, cn_labels)))
    DrawTools.kdeplot(df, 'cty', 'cyl')
    plt.show()
def test6():
    """Scatter two Gaussian point clouds and outline each with a convex-hull polygon."""
    # Generate normally-distributed random points (fixed seed for reproducibility).
    np.random.seed(1)
    cloud_a = np.random.normal(loc=5, scale=2, size=(2, 15))
    cloud_b = np.random.normal(loc=8, scale=2.5, size=(2, 13))
    # Plot the raw points first.
    for xs, ys in (cloud_a, cloud_b):
        plt.scatter(xs, ys)
    # Then draw the convex hull around each cloud.
    DrawTools.drawPloygon(cloud_a[0], cloud_a[1], ax=None, ec="k", fc="gold", alpha=0.1)
    DrawTools.drawPloygon(cloud_b[0], cloud_b[1], ax=None, ec="lightblue", fc="none", linewidth=1.5)
    plt.show()
def test5():
    """Jittered strip plot of city mileage vs highway mileage.

    Bug fix: x/y are now passed to ``sns.stripplot`` as keyword arguments.
    Positional x/y support was deprecated in seaborn 0.12 and removed later,
    so the original call raises TypeError on current seaborn versions.
    """
    df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
    # stripplot jitters each point horizontally so overlapping values stay visible.
    sns.stripplot(x=df.cty, y=df.hwy,
                  jitter=0.25,  # jitter amplitude
                  size=8, ax=ax,
                  linewidth=.5,
                  palette='Reds')
    # Decorations
    plt.title('Use jittered plots to avoid overlapping of points', fontsize=22)
    plt.rcParams['font.sans-serif'] = ['Simhei']  # enable CJK glyphs in the axis labels
    plt.xlabel("气缸数量", fontsize=16)
    plt.ylabel("公路里程/加仑", fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
def test7():
    """Marginal-histogram layout: engine displacement vs highway mpg.

    Main panel: scatter colored by manufacturer and sized by city mileage.
    Right/bottom panels: marginal histograms of hwy and displ.

    Bug fix: the original fed ``get_xticks().tolist()`` straight back into
    ``set_xticklabels``, showing full-precision float reprs; its own comment
    says the ticks were meant to be one-decimal floats, so format them.
    """
    data = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig = plt.figure(figsize=(16, 10), dpi=80)
    # 4x4 grid: main plot occupies the top-left 3x3, marginals take the edges.
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)
    ax_main = fig.add_subplot(grid[:-1, :-1])
    ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])
    ax_bottom = fig.add_subplot(grid[-1, :-1], xticklabels=[], yticklabels=[])
    # displ: engine displacement (L); hwy: highway miles per gallon.
    DrawTools.scatter(data=data, feature_x='displ', feature_y='hwy',
                      feature_c='manufacturer', feature_s='cty', ax=ax_main)
    DrawTools.font_desc(ax_main, title='边缘直方图 \n 发动机排量 vs 公路里程/加仑',
                        xlabel='发动机排量(L)', ylabel='公里路程/加仑')
    DrawTools.hist(data, feature_x='hwy', bins=40, vertical=False, color='deeppink', ax=ax_right)
    DrawTools.hist(data, feature_x='displ', bins=40, vertical=True, invert_y=True, color='deeppink', ax=ax_bottom)
    # Re-render the x ticks as one-decimal strings (the originally stated intent).
    xlabels = ['%.1f' % tick for tick in ax_main.get_xticks()]
    ax_main.set_xticklabels(xlabels)
    plt.show()
def test4():
    """Plain scatter plot of city mileage (cty) against highway mileage (hwy)."""
    mpg = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.scatter(data=mpg, feature_x='cty', feature_y='hwy')
    DrawTools.font_desc(title='关系图', xlabel='城市里程/加仑', ylabel='公路里程/加仑', legends=None)
    plt.show()
def test6():
    """Jittered strip plot of cty vs hwy, drawn through the DrawTools wrapper."""
    frame = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    figure, axis = plt.subplots(figsize=(12, 8), dpi=80)
    DrawTools.stripplot(frame, 'cty', 'hwy', axis)
    DrawTools.font_desc(axis, 'Use jittered plots to avoid overlapping of points',
                        "气缸数量", "公路里程/加仑")
    plt.show()
def test8():
    """Marginal-boxplot layout: engine displacement vs highway mpg.

    Same grid as the histogram variant, but the right/bottom panels hold
    box plots of hwy and displ instead of histograms.

    Bug fix: the original fed ``get_xticks().tolist()`` straight back into
    ``set_xticklabels``, showing full-precision float reprs; its own comment
    says the ticks were meant to be one-decimal floats, so format them.
    """
    data = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig = plt.figure(figsize=(16, 10), dpi=80)
    # 4x4 grid: main plot occupies the top-left 3x3, marginals take the edges.
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)
    ax_main = fig.add_subplot(grid[:-1, :-1])
    ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])
    ax_bottom = fig.add_subplot(grid[-1, :-1], xticklabels=[], yticklabels=[])
    # displ: engine displacement (L); hwy: highway miles per gallon.
    DrawTools.scatter(data=data, feature_x='displ', feature_y='hwy',
                      feature_c='manufacturer', feature_s='cty', ax=ax_main,
                      xlim=(1, 7), ylim=(0, 50))
    DrawTools.font_desc(ax_main, title='边缘直方图 \n 发动机排量 vs 公路里程/加仑',
                        xlabel='发动机排量(L)', ylabel='公里路程/加仑')
    # Box plots on the right and bottom margins.
    DrawTools.boxplot(data, 'hwy', vertical=True, color="red", ax=ax_right)
    DrawTools.boxplot(data, 'displ', vertical=False, color="red", ax=ax_bottom)
    ax_bottom.set(xlabel='')
    ax_right.set(ylabel='')
    # Re-render the x ticks as one-decimal strings (the originally stated intent).
    xlabels = ['%.1f' % tick for tick in ax_main.get_xticks()]
    ax_main.set_xticklabels(xlabels)
    plt.show()
def test5():
    """Box plot of hwy per vehicle class, with the raw points stripped on top."""
    frame = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.stripplot(frame, 'class', 'hwy')
    DrawTools.boxplot(frame, 'class', 'hwy', vertical=True)
    plt.show()
def test9():
    """Correlation heatmap of the mtcars data set, columns relabeled in Chinese."""
    frame = pd.read_csv("https://github.com/selva86/datasets/raw/master/mtcars.csv")
    cn_names = ["英里/加仑", "气缸数量", "排量", "总马力", "驱动轴比", "重量",
                "1/4英里所用时间", "引擎", "变速器", "前进档数", "化油器数量",
                "用油是否高效", "汽车", "汽车名称"]
    frame.columns = cn_names
    DrawTools.init()
    DrawTools.heatmap(frame)
    # Rotate tick labels so the long Chinese names stay readable.
    DrawTools.font_desc(ax=None, title=u'mtcars数据集的相关性矩阵',
                        tick_rotation=45, tick_horizontalalignment='right')
    plt.show()
def test2():
    """Scatter of displacement vs highway mpg with per-cylinder best-fit lines.

    Column notes for mpg_ggplot2.csv:
    manufacturer, model, displ, year, cyl, trans, drv, cty, hwy, fl, class.
    drv (drive type): 4 = four-wheel, f = front-wheel, r = rear-wheel.
    fl is the fuel type (gasoline, diesel, electric, ...); class is the body
    style (pickup, SUV, compact, midsize, ...). cty/hwy are miles driven per
    gallon in city/highway conditions, so larger values mean better economy.
    """
    df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.lmplot(data=df, feature_x='displ', feature_y='hwy', feature_h='cyl',
                     xlim=(1, 7), ylim=(0, 50))
    DrawTools.font_desc(title='按气缸数分组的最佳拟合线散点图', xlabel='发动机排量(L)',
                        ylabel='公路里程/加仑', legends=['气缸数量4', '气缸数量8'])
    plt.show()
def test6():
    """Violin plot of hwy per vehicle class with the individual points overlaid."""
    mpg = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.violinplot(mpg, 'class', 'hwy')
    DrawTools.stripplot(mpg, 'class', 'hwy')
    plt.show()
def test4():
    """Density plot of cty, one curve per vehicle class."""
    mpg = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.displot(mpg, 'cty', 'class', xlim=(5, 35), ylim=(0, 0.8))
    plt.legend()
    plt.show()
def test3():
    """2x2 grid of KDE plots for cty/displ/hwy, each split by cylinder count."""
    mpg = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
    DrawTools.kdeplot_mul(mpg, ['cty', 'displ', 'hwy'], 'cyl', (2, 2))
    plt.show()
# 将合并的数据此时进行拆分 分为训练数据和测试数据 dummy_train_df = all_dummy_df.loc[train_df.index] dummy_test_df = all_dummy_df.loc[test_df.index] X_train = dummy_train_df.values # X_test = dummy_test_df.values from xgboost import XGBRegressor clf = XGBRegressor(max_depth=5, n_estimators=100) test_score = np.sqrt(-cross_val_score( clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) print(np.mean(test_score)) # 用sklearn自带的cross validation方法来测试模型 # params = [1, 2, 3, 4, 5, 6] # test_scores = [] # for param in params: # clf = XGBRegressor(max_depth=param) # test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) # test_scores.append(np.mean(test_score)) # plt.plot(params, test_scores) # plt.title("max_depth vs CV Error") # plt.show() # print(test_scores) # print(test_scores.mean()) from tools import DrawTools clf.fit(X_train, y_train) DrawTools.feature_importance(clf, dummy_train_df.columns) plt.show()
# Tail of the previous figure: show the legend and render it.
plt.legend()
plt.show()

# Bank-marketing data; fields are ';'-separated.
data = pd.read_csv('bank.csv', sep=';')
print(data.info())
# 7 integer-typed columns, e.g. age and balance.
int_cols = data.dtypes[data.dtypes == 'int64']
obj_cols = data.dtypes[data.dtypes == 'object']
print('整数型:%d 个' % int_cols.count())
print('对象型:%d 个' % obj_cols.count())
# data = standard(data)
data = encode(data)
DrawTools.init()
# for obj_col in obj_cols.index:
#     print(data[obj_col].value_counts())
DrawTools.heatmap(data)
plt.legend()
plt.show()
# Build new features; scores recorded before/after feature engineering:
# train score: 0.939159
# test score: 0.912707
# train score: 0.941925
# test score: 0.918232
# NOTE(review): the aggregation below is truncated in this chunk — the dict
# passed to .agg continues past the end of the visible source.
g = data.groupby(['month', 'day']).agg({
# Predict employee attrition ('left') on the HR data set with XGBoost.
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from tools import ModelTools
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Columns: ['satisfaction_level', 'last_evaluation', 'number_project',
#           'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
#           'promotion_last_5years', 'sales', 'salary']
data = pd.read_csv('data/HR_comma_sep.csv')
col = [
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_montly_hours', 'time_spend_company', 'Work_accident',
    'promotion_last_5years', 'salary'
]
# salary is ordinal, so map it to 0/1/2 rather than one-hot encoding it.
data['salary'].replace({'low': 0, 'medium': 1, 'high': 2}, inplace=True)
DrawTools.init()
# DrawTools.displot_mul(data, feature_xs=col, feature_h='left', grid=(3, 3))
plt.show()
# One-hot encode the remaining object column(s), e.g. 'sales'.
data = pd.get_dummies(data)
y = data['left']
X = data.loc[:, data.columns != 'left']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=0)
model = XGBClassifier()
model = model.fit(Xtrain, ytrain)
# Predictions on both splits (presumably scored later in this script).
ytrain_pred = model.predict(Xtrain)
ytest_pred = model.predict(Xtest)
def grid_search_cv(X, y, model, grid_param, metrics, scoring, kflod, fpreproc=None, weight=None):
    """Tune an XGBoost model in two stages and return it with the best params set.

    Stage 1: if ``grid_param`` contains 'n_estimators', run xgboost's own CV
    (via the local ``xgb_cv`` helper) to pick the boosting-round count, fix it
    on ``model``, and pop the key from the grid.
    Stage 2: grid-search whatever remains in ``grid_param`` with sklearn's
    GridSearchCV and plot mean train/test scores per parameter combination.

    Parameters
    ----------
    X, y : training features and target.
    model : XGBoost sklearn-API estimator; mutated in place.
    grid_param : dict of candidate parameter lists ('n_estimators' is special).
    metrics : list of xgboost metric names; metrics[0] is reported.
    scoring : sklearn scoring string for the grid search.
    kflod : CV splitter / fold count (sic — original spelling preserved).
    fpreproc, weight : forwarded to ``xgb_cv`` — presumably a per-fold
        preprocessor and sample weights; TODO confirm against xgb_cv.

    Returns
    -------
    The same ``model`` with the winning parameters applied via ``set_params``.
    """
    # Stage 1: tune n_estimators with xgboost's native CV / early stopping.
    if (grid_param.get('n_estimators') is not None):
        cv_result = xgb_cv(X, y, model.get_xgb_params(), num_boost_round=model.n_estimators,
                           metrics=metrics, fpreproc=fpreproc, missing=model.missing, weight=weight)
        DrawTools.xgb_cv_results(cv_result, metrics)
        # len(cv_result) is the round count the CV actually kept.
        model.n_estimators = len(cv_result)
        grid_param.pop('n_estimators')
        print(
            'best:', {'n_estimators': len(cv_result)},
            cv_result.loc[len(cv_result) - 1, 'test-' + metrics[0] + '-mean'])
    # Stage 2: sklearn grid search over the remaining parameters.
    if len(grid_param) > 0:
        print(grid_param)
        gsearch = GridSearchCV(model, param_grid=grid_param, scoring=scoring, cv=kflod,
                               return_train_score=True)
        gsearch.fit(X, y)
        cv_results = gsearch.cv_results_
        mean_test_score = cv_results['mean_test_score']
        mean_train_score = cv_results['mean_train_score']
        # One x tick per parameter combination, labeled with its rounded values.
        plt.xticks(range(0, len(cv_results['params'])),
                   labels=[[round(param, 3) for param in params.values()]
                           for params in cv_results['params']],
                   rotation=90)
        plt.plot(mean_test_score, label='test')
        plt.plot(mean_train_score, label='train')
        plt.ylabel(scoring)
        plt.xlabel(list(cv_results['params'][0].keys()))
        plt.legend()
        plt.grid()
        plt.show()
        # param_grid_max_depth = param_grid_1['max_depth']
        # param_grid_min_child_weight = param_grid_1['min_child_weight']
        # mean_test_score = cv_results['mean_test_score'].reshape(len(param_grid_max_depth), len(param_grid_min_child_weight))
        # for i, value in enumerate(param_grid_max_depth):
        #     print(-mean_test_score[i])
        #     plt.plot(param_grid_min_child_weight, mean_test_score[i], label=value)
        # plt.xlabel('min_child_weight')
        # plt.legend()
        # plt.show()
        print('best:', gsearch.best_params_, gsearch.best_score_)
        model.set_params(**gsearch.best_params_)
    return model
def test3():
    """Per-cylinder best-fit scatter, drawn with the lmplot_mul wrapper."""
    mpg = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.lmplot_mul(data=mpg, feature_x='displ', feature_y='hwy', feature_h='cyl',
                         xlim=(1, 7), ylim=(0, 50))
    DrawTools.font_desc(title='按气缸数分组的最佳拟合线散点图', xlabel='发动机排量(L)',
                        ylabel='公路里程/加仑', legends=['气缸数量4', '气缸数量8'])
    plt.show()
def test10():
    """Pair plot of the iris measurements, colored by species."""
    iris = sns.load_dataset('iris')
    DrawTools.init()
    DrawTools.pairplot(iris, 'species')
    plt.show()
stock = stock.tail(240).reset_index() drawK(ax, stock) drawSMA(ax, stock, periods=[20, 60]) drawDate(ax, stock) ax.set_ylabel(roe_data.name.iloc[index], fontproperties=font) # ax.legend() ax = flg.add_subplot(grid[0], grid[1], grid[1] * (row + 1) + index + 1) ax.plot(roe_data.iloc[index, 2:]) ax.set_ylabel(roe_data.ts_code.iloc[index], fontproperties=font) print(row, grid[1] * row + index + 1, grid[1] * (row + 1) + index + 1) plt.show() from tools import DrawTools DrawTools.init() industry = None name = None start = 0 end = 12 sort_by = 'roe_2019_4' roe_data = pd.read_csv(base_path + 'roe.csv') roe_data['roe_2019_pct'] = (roe_data['roe_2019_4'] - roe_data['roe_2018']) / roe_data['roe_2018'] roe_data['roe_2019_4_diff'] = roe_data['roe_2019_4'] - roe_data['roe_2019_3'] # roe_data = roe_data.loc[roe_data.roe_2019_3 < 0] roe_data = roe_data.sort_values(by=sort_by, ascending=False) draw_data = roe_data[['name', 'ts_code', 'roe_2019_1', 'roe_2019_2', 'roe_2019_3', 'roe_2019_4']] draw_data = draw_data.iloc[start:end, :] roe_mul(draw_data) # k_mul(['600083.SH','002234.SZ','600768.SH','000526.SZ'],['中国','\xe4\xb8\x8a\xe6\xb5\xb7\xe6\x96\xb0\xe9\x98\xb3','c','d'])
def test5():
    """Midwest demographics: draw the bubble scatter, then fit one multinomial
    logistic regression per label character and report scores/coefficients.

    NOTE(review): everything below plt.show() operates on the function-local
    ``midwest`` frame, so the preprocessing and modeling belong to this def.
    """
    midwest = pd.read_csv(
        "https://raw.githubusercontent.com/selva86/datasets/master/midwest_filter.csv"
    )
    # Bubble scatter for three selected categories only.
    midwest_1 = midwest.loc[midwest.category.isin(['AHR', 'HAU', 'LHU']), :]
    plt = DrawTools.drawA(midwest_1, feature_x='area', feature_y='poptotal',
                          label_name='category', feature_s='popasian',
                          feature_text='county',
                          title='Scatterplot of Midwest Area vs Population')
    plt.show()
    # Relabel every column in Chinese (order matches the CSV's columns).
    midwest.columns = [
        "城市ID", "郡", "州", "面积", "总人口", "人口密度", "白人人口", "非裔人口",
        "美洲印第安人人口", "亚洲人口", "其他人种人口", "白人所占比例", "非裔所占比例",
        "美洲印第安人所占比例", "亚洲人所占比例", "其他人种比例", "成年人口",
        "具有高中文凭的比率", "大学文凭比例", "有工作的人群比例", "已知贫困人口",
        "已知贫困人口的比例", "贫困线以下的人的比例", "贫困线以下的儿童所占比例",
        "贫困的成年人所占的比例", "贫困的老年人所占的比例", "是否拥有地铁", "标签", "点的尺寸"
    ]
    # Split the 3-character label into three single-character target columns.
    for i in range(3):
        midwest['c' + str(i)] = midwest['标签'].apply(lambda x: x[i])
    # Ordinal-encode the three new label columns.
    midwest.iloc[:, -3:] = OrdinalEncoder().fit_transform(midwest.iloc[:, -3:])
    midwest = midwest.loc[:, midwest.dtypes.values != 'O']  # 'O' = object dtype (capital O)
    midwest.loc[:, midwest.dtypes.values == 'int64'] = midwest.loc[:, midwest.dtypes.values == 'int64'].astype(np.float64)
    # Drop the city-id and point-size columns, keep the rest.
    midwest = midwest.iloc[:, [*range(1, 25), 26, 27, 28]]
    # Standardize the 23 numeric feature columns.
    midwest.iloc[:, [*range(23)]] = StandardScaler().fit_transform(
        midwest.iloc[:, [*range(23)]])
    xtrain, xtest, ytrain, ytest = train_test_split(midwest.iloc[:, :-3],
                                                    midwest.iloc[:, -3:],
                                                    test_size=0.3,
                                                    random_state=420)
    # One multinomial LR per label position; print scores and top coefficients.
    for index in range(3):
        lr = LR(solver='newton-cg', multi_class='multinomial', random_state=420,
                max_iter=100**20)
        lr = lr.fit(xtrain, ytrain.iloc[:, index])
        print(lr.score(xtrain, ytrain.iloc[:, index]))
        print(lr.score(xtest, ytest.iloc[:, index]))
        coef = pd.DataFrame(lr.coef_).T
        if index < 2:
            # Multi-class: rank features by the mean absolute coefficient.
            # (Original author's open question: why abs? — left as-is.)
            coef['mean'] = abs(coef).mean(axis=1)
            coef['name'] = xtrain.columns
            coef.columns = ["Average", "High", "Low", "mean", "name"]
            coef = coef.sort_values(by='mean', ascending=False)
        else:
            # Binary case: a single coefficient column suffices.
            coef.columns = ["value"]
            coef['name'] = xtrain.columns
            coef = coef.sort_values(by='value', ascending=False)
        print(coef.head())