Example #1
0
def test2():
    """KDE plot of city mileage (cty) grouped by cylinder count (cyl)."""
    # Chinese display names for the mpg dataset columns, in column order.
    labels = [
        "汽车制造商", "型号名称", "发动机排量(L)", "制造年份", "气缸数量",
        "手动/自动", "驱动类型", "城市里程/加仑", "公路里程/加仑", "汽油种类", "车辆类型",
    ]
    df = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")
    # Echo the column-name -> Chinese-label pairing for reference.
    print(list(zip(df.columns, labels)))
    DrawTools.kdeplot(df, 'cty', 'cyl')
    plt.show()
Example #2
0
def test6():
    """Scatter two Gaussian point clouds and wrap each one in a hull polygon."""
    np.random.seed(1)  # reproducible random draws
    # Two clusters of normally distributed points.
    x1, y1 = np.random.normal(loc=5, scale=2, size=(2, 15))
    x2, y2 = np.random.normal(loc=8, scale=2.5, size=(2, 13))
    # Raw points first, then the enclosing polygons.
    plt.scatter(x1, y1)
    plt.scatter(x2, y2)
    # First cluster: black edge with a translucent gold fill.
    DrawTools.drawPloygon(x1, y1, ax=None, ec="k", fc="gold", alpha=0.1)
    # Second cluster: light-blue outline only, no fill.
    DrawTools.drawPloygon(x2, y2, ax=None, ec="lightblue", fc="none", linewidth=1.5)
    plt.show()
Example #3
0
def test5():
    """Jittered strip plot of city mileage (cty) vs highway mileage (hwy).

    Fixes: configure the CJK font before any labelled text is drawn, pass
    x/y to seaborn by keyword (they are keyword-only since seaborn 0.12),
    and label the x axis as city mileage — the original "气缸数量"
    (cylinder count) label did not match the plotted data.
    """
    df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
    # Set the CJK-capable font before drawing any text that uses it.
    plt.rcParams['font.sans-serif'] = ['Simhei']
    # sns.stripplot draws the jittered scatter.
    sns.stripplot(x=df.cty, y=df.hwy,
                  jitter=0.25,  # amount of horizontal jitter
                  size=8, ax=ax,
                  linewidth=.5,
                  palette='Reds')
    # Decorations
    plt.title('Use jittered plots to avoid overlapping of points', fontsize=22)
    plt.xlabel("城市里程/加仑", fontsize=16)
    plt.ylabel("公路里程/加仑", fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
Example #4
0
def test7():
    """Marginal-histogram figure: displ-vs-hwy scatter with edge histograms.

    Fixes the tick-label rewrite at the end: the original reused the raw
    tick values, which silently did nothing (and warns on recent matplotlib
    without a fixed locator); now the labels are rounded to one decimal as
    the comment intended.
    """
    # displ = engine displacement (L), cyl = cylinder count, hwy = highway mpg.
    data = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig = plt.figure(figsize=(16, 10), dpi=80)
    # 4x4 grid: main scatter takes the top-left 3x3, histograms the margins.
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)
    ax_main = fig.add_subplot(grid[:-1, :-1])
    ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])
    ax_bottom = fig.add_subplot(grid[-1, :-1], xticklabels=[], yticklabels=[])
    # Main scatter: colour by manufacturer, point size by city mileage.
    DrawTools.scatter(data=data, feature_x='displ', feature_y='hwy', feature_c='manufacturer', feature_s='cty',
                      ax=ax_main)
    DrawTools.font_desc(ax_main, title='边缘直方图 \n 发动机排量 vs 公路里程/加仑', xlabel='发动机排量(L)', ylabel='公里路程/加仑')
    DrawTools.hist(data, feature_x='hwy', bins=40, vertical=False, color='deeppink', ax=ax_right)
    DrawTools.hist(data, feature_x='displ', bins=40, vertical=True, invert_y=True, color='deeppink',
                   ax=ax_bottom)

    # Re-render the x tick labels rounded to one decimal place; pin the tick
    # locations first so set_xticklabels is warning-free.
    ticks = ax_main.get_xticks().tolist()
    ax_main.set_xticks(ticks)
    ax_main.set_xticklabels([round(t, 1) for t in ticks])
    plt.show()
Example #5
0
def test4():
    """Scatter plot of city mileage (cty) against highway mileage (hwy)."""
    url = "https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv"
    data = pd.read_csv(url)
    DrawTools.init()
    # Draw the point cloud, then attach the Chinese title and axis labels.
    DrawTools.scatter(data=data, feature_x='cty', feature_y='hwy')
    DrawTools.font_desc(title='关系图', xlabel='城市里程/加仑',
                        ylabel='公路里程/加仑', legends=None)
    plt.show()
Example #6
0
def test6():
    """Jittered strip plot of cty vs hwy using the DrawTools wrapper.

    Fixes the x-axis label: the plot's x feature is 'cty' (city mileage),
    but the original label said "气缸数量" (cylinder count), apparently
    copy-pasted from another plot.
    """
    data = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
    DrawTools.stripplot(data, 'cty', 'hwy', ax)
    DrawTools.font_desc(ax, 'Use jittered plots to avoid overlapping of points', "城市里程/加仑", "公路里程/加仑")
    plt.show()
Example #7
0
def test8():
    """Marginal-boxplot figure: displ-vs-hwy scatter with edge box plots.

    Fixes the tick-label rewrite at the end: the original reused the raw
    tick values, which silently did nothing (and warns on recent matplotlib
    without a fixed locator); now the labels are rounded to one decimal as
    the comment intended.
    """
    data = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    fig = plt.figure(figsize=(16, 10), dpi=80)
    # 4x4 grid: main scatter takes the top-left 3x3, box plots the margins.
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)
    ax_main = fig.add_subplot(grid[:-1, :-1])
    ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])
    ax_bottom = fig.add_subplot(grid[-1, :-1], xticklabels=[], yticklabels=[])
    # displ = engine displacement (L), hwy = highway miles per gallon.
    DrawTools.scatter(data=data, feature_x='displ', feature_y='hwy', feature_c='manufacturer', feature_s='cty',
                      ax=ax_main, xlim=(1, 7), ylim=(0, 50))
    DrawTools.font_desc(ax_main, title='边缘直方图 \n 发动机排量 vs 公路里程/加仑', xlabel='发动机排量(L)', ylabel='公里路程/加仑')
    # Box plots on the right and bottom margins.
    DrawTools.boxplot(data, 'hwy', vertical=True, color="red", ax=ax_right)
    DrawTools.boxplot(data, 'displ', vertical=False, color="red", ax=ax_bottom)
    ax_bottom.set(xlabel='')
    ax_right.set(ylabel='')

    # Re-render the x tick labels rounded to one decimal place; pin the tick
    # locations first so set_xticklabels is warning-free.
    ticks = ax_main.get_xticks().tolist()
    ax_main.set_xticks(ticks)
    ax_main.set_xticklabels([round(t, 1) for t in ticks])
    plt.show()
Example #8
0
def test5():
    """Overlay a jittered strip plot on a box plot of hwy per vehicle class."""
    url = "https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv"
    df = pd.read_csv(url)
    DrawTools.init()
    # Points first, then the boxes on the same axes.
    DrawTools.stripplot(df, 'class', 'hwy')
    DrawTools.boxplot(df, 'class', 'hwy', vertical=True)
    plt.show()
Example #9
0
def test9():
    """Correlation heat map of the mtcars dataset with Chinese column names."""
    df = pd.read_csv("https://github.com/selva86/datasets/raw/master/mtcars.csv")
    # Rename every column to its Chinese description before plotting.
    df.columns = [
        "英里/加仑", "气缸数量", "排量", "总马力", "驱动轴比", "重量",
        "1/4英里所用时间", "引擎", "变速器", "前进档数", "化油器数量",
        "用油是否高效", "汽车", "汽车名称",
    ]
    DrawTools.init()
    DrawTools.heatmap(df)
    # Rotate tick labels 45 degrees, right-aligned, for readability.
    DrawTools.font_desc(ax=None, title=u'mtcars数据集的相关性矩阵', tick_rotation=45, tick_horizontalalignment='right')
    plt.show()
Example #10
0
def test2():
    """Best-fit-line scatter (lmplot) of displ vs hwy, grouped by cyl.

    Dataset columns: manufacturer, model, displ, year, cyl, trans, drv,
    cty, hwy, fl, class.
    drv: 4 = four-wheel, f = front-wheel, r = rear-wheel drive.
    fl: fuel type (gasoline, diesel, electric, ...).
    class: pickup, SUV, compact, midsize, ...
    cty/hwy: miles per gallon in city/highway driving — higher means the
    car is more fuel-efficient.
    """
    df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
    DrawTools.init()
    DrawTools.lmplot(data=df, feature_x='displ', feature_y='hwy',
                     feature_h='cyl', xlim=(1, 7), ylim=(0, 50))
    DrawTools.font_desc(title='按气缸数分组的最佳拟合线散点图', xlabel='发动机排量(L)',
                        ylabel='公路里程/加仑', legends=['气缸数量4', '气缸数量8'])
    plt.show()
Example #11
0
def test6():
    """Violin plot of hwy per vehicle class with the raw points overlaid."""
    url = "https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv"
    df = pd.read_csv(url)
    DrawTools.init()
    # Violins first, then the jittered points on the same axes.
    DrawTools.violinplot(df, 'class', 'hwy')
    DrawTools.stripplot(df, 'class', 'hwy')
    plt.show()
Example #12
0
def test4():
    """Density plot of city mileage (cty) split by vehicle class."""
    url = "https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv"
    df = pd.read_csv(url)
    DrawTools.init()
    # Fixed axis limits so class curves are directly comparable.
    DrawTools.displot(df, 'cty', 'class', xlim=(5, 35), ylim=(0, 0.8))
    plt.legend()
    plt.show()
Example #13
0
def test3():
    """2x2 grid of KDE plots for cty, displ and hwy, each grouped by cyl."""
    url = "https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv"
    df = pd.read_csv(url)
    DrawTools.kdeplot_mul(df, ['cty', 'displ', 'hwy'], 'cyl', (2, 2))
    plt.show()
Example #14
0
# Split the combined (train+test) dummy-encoded frame back into the original
# train and test partitions by index.  `all_dummy_df`, `train_df`, `test_df`
# and `y_train` are defined earlier in the file (not visible here).
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

X_train = dummy_train_df.values
# X_test = dummy_test_df.values

from xgboost import XGBRegressor

# Baseline XGBoost regressor, scored with 10-fold cross-validation.
clf = XGBRegressor(max_depth=5, n_estimators=100)
# cross_val_score returns negated MSE; negate and take sqrt to get RMSE.
test_score = np.sqrt(-cross_val_score(
    clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
print(np.mean(test_score))

# max_depth sweep via sklearn cross-validation (kept for reference):
# params = [1, 2, 3, 4, 5, 6]
# test_scores = []
# for param in params:
#     clf = XGBRegressor(max_depth=param)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(params, test_scores)
# plt.title("max_depth vs CV Error")
# plt.show()
# print(test_scores)
# print(test_scores.mean())
from tools import DrawTools
# Fit on the full training set and plot per-feature importances.
clf.fit(X_train, y_train)
DrawTools.feature_importance(clf, dummy_train_df.columns)
plt.show()
Example #15
0
    plt.legend()
    plt.show()


# Load the semicolon-separated bank marketing dataset and summarise it.
data = pd.read_csv('bank.csv', sep=';')
print(data.info())
# Expect 7 integer columns.
# age = customer age, balance = account balance.
int_cols = data.dtypes[data.dtypes == 'int64']
obj_cols = data.dtypes[data.dtypes == 'object']
print('整数型:%d 个' % int_cols.count())
print('对象型:%d 个' % obj_cols.count())
# data = standard(data)
# `encode` is a helper defined earlier in the file (not visible here);
# presumably it encodes the object-typed columns — verify against its def.
data = encode(data)

DrawTools.init()

# for obj_col in obj_cols.index:
#     print(data[obj_col].value_counts())

# Correlation heat map over the encoded frame.
DrawTools.heatmap(data)
plt.legend()
plt.show()

# Feature-engineering experiments; scores recorded from earlier runs:
# train score: 0.939159
#  test score: 0.912707

# train score: 0.941925
#  test score: 0.918232
g = data.groupby(['month', 'day']).agg({
Example #16
0
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from tools import ModelTools
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# HR attrition dataset columns:
# ['satisfaction_level', 'last_evaluation', 'number_project',
#        'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
#        'promotion_last_5years', 'sales', 'salary'],
data = pd.read_csv('data/HR_comma_sep.csv')
# Numeric/ordinal feature columns (currently only used by the commented-out
# distribution plot below).
col = [
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_montly_hours', 'time_spend_company', 'Work_accident',
    'promotion_last_5years', 'salary'
]
# Ordinal-encode salary: low < medium < high.
data['salary'].replace({'low': 0, 'medium': 1, 'high': 2}, inplace=True)
DrawTools.init()
# DrawTools.displot_mul(data, feature_xs=col, feature_h='left', grid=(3, 3))
plt.show()
# One-hot encode the remaining object columns (e.g. 'sales').
data = pd.get_dummies(data)

# 'left' (attrition flag) is the target; everything else is a feature.
y = data['left']
X = data.loc[:, data.columns != 'left']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                y,
                                                test_size=0.2,
                                                random_state=0)
# Fit a default XGBoost classifier and predict on both partitions.
model = XGBClassifier()
model = model.fit(Xtrain, ytrain)
ytrain_pred = model.predict(Xtrain)
ytest_pred = model.predict(Xtest)
Example #17
0
def grid_search_cv(X,
                   y,
                   model,
                   grid_param,
                   metrics,
                   scoring,
                   kflod,
                   fpreproc=None,
                   weight=None):
    """Two-stage hyper-parameter tuning for an XGBoost model.

    Stage 1: when ``grid_param`` contains ``n_estimators``, run xgboost's own
    cross-validation (project helper ``xgb_cv``) to pick the boosting-round
    count, plot the CV curves, write the result back to
    ``model.n_estimators`` and remove the key from ``grid_param``.
    Stage 2: grid-search any remaining parameters with ``GridSearchCV``,
    plot mean train/test scores per candidate, and set the best parameters
    on ``model``.

    NOTE(review): ``grid_param`` is mutated in place (``pop``), which the
    caller may not expect — confirm before changing.  ``kflod`` is
    presumably a typo for ``kfold``; kept for caller compatibility.

    Returns the tuned ``model``.
    """
    if (grid_param.get('n_estimators') is not None):
        # Let xgboost's CV determine the effective number of boosting rounds.
        cv_result = xgb_cv(X,
                           y,
                           model.get_xgb_params(),
                           num_boost_round=model.n_estimators,
                           metrics=metrics,
                           fpreproc=fpreproc,
                           missing=model.missing,
                           weight=weight)
        DrawTools.xgb_cv_results(cv_result, metrics)
        # The CV frame's length is taken as the chosen round count
        # (presumably xgb_cv stops early — verify against its definition).
        model.n_estimators = len(cv_result)
        grid_param.pop('n_estimators')
        print(
            'best:', {'n_estimators': len(cv_result)},
            cv_result.loc[len(cv_result) - 1, 'test-' + metrics[0] + '-mean'])
    if len(grid_param) > 0:
        print(grid_param)
        gsearch = GridSearchCV(model,
                               param_grid=grid_param,
                               scoring=scoring,
                               cv=kflod,
                               return_train_score=True)
        gsearch.fit(X, y)
        cv_results = gsearch.cv_results_
        mean_test_score = cv_results['mean_test_score']
        mean_train_score = cv_results['mean_train_score']
        # One x tick per parameter combination, labelled with its values
        # rounded to 3 decimals, rotated for readability.
        plt.xticks(range(0, len(cv_results['params'])),
                   labels=[[round(param, 3) for param in params.values()]
                           for params in cv_results['params']],
                   rotation=90)
        plt.plot(mean_test_score, label='test')
        plt.plot(mean_train_score, label='train')
        plt.ylabel(scoring)
        plt.xlabel(list(cv_results['params'][0].keys()))
        plt.legend()
        plt.grid()
        plt.show()

        # Alternative 2-D visualisation for a max_depth x min_child_weight
        # grid (kept for reference):
        # param_grid_max_depth = param_grid_1['max_depth']
        # param_grid_min_child_weight = param_grid_1['min_child_weight']

        # mean_test_score = cv_results['mean_test_score'].reshape(len(param_grid_max_depth), len(param_grid_min_child_weight))
        # for i, value in enumerate(param_grid_max_depth):
        #     print(-mean_test_score[i])
        #     plt.plot(param_grid_min_child_weight, mean_test_score[i], label=value)
        # plt.xlabel('min_child_weight')
        # plt.legend()
        # plt.show()

        print('best:', gsearch.best_params_, gsearch.best_score_)
        model.set_params(**gsearch.best_params_)
    return model
Example #18
0
def test3():
    """Per-group best-fit-line scatter plots of displ vs hwy, split by cyl."""
    url = "https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv"
    df = pd.read_csv(url)
    DrawTools.init()
    DrawTools.lmplot_mul(data=df, feature_x='displ', feature_y='hwy',
                         feature_h='cyl', xlim=(1, 7), ylim=(0, 50))
    DrawTools.font_desc(title='按气缸数分组的最佳拟合线散点图', xlabel='发动机排量(L)',
                        ylabel='公路里程/加仑', legends=['气缸数量4', '气缸数量8'])
    plt.show()
Example #19
0
def test10():
    """Pairwise scatter-plot matrix of the iris dataset, coloured by species."""
    iris = sns.load_dataset('iris')
    DrawTools.init()
    DrawTools.pairplot(iris, 'species')
    plt.show()
Example #20
0
        stock = stock.tail(240).reset_index()
        drawK(ax, stock)
        drawSMA(ax, stock, periods=[20, 60])
        drawDate(ax, stock)
        ax.set_ylabel(roe_data.name.iloc[index], fontproperties=font)
        # ax.legend()
        ax = flg.add_subplot(grid[0], grid[1], grid[1] * (row + 1) + index + 1)
        ax.plot(roe_data.iloc[index, 2:])
        ax.set_ylabel(roe_data.ts_code.iloc[index], fontproperties=font)
        print(row, grid[1] * row + index + 1, grid[1] * (row + 1) + index + 1)
    plt.show()


from tools import DrawTools

DrawTools.init()
# Screening parameters: no industry/name filter, show rows [start, end)
# after sorting by 2019-Q4 ROE.
industry = None
name = None
start = 0
end = 12
sort_by = 'roe_2019_4'

# `base_path` and `roe_mul` are defined earlier in the file (not visible here).
roe_data = pd.read_csv(base_path + 'roe.csv')
# Year-over-year ROE change and the Q4-vs-Q3 delta.
roe_data['roe_2019_pct'] = (roe_data['roe_2019_4'] - roe_data['roe_2018']) / roe_data['roe_2018']
roe_data['roe_2019_4_diff'] = roe_data['roe_2019_4'] - roe_data['roe_2019_3']
# roe_data = roe_data.loc[roe_data.roe_2019_3 < 0]
roe_data = roe_data.sort_values(by=sort_by, ascending=False)
# Keep only the name/code columns plus the four quarterly ROE columns,
# then plot the selected slice of stocks.
draw_data = roe_data[['name', 'ts_code', 'roe_2019_1', 'roe_2019_2', 'roe_2019_3', 'roe_2019_4']]
draw_data = draw_data.iloc[start:end, :]
roe_mul(draw_data)
# k_mul(['600083.SH','002234.SZ','600768.SH','000526.SZ'],['中国','上海新阳','c','d'])
Example #21
0
def test5():
    """Scatter-plot the midwest dataset, then fit one logistic regression
    per character of the 3-letter category label and report accuracy and
    the most influential features.

    NOTE(review): this function may continue beyond the end of this excerpt.
    """
    midwest = pd.read_csv(
        "https://raw.githubusercontent.com/selva86/datasets/master/midwest_filter.csv"
    )
    # Bubble scatter for three selected categories, sized by Asian population.
    midwest_1 = midwest.loc[midwest.category.isin(['AHR', 'HAU', 'LHU']), :]
    plt = DrawTools.drawA(midwest_1,
                          feature_x='area',
                          feature_y='poptotal',
                          label_name='category',
                          feature_s='popasian',
                          feature_text='county',
                          title='Scatterplot of Midwest Area vs Population')
    plt.show()

    # Chinese display names, in column order.
    midwest.columns = [
        "城市ID", "郡", "州", "面积", "总人口", "人口密度", "白人人口", "非裔人口", "美洲印第安人人口",
        "亚洲人口", "其他人种人口", "白人所占比例", "非裔所占比例", "美洲印第安人所占比例", "亚洲人所占比例",
        "其他人种比例", "成年人口", "具有高中文凭的比率", "大学文凭比例", "有工作的人群比例", "已知贫困人口",
        "已知贫困人口的比例", "贫困线以下的人的比例", "贫困线以下的儿童所占比例", "贫困的成年人所占的比例",
        "贫困的老年人所占的比例", "是否拥有地铁", "标签", "点的尺寸"
    ]

    # Split the 3-character category label into one column per character.
    for i in range(3):
        midwest['c' + str(i)] = midwest['标签'].apply(lambda x: x[i])
    # Ordinal-encode the three new label columns.
    midwest.iloc[:, -3:] = OrdinalEncoder().fit_transform(midwest.iloc[:, -3:])
    midwest = midwest.loc[:, midwest.dtypes.values != 'O']  # 'O' = object dtype (capital O)
    # Cast the integer columns to float so they survive standardisation.
    midwest.loc[:, midwest.dtypes.values ==
                'int64'] = midwest.loc[:, midwest.dtypes.values ==
                                       'int64'].astype(np.float64)
    midwest = midwest.iloc[:, [*range(1, 25), 26, 27,
                               28]]  # drop the city-id and point-size columns
    # Standardise the 23 numeric feature columns.
    midwest.iloc[:, [*range(23)]] = StandardScaler().fit_transform(
        midwest.iloc[:, [*range(23)]])

    # Last three columns are the per-character targets; the rest are features.
    xtrain, xtest, ytrain, ytest = train_test_split(midwest.iloc[:, :-3],
                                                    midwest.iloc[:, -3:],
                                                    test_size=0.3,
                                                    random_state=420)
    for index in range(3):
        lr = LR(solver='newton-cg',
                multi_class='multinomial',
                random_state=420,
                max_iter=100**20)
        lr = lr.fit(xtrain, ytrain.iloc[:, index])
        print(lr.score(xtrain, ytrain.iloc[:, index]))
        print(lr.score(xtest, ytest.iloc[:, index]))
        coef = pd.DataFrame(lr.coef_).T

        if index < 2:
            # abs() so large negative coefficients also rank as important.
            coef['mean'] = abs(coef).mean(axis=1)
            coef['name'] = xtrain.columns
            coef.columns = ["Average", "High", "Low", "mean", "name"]
            coef = coef.sort_values(by='mean', ascending=False)

        else:
            coef.columns = ["value"]
            coef['name'] = xtrain.columns
            coef = coef.sort_values(by='value', ascending=False)
        print(coef.head())