Example #1
def run_main():
    """
        主函数
    """
    # Step.0 加载数据
    filepath = './dataset/Airplane_Crashes_and_Fatalities_Since_1908.csv'
    air_data = pd.read_csv(filepath)

    # Step.1 Inspect the data
    inspect_dataset(air_data)

    # Step.2 Transform the data
    air_data = add_year_to_data(air_data)

    # Step.3 Analysis and visualization
    # Step.3.1 Crashes vs. year
    plot_crashes_vs_year(air_data, 'sns')
    plot_crashes_vs_year(air_data, 'bokeh')

    # Step.3.2 Aboard vs. fatalities vs. year
    plot_aboard_vs_fatalities_vs_year(air_data, 'sns')
    plot_aboard_vs_fatalities_vs_year(air_data, 'bokeh')

    # Step.3.3 Top-N aircraft types and operators
    plot_top_n(air_data, 'Type', top_n=20,
               save_file_path='./output/top_50_type.csv',
               save_fig_path='./output/top_50_type.png')
    plot_top_n(air_data, 'Operator', top_n=20,
               save_file_path='./output/top_50_operator.csv',
               save_fig_path='./output/top_50_operator.png')
Example #2
def run_main():
    """
            主函数
    """

    ## Step.0 Load the data
    df_data = pd.read_csv(dataset_path)

    ## Step.1 Inspect the data
    inspect_dataset(df_data)

    ## Step.2 Handle missing data
    df_data = process_missing_data(df_data)

    ## Step.3.1 Visualize league attributes (four attributes chosen as examples)
    column_names = [
        'LeagueIndex',  # League index
        'HoursPerWeek',  # Hours played per week
        'Age',  # Player age
        'APM',  # Actions per minute
        'WorkersMade'  # Workers made per unit of time
    ]
    ]
    visualize_league_attributes(df_data[column_names])

    ## Step.3.2 Visualize league attribute statistics
    visualize_league_attribute_stats(
        df_data[column_names],
        'APM',
        savedata_path='./league_apm_stats.csv',
        savefig_path='./league_apm_stats.png',
    )

    visualize_league_attribute_stats(
        df_data[column_names],
        'HoursPerWeek',
        savedata_path='./league_hrs_stats.csv',
        savefig_path='./league_hrs_stats.png',
    )
Example #3
def run_main():
    """
            主函数
    """

    ## 解压数据集
    print "解压zip...",
    unzip(zip_filepath, dataset_path)
    print "完成."

    ## 1. Inspect the dataset
    df_data = pd.read_csv(dataset_filepath)
    inspect_dataset(df_data)

    ## 2. Handle missing data
    df_data = process_missing_data(df_data)

    ## 3. Build features and restructure the data
    # Build the restructured "paired" data to feed into the prediction models
    pair_data, labels, features = get_pair_data(df_data)

    # Feature selection
    if is_feat_select:
        pair_data, selected_features = select_features(pair_data, labels,
                                                       features)
        print('Selected features:', selected_features)

    n_pos_samples = labels[labels == 1].shape[0]
    n_neg_samples = labels[labels == 0].shape[0]
    print('Positive samples: %d' % n_pos_samples)
    print('Negative samples: %d' % n_neg_samples)

    # Handle the class imbalance
    if is_process_unbalanced_data:
        pair_data, labels = balance_samples(pair_data, labels)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        pair_data, labels, test_size=0.1, random_state=random_seed)

    ## 4. Train and evaluate the models
    print('Logistic regression model:')
    logistic_model = train_model(X_train,
                                 y_train,
                                 model_name='logistic_regression',
                                 is_cv=is_cv)
    logistic_model_predictions = logistic_model.predict(X_test)
    logistic_model_prob_predictions = logistic_model.predict_proba(X_test)
    # Print prediction results
    print_test_results(y_test, logistic_model_predictions,
                       logistic_model_prob_predictions)

    print "支持向量机模型:"
    svm_model = train_model(X_train, y_train, model_name='svm', is_cv=is_cv)
    svm_model_predictions = svm_model.predict(X_test)
    svm_model_prob_predictions = svm_model.predict_proba(X_test)
    # Print prediction results
    print_test_results(y_test, svm_model_predictions,
                       svm_model_prob_predictions)

    print "随机森林模型:"
    rf_model = train_model(X_train,
                           y_train,
                           model_name='random_forest',
                           is_cv=is_cv)
    rf_model_predictions = rf_model.predict(X_test)
    rf_model_prob_predictions = rf_model.predict_proba(X_test)
    # Print prediction results
    print_test_results(y_test, rf_model_predictions, rf_model_prob_predictions)

    ## 5. Plot ROC curves
    plot_roc(y_test,
             logistic_model_prob_predictions,
             fig_title='Logistic Regression',
             savepath='./lr_roc.png')
    plot_roc(y_test,
             svm_model_prob_predictions,
             fig_title='SVM',
             savepath='./svm_roc.png')
    plot_roc(y_test,
             rf_model_prob_predictions,
             fig_title='Random Forest',
             savepath='./rf_roc.png')

    # Remove the extracted data to free up disk space
    if os.path.exists(dataset_filepath):
        print('Analysis finished, cleaning up.')
        os.remove(dataset_filepath)
Example #4
    stats_data = pd.concat(
        [league_ser, stats_min_ser, stats_max_ser, stats_mean_ser], axis=1)

    print(stats_data)

    fig = plt.figure(figsize=(10.0, 10.0))
    plt.xlabel('League')
    plt.title('APM statistics')
    plt.plot(stats_data['LeagueIndex'], stats_data['min'], color='green')
    plt.plot(stats_data['LeagueIndex'], stats_data['max'], color='red')
    plt.plot(stats_data['LeagueIndex'], stats_data['mean'], color='blue')
    blue_patch = mpatches.Patch(color='blue', label='Average ' + attr_label)
    green_patch = mpatches.Patch(color='green', label='Min ' + attr_label)
    red_patch = mpatches.Patch(color='red', label='Max ' + attr_label)
    plt.legend(handles=[blue_patch, red_patch, green_patch])
    plt.show()

    return stats_data


if __name__ == '__main__':
    filepath = r'D:\little_elephant\lecture05_codes\codes\lecture05_proj\dataset\starcraft.csv'
    df_data = inspect_dataset(filepath)
    df_data = process_missing_data(df_data)
    column_names = ['LeagueIndex', 'APM']
    # print(df_data[column_names])
    status_data = visualize_league_attribute_status(df_data[column_names],
                                                    'APM')
Example #5
def run_main():
    """
        主函数
    """
    ## Step.0 加载数据
    df_data = pd.read_csv(dataset_path)

    ## Step.1 Inspect the data
    inspect_dataset(df_data)

    ## Step.2 Handle missing data
    df_data = process_missing_data(df_data)

    ## Step.3 Basic grouped statistics on the dataset
    ## Step.3.1 Gross revenue statistics (multiple column names can be passed in)
    # Director vs. total gross
    analyze_gross(df_data, ['director_name'], './output/director_gross.csv')

    # Lead actor vs. total gross
    analyze_gross(df_data, ['actor_1_name'], './output/actor_gross.csv')

    # Director + lead actor vs. gross
    analyze_gross(df_data, ['director_name', 'actor_1_name'],
                  './output/director_actor_gross.csv')

    # Step.3.2 IMDB score statistics
    # Number of movies per IMDB score
    df_ratings = df_data.groupby('imdb_score')['movie_title'].count()
    plt.figure()
    df_ratings.plot()
    plt.savefig('./output/imdb_scores.png')
    plt.show()

    # Average IMDB score of the top-20 directors
    df_director_mean_ratings = df_data.groupby(
        'director_name')['imdb_score'].mean()
    top20_imdb_directors = df_director_mean_ratings.sort_values(
        ascending=False)[:20]
    plt.figure(figsize=(18.0, 10.0))
    top20_imdb_directors.plot(kind='barh')
    plt.savefig('./output/top20_imdb_directors.png')
    plt.show()

    # Step.3.3 Movie production trend by year
    df_movie_years = df_data.groupby('title_year')['movie_title'].count()
    plt.figure()
    df_movie_years.plot()
    plt.savefig('./output/movie_years.png')
    plt.show()

    # Step.4 Genre analysis
    # Number of movies per genre
    df_genres = get_genres_data(df_data)
    genres_count = df_genres.groupby('genre').size()
    plt.figure(figsize=(15.0, 10.0))
    genres_count.plot(kind='barh')
    plt.savefig('./output/genres_count.png')
    plt.show()

    # Gross per genre
    genres_gross = df_genres.groupby('genre')['gross'].sum()
    plt.figure(figsize=(15.0, 10.0))
    genres_gross.plot(kind='barh')
    plt.savefig('./output/genres_gross.png')
    plt.show()
Example #6
def run_main():
    """
        主函数
    """
    # Step.0 加载数据
    filepath = './dataset/voice.csv'
    voice_data = pd.read_csv(filepath)

    # Step.1 Inspect the data
    inspect_dataset(voice_data)
    # Number of samples per label
    print(voice_data['label'].value_counts())

    # Step.2 Handle missing data
    voice_data = process_missing_data(voice_data)

    # Step.3 Visualize feature distributions
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'
    visualize_two_features(voice_data, fea_name1, fea_name2)

    visualize_single_feature(voice_data, fea_name1)

    fea_names = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    visualize_multiple_features(voice_data, fea_names)

    # Step.4 Prepare the data
    X = voice_data.iloc[:, :-1].values
    voice_data['label'] = voice_data['label'].replace({'male': 0, 'female': 1})
    y = voice_data['label'].values

    # Standardize the features (zero mean, unit variance)
    X = preprocessing.scale(X)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3., random_state=5)

    # Model selection via cross-validation
    k_range = range(1, 31)
    cv_scores = []
    print('Cross-validation:')
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X_train, y_train, cv=10,
                                 scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i: %.4f' % (k, score_mean))

    best_k = np.argmax(cv_scores) + 1
    print('Best k:', best_k)

    plt.plot(k_range, cv_scores)
    plt.xlabel('K')
    plt.ylabel('Accuracy')
    plt.show()

    # Train the final model
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('Test accuracy:', knn_model.score(X_test, y_test))