def run_main():
    """
    Main function
    """
    # Step 0: load data
    filepath = './dataset/Airplane_Crashes_and_Fatalities_Since_1908.csv'
    air_data = pd.read_csv(filepath)

    # Step 1: inspect data
    inspect_dataset(air_data)

    # Step 2: data transformation
    air_data = add_year_to_data(air_data)

    # Step 3: analysis and visualization
    # Step 3.1: crashes vs. year
    plot_crashes_vs_year(air_data, 'sns')
    plot_crashes_vs_year(air_data, 'bokeh')

    # Step 3.2: passengers aboard vs. fatalities vs. year
    plot_aboard_vs_fatalities_vs_year(air_data, 'sns')
    plot_aboard_vs_fatalities_vs_year(air_data, 'bokeh')

    # Step 3.3: top-N aircraft types and operators
    plot_top_n(air_data, 'Type', top_n=20,
               save_file_path='./output/top_20_type.csv',
               save_fig_path='./output/top_20_type.png')
    plot_top_n(air_data, 'Operator', top_n=20,
               save_file_path='./output/top_20_operator.csv',
               save_fig_path='./output/top_20_operator.png')
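# The helpers called above are defined elsewhere in the project; a minimal
# sketch of add_year_to_data is given below for reference. It assumes the
# dataset's 'Date' column holds strings like '09/17/1908' (as in the Kaggle
# "Airplane Crashes Since 1908" CSV); the added column name 'Year' is an
# assumption, not confirmed by the source.
def add_year_to_data(air_data):
    """Add a numeric 'Year' column parsed from the 'Date' column (a sketch)."""
    air_data = air_data.copy()
    air_data['Year'] = pd.to_datetime(air_data['Date']).dt.year
    return air_data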
def run_main():
    """
    Main function
    """
    # Step 0: load data
    df_data = pd.read_csv(dataset_path)

    # Step 1: inspect data
    inspect_dataset(df_data)

    # Step 2: handle missing data
    df_data = process_missing_data(df_data)

    # Step 3.1: visualize league attributes
    # (four attributes chosen as examples, plus the LeagueIndex key)
    column_names = [
        'LeagueIndex',   # league index
        'HoursPerWeek',  # hours played per week
        'Age',           # player age
        'APM',           # actions per minute
        'WorkersMade'    # workers built per unit time
    ]
    visualize_league_attributes(df_data[column_names])

    # Step 3.2: visualize summary statistics of league attributes
    visualize_league_attribute_stats(
        df_data[column_names], 'APM',
        savedata_path='./league_apm_stats.csv',
        savefig_path='./league_apm_stats.png',
    )
    visualize_league_attribute_stats(
        df_data[column_names], 'HoursPerWeek',
        savedata_path='./league_hrs_stats.csv',
        savefig_path='./league_hrs_stats.png',
    )
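# process_missing_data is defined elsewhere; a minimal sketch is shown below
# under the assumption that rows with any missing attribute are simply
# dropped (imputation would also be reasonable and is not ruled out by the
# source).
def process_missing_data(df_data):
    """Drop rows containing missing values (a sketch)."""
    if df_data.isnull().values.any():
        df_data = df_data.dropna()
    return df_data.reset_index(drop=True)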
def run_main():
    """
    Main function
    """
    # Unzip the dataset
    print('Unzipping...', end=' ')
    unzip(zip_filepath, dataset_path)
    print('done.')

    # 1. Inspect the dataset
    df_data = pd.read_csv(dataset_filepath)
    inspect_dataset(df_data)

    # 2. Handle missing data
    df_data = process_missing_data(df_data)

    # 3. Build features and restructure the data
    # Restructure into "paired" samples for the prediction model
    pair_data, labels, features = get_pair_data(df_data)

    # Feature selection
    if is_feat_select:
        pair_data, selected_features = select_features(pair_data, labels, features)
        print('Selected features:', selected_features)

    n_pos_samples = labels[labels == 1].shape[0]
    n_neg_samples = labels[labels == 0].shape[0]
    print('Positive samples: %d' % n_pos_samples)
    print('Negative samples: %d' % n_neg_samples)

    # Handle class imbalance
    if is_process_unbalanced_data:
        pair_data, labels = balance_samples(pair_data, labels)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        pair_data, labels, test_size=0.1, random_state=random_seed)

    # 4. Train and test the models
    print('Logistic regression model:')
    logistic_model = train_model(X_train, y_train,
                                 model_name='logistic_regression', is_cv=is_cv)
    logistic_model_predictions = logistic_model.predict(X_test)
    logistic_model_prob_predictions = logistic_model.predict_proba(X_test)
    # Report test results
    print_test_results(y_test, logistic_model_predictions,
                       logistic_model_prob_predictions)

    print('SVM model:')
    svm_model = train_model(X_train, y_train, model_name='svm', is_cv=is_cv)
    svm_model_predictions = svm_model.predict(X_test)
    svm_model_prob_predictions = svm_model.predict_proba(X_test)
    # Report test results
    print_test_results(y_test, svm_model_predictions, svm_model_prob_predictions)

    print('Random forest model:')
    rf_model = train_model(X_train, y_train, model_name='random_forest', is_cv=is_cv)
    rf_model_predictions = rf_model.predict(X_test)
    rf_model_prob_predictions = rf_model.predict_proba(X_test)
    # Report test results
    print_test_results(y_test, rf_model_predictions, rf_model_prob_predictions)

    # 5. Plot ROC curves
    plot_roc(y_test, logistic_model_prob_predictions,
             fig_title='Logistic Regression', savepath='./lr_roc.png')
    plot_roc(y_test, svm_model_prob_predictions,
             fig_title='SVM', savepath='./svm_roc.png')
    plot_roc(y_test, rf_model_prob_predictions,
             fig_title='Random Forest', savepath='./rf_roc.png')

    # Remove the unzipped data to free disk space
    if os.path.exists(dataset_filepath):
        print('Analysis finished; cleaning up.')
        os.remove(dataset_filepath)
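# unzip is a project helper; a minimal sketch using the standard-library
# zipfile module. The argument order (archive path, extraction directory)
# follows the call site above; the actual implementation may differ.
import zipfile

def unzip(zip_path, extract_dir):
    """Extract all members of a zip archive into extract_dir (a sketch)."""
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(extract_dir)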
    stats_data = pd.concat([
        league_ser, stats_min_ser, stats_max_ser, stats_mean_ser
    ], axis=1)
    print(stats_data)

    fig = plt.figure(figsize=(10.0, 10.0))
    plt.xlabel('League')
    plt.title(attr_label + ' statistics')
    plt.plot(stats_data['LeagueIndex'], stats_data['min'], color='green')
    plt.plot(stats_data['LeagueIndex'], stats_data['max'], color='red')
    plt.plot(stats_data['LeagueIndex'], stats_data['mean'], color='blue')

    blue_patch = mpatches.Patch(color='blue', label='Average ' + attr_label)
    green_patch = mpatches.Patch(color='green', label='Min ' + attr_label)
    red_patch = mpatches.Patch(color='red', label='Max ' + attr_label)
    plt.legend(handles=[blue_patch, red_patch, green_patch])
    plt.show()

    return stats_data


if __name__ == '__main__':
    filepath = r'D:\little_elephant\lecture05_codes\codes\lecture05_proj\dataset\starcraft.csv'
    df_data = inspect_dataset(filepath)
    df_data = process_missing_data(df_data)
    column_names = ['LeagueIndex', 'APM']
    stats_data = visualize_league_attribute_stats(df_data[column_names], 'APM')
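# For context: the series concatenated at the top of the fragment above can
# be produced with a groupby over LeagueIndex. A minimal sketch, assuming df
# holds the selected columns and attr is e.g. 'APM'; the variable names
# mirror those used above, but this exact construction is an assumption.
def compute_league_stats(df, attr):
    """Return per-league index/min/max/mean series for one attribute (a sketch)."""
    grouped = df.groupby('LeagueIndex')[attr]
    # groupby sorts keys, so these series align positionally
    league_ser = pd.Series(sorted(df['LeagueIndex'].unique()), name='LeagueIndex')
    stats_min_ser = grouped.min().reset_index(drop=True).rename('min')
    stats_max_ser = grouped.max().reset_index(drop=True).rename('max')
    stats_mean_ser = grouped.mean().reset_index(drop=True).rename('mean')
    return league_ser, stats_min_ser, stats_max_ser, stats_mean_ser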
def run_main():
    """
    Main function
    """
    # Step 0: load data
    df_data = pd.read_csv(dataset_path)

    # Step 1: inspect data
    inspect_dataset(df_data)

    # Step 2: handle missing data
    df_data = process_missing_data(df_data)

    # Step 3: summarize the dataset with grouped statistics
    # Step 3.1: gross revenue statistics (multiple column names may be passed)
    # Director vs. total gross
    analyze_gross(df_data, ['director_name'], './output/director_gross.csv')
    # Lead actor vs. total gross
    analyze_gross(df_data, ['actor_1_name'], './output/actor_gross.csv')
    # Director + lead actor vs. gross
    analyze_gross(df_data, ['director_name', 'actor_1_name'],
                  './output/director_actor_gross.csv')

    # Step 3.2: IMDB score statistics
    # Number of movies at each IMDB score
    df_ratings = df_data.groupby('imdb_score')['movie_title'].count()
    plt.figure()
    df_ratings.plot()
    plt.savefig('./output/imdb_scores.png')
    plt.show()

    # Top 20 directors by mean IMDB score
    df_director_mean_ratings = df_data.groupby('director_name')['imdb_score'].mean()
    top20_imdb_directors = df_director_mean_ratings.sort_values(ascending=False)[:20]
    plt.figure(figsize=(18.0, 10.0))
    top20_imdb_directors.plot(kind='barh')
    plt.savefig('./output/top20_imdb_directors.png')
    plt.show()

    # Step 3.3: movie production trend over the years
    df_movie_years = df_data.groupby('title_year')['movie_title'].count()
    plt.figure()
    df_movie_years.plot()
    plt.savefig('./output/movie_years.png')
    plt.show()

    # Step 4: genre analysis
    # Movie count per genre
    df_genres = get_genres_data(df_data)
    genres_count = df_genres.groupby('genre').size()
    plt.figure(figsize=(15.0, 10.0))
    genres_count.plot(kind='barh')
    plt.savefig('./output/genres_count.png')
    plt.show()

    # Gross revenue per genre
    genres_gross = df_genres.groupby('genre')['gross'].sum()
    plt.figure(figsize=(15.0, 10.0))
    genres_gross.plot(kind='barh')
    plt.savefig('./output/genres_gross.png')
    plt.show()
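# get_genres_data is defined elsewhere; a minimal sketch. In the IMDB 5000
# movie dataset the 'genres' column is pipe-separated (e.g.
# 'Action|Adventure|Sci-Fi'), so each movie row is expanded into one row per
# genre; the output column names 'genre' and 'gross' follow the usage above.
def get_genres_data(df_data):
    """Expand pipe-separated genres into one (genre, gross) row each (a sketch)."""
    rows = []
    for _, row in df_data.iterrows():
        for genre in str(row['genres']).split('|'):
            rows.append({'genre': genre, 'gross': row['gross']})
    return pd.DataFrame(rows)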
def run_main():
    """
    Main function
    """
    # Step 0: load data
    filepath = './dataset/voice.csv'
    voice_data = pd.read_csv(filepath)

    # Step 1: inspect data
    inspect_dataset(voice_data)
    # Count of samples per label
    print(voice_data['label'].value_counts())

    # Step 2: handle missing data
    voice_data = process_missing_data(voice_data)

    # Step 3: visualize feature distributions
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'
    visualize_two_features(voice_data, fea_name1, fea_name2)
    visualize_single_feature(voice_data, fea_name1)
    fea_names = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    visualize_multiple_features(voice_data, fea_names)

    # Step 4: prepare the data
    X = voice_data.iloc[:, :-1].values
    voice_data['label'] = voice_data['label'].replace({'male': 0, 'female': 1})
    y = voice_data['label'].values

    # Standardize features (zero mean, unit variance)
    X = preprocessing.scale(X)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1/3., random_state=5)

    # Model selection via cross-validation
    k_range = range(1, 31)
    cv_scores = []
    print('Cross-validation:')
    for k in k_range:
        knn = KNeighborsClassifier(k)
        scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i: %.4f' % (k, score_mean))
    best_k = np.argmax(cv_scores) + 1  # k_range starts at 1, so index + 1 maps back to k
    print('Best K:', best_k)

    plt.plot(k_range, cv_scores)
    plt.xlabel('K')
    plt.ylabel('Accuracy')
    plt.show()

    # Train and test the final model
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('Test accuracy:', knn_model.score(X_test, y_test))
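# visualize_two_features and its siblings are project helpers; a minimal
# sketch of the two-feature case using seaborn, assuming the goal is to see
# how the two features separate the 'male'/'female' classes. The choice of
# a hue-colored scatter plot is an assumption about the original style.
import seaborn as sns

def visualize_two_features(voice_data, fea_name1, fea_name2):
    """Scatter two features against each other, colored by label (a sketch)."""
    sns.scatterplot(data=voice_data, x=fea_name1, y=fea_name2, hue='label')
    plt.show()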