return None df_by_seasons = pd.pivot_table(data=df, index='hour', columns='season', values=['casual', 'registered', 'count'], aggfunc='sum', fill_value=0) df_by_seasons = df_by_seasons.stack().reset_index() df_by_seasons = pd.melt(df_by_seasons, id_vars=['hour', 'season']) df_by_seasons = df_by_seasons[df_by_seasons['variable'] == 'count'] df_by_seasons['season'] = df_by_seasons['season'].apply(set_season) utils.fix_matplotlib_error() plt.figure(figsize=(12, 9)) plt.xticks(range(0, 24)) plt.title('Demand Tendency by season') plt.grid() sns.lineplot(data=df_by_seasons, x='hour', y='value', hue='season', linestyle='-.', markers=True, palette='Blues') plt.show() # 一周不同时间段的骑行需求 # 从日期中提取出该日属于星期几
# Clean up the fields
# 1. Drop duplicate records
df.drop_duplicates(inplace=True, ignore_index=True)
# %%
# 2. Convert the event_time field to datetime and add a month field
df['event_time'] = pd.to_datetime(df['event_time'])
df['month'] = df['event_time'].dt.month
# %%
df.info()
print(df.head())
# %%
# 1. Analyse the user consumption trend (by month)
# Call the helper in utils that fixes plotting errors in plt
u.fix_matplotlib_error()
# %%
df_month = df.groupby('month', as_index=False)
# Restrict the sum to numeric columns: event_time is datetime64 and cannot be
# summed, so an unrestricted sum() raises TypeError on pandas >= 2.0 (where
# numeric_only defaults to False).
df_month_sum = df_month.sum(numeric_only=True)
# %%
plt.figure(figsize=(10, 6))

# Top panel: total amount spent per month.
plt.subplot(2, 1, 1)
sns.lineplot(x='month', y='price', data=df_month_sum)
plt.title('每月消费总金额')
plt.xticks(range(1, 13))

# Bottom panel: number of non-null user_id entries per month.
# NOTE(review): count() counts records, not distinct users, while the title
# reads "number of consumers per month" - confirm whether nunique() is meant.
plt.subplot(2, 1, 2)
df_month_count = df_month.count()
sns.lineplot(x='month', y='user_id', data=df_month_count)
plt.title('每月消费人数')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyecharts.charts import *
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score  # silhouette coefficient metric
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import common.utils as utils

file_path = r'数据分析项目/database/ad_performance.csv'
pd.set_option('display.max_columns', None)  # display all fields

# Load the data and discard the 'Unnamed: 0' column.
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])

utils.fix_matplotlib_error()  # fix garbled Chinese text in matplotlib

# Check the data for NA values, descriptive statistics and duplicates.
# Author's note on the show_info output: the 平均停留时间 (average dwell time)
# field has 2 missing values.
for report in (utils.show_na, utils.show_info, utils.show_duplicated):
    print(report(df))
# %%
# Inspect the rows with missing values. This is a bare expression, so it only
# displays something when run as an interactive cell.
missing_dwell_time = df['平均停留时间'].isna()
df[missing_dwell_time]
# Drop the records containing missing values and renumber the index from 0.
df = df.loc[~missing_dwell_time].reset_index(drop=True)

# Show the correlations between the fields