Beispiel #1
0
        return None


# Hourly demand broken down by season.
# Sum the casual / registered / count columns for every (hour, season) pair;
# fill_value=0 replaces cells of the pivot table that would otherwise be NaN.
df_by_seasons = pd.pivot_table(data=df,
                               index='hour',
                               columns='season',
                               values=['casual', 'registered', 'count'],
                               aggfunc='sum',
                               fill_value=0)

# Reshape to long form: stack() moves the season column level into the index,
# then melt() turns the three value columns into 'variable' / 'value' pairs.
df_by_seasons = df_by_seasons.stack().reset_index()
df_by_seasons = pd.melt(df_by_seasons, id_vars=['hour', 'season'])

# Keep only the overall 'count' series. The explicit .copy() makes the
# filtered frame independent of the melted one, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
df_by_seasons = df_by_seasons[df_by_seasons['variable'] == 'count'].copy()
# set_season is defined outside this excerpt; presumably it maps the season
# code to a readable label -- confirm against its definition.
df_by_seasons['season'] = df_by_seasons['season'].apply(set_season)

utils.fix_matplotlib_error()
plt.figure(figsize=(12, 9))
plt.xticks(range(0, 24))  # one tick per hour of the day
plt.title('Demand Tendency by season')
plt.grid()
# NOTE(review): seaborn appears to draw markers only per level of a `style`
# variable; with no `style` given, markers=True likely has no visible
# effect -- confirm whether style='season' (or marker='o') was intended.
sns.lineplot(data=df_by_seasons,
             x='hour',
             y='value',
             hue='season',
             linestyle='-.',
             markers=True,
             palette='Blues')
plt.show()

# Riding demand across the different time periods of the week
# Extract the day of the week from each date
Beispiel #2
0
# Clean up the fields
# 1. Drop duplicate records
df.drop_duplicates(inplace=True, ignore_index=True)
# %%
# 2. Convert event_time to a datetime column and add a month column
df['event_time'] = pd.to_datetime(df['event_time'])
df['month'] = df['event_time'].dt.month

# %%
df.info()
print(df.head())

# %%
# 1. Analyse the user spending trend (by month)
# Call the helper from utils that fixes a plotting problem in plt
u.fix_matplotlib_error()

# %%
df_month = df.groupby('month', as_index=False)
# numeric_only=True restricts the aggregation to numeric columns. Without it,
# pandas >= 2.0 raises TypeError when it tries to sum the datetime
# `event_time` column (older versions silently dropped such columns).
# Assumes `price` is numeric, which the line plot below requires anyway.
df_month_sum = df_month.sum(numeric_only=True)

# %%
plt.figure(figsize=(10, 6))
# Upper panel: total amount spent per month.
plt.subplot(2, 1, 1)
sns.lineplot(x='month', y='price', data=df_month_sum)
plt.title('每月消费总金额')
plt.xticks(range(1, 13))  # one tick per calendar month
# Lower panel: number of user_id entries per month.
plt.subplot(2, 1, 2)
# NOTE(review): count() tallies non-null rows per month, i.e. records rather
# than distinct users; if the chart is meant to show unique purchasers,
# nunique() on user_id may be what was intended -- confirm.
df_month_count = df_month.count()
sns.lineplot(x='month', y='user_id', data=df_month_count)
plt.title('每月消费人数')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyecharts.charts import *
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score  # 导入轮廓系数指标
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import common.utils as utils

# Load the ad-performance dataset and perform basic cleaning.
file_path = r'数据分析项目/database/ad_performance.csv'
pd.set_option('display.max_columns', None)  # show every column when printing
df = pd.read_csv(file_path)
# 'Unnamed: 0' is presumably the index column written out when the CSV was
# saved -- confirm against the file.
df.drop(columns=['Unnamed: 0'], inplace=True)
utils.fix_matplotlib_error()  # work around garbled Chinese text in matplotlib

# Check NA values, descriptive statistics and duplicated rows
print(utils.show_na(df))
print(utils.show_info(df))  # the 平均停留时间 column has 2 missing values
print(utils.show_duplicated(df))

# %%
# Inspect the rows that contain missing values. The result is printed
# explicitly: a bare expression in the middle of a script/cell shows nothing.
print(df[df['平均停留时间'].isna()])

# Drop the records with a missing value and renumber the index from 0.
# dropna() returns a new frame, so later column assignments on `df` do not
# operate on a slice of the original data.
df = df.dropna(subset=['平均停留时间']).reset_index(drop=True)

# Show the correlation between the fields