category_id -产品的类别ID category_code -产品的类别分类法(代码名称) brand -品牌名称 price -产品价格 user_id -用户ID ''' # %% # 1.读取文件,查看基本信息 file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\archive.zip' df = u.unpack_file_to_df(file_path) df = df['kz'] u.show_info(df) print(df.shape) u.show_na(df) u.show_duplicated(df) print(df.describe()) # %% # 修正各字段 # 1.删除重复记录 df.drop_duplicates(inplace=True, ignore_index=True) # %% # 2.修改event_time字段为日期属性,并添加月字段 df['event_time'] = pd.to_datetime(df['event_time']) df['month'] = df['event_time'].dt.month # %% df.info() print(df.head())
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from common import utils as u

# %%
file_path = r'D:\file\DataAnalysis\douyin.zip'
df = pd.read_csv(file_path)
# %%
# print(df.head())
print(df.shape)
print(u.show_info(df))
# %%
print(u.show_na(df))
# %%
print(u.show_duplicated(df))
# %%
print(df.columns)
# %%
# The unnamed leading column is renamed to 'id'.
df = df.rename(columns={'Unnamed: 0': 'id'})
df['real_time'] = pd.to_datetime(df['real_time'], format='%Y-%m-%d %H:%M:%S')
# Parse 'date' strictly as YYYY-MM-DD, then keep only the calendar date objects.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date
# %%
# Daily play count, daily user count, daily author count, daily item count
import pyecharts.options as opts
import seaborn as sns
from matplotlib import pyplot as plt
from pyecharts.charts import *
from pyecharts.faker import Faker

from common import utils

# %%
# Load the data.
file_path = r'数据分析项目/database/bicycle_sharing.csv'
df = pd.read_csv(file_path)
# A bare `df.head()` in the middle of a cell is evaluated and discarded,
# so the preview is printed explicitly.
print(df.head())

# Data cleaning and outlier checks (trailing notes are the original author's findings).
print(utils.show_na(df))  # no NA values in any field
print(utils.show_info(df))  # no outliers, but the datetime field needs its dtype fixed
print(utils.show_duplicated(df))  # no duplicated records
# Author's conclusion: the source data was inspected and needs no cleaning.

# Feature extraction: split the timestamp into its calendar components.
df['datetime'] = pd.to_datetime(df['datetime'])
for _part in ('year', 'month', 'day', 'hour', 'minute'):
    df[_part] = getattr(df['datetime'].dt, _part)

sns.set_style('ticks')
plt.figure(figsize=(12, 6))
# Use kdeplot to inspect how the data is distributed
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\2019B1631.zip'
# %%
# The archive unpacks into a mapping of frames; three of them are used below.
df_dict = u.unpack_file_to_df(file_path)
data1, data2, data3 = (df_dict[key] for key in ('data1', 'data2', 'data3'))
# %%
# Basic inspection of every table.
for _frame in (data1, data2, data3):
    print(u.show_info(_frame))
# %%
# NA and duplicate checks, table by table.
for _frame in (data1, data2, data3):
    print(u.show_na(_frame))
    print(u.show_duplicated(_frame))
# %%
# Rename the columns of data1.
data1.columns = ['序号', '校园卡号', '性别', '专业名称', '门禁卡号']
print(data1.head())
# %%
# Process data2
# Rename columns
import pandas as pd import common.utils as utils # %% file_path = r'数据分析项目/date_file/ab_data.csv' df = pd.read_csv(file_path) # print(df.head()) print(utils.show_na(df)) # 检查NA值 print(utils.show_info(df)) # 检查数据极值 print(utils.show_duplicated(df)) # 检查重复值情况 # %% # 新旧页面转化率 control_isconverted_num = df[(df['landing_page'] == 'old_page') & (df['converted'] == 1) & (df['group'] == 'control')]['user_id'].nunique() control_view_num = df[(df['landing_page'] == 'old_page') & (df['group'] == 'control')]['user_id'].count() control_tr = round((control_isconverted_num * 100 / control_view_num), 4) control = { '转换数': control_isconverted_num, '浏览数': control_view_num, '转化率': str(control_tr) + '%' } treatment_isconverted_num = df[(df['landing_page'] == 'new_page') & (df['converted'] == 1) & (df['group']
import seaborn as sns
from pyecharts.charts import *
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score  # silhouette coefficient metric
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import common.utils as utils

file_path = r'数据分析项目/database/ad_performance.csv'
pd.set_option('display.max_columns', None)  # show every column when printing
df = pd.read_csv(file_path)
df.drop(columns=['Unnamed: 0'], inplace=True)
utils.fix_matplotlib_error()  # per the original note: fixes garbled Chinese text in matplotlib

# Check NA values, descriptive statistics and duplicates.
print(utils.show_na(df))
print(utils.show_info(df))  # author's finding: 平均停留时间 has 2 missing values
print(utils.show_duplicated(df))
# %%
# Show the records with a missing value. A bare expression in the middle of
# a cell is discarded, so it is printed explicitly.
print(df[df['平均停留时间'].isna()])
# Drop those records and renumber the index.
df = df.dropna(subset=['平均停留时间']).reset_index(drop=True)
# Correlation between fields. Only numeric/bool columns are passed to corr():
# since pandas 2.0 it no longer drops non-numeric columns silently and raises
# instead. For an all-numeric frame the result is unchanged.
corr = df.select_dtypes(include=['number', 'bool']).corr().round(2)
sns.heatmap(corr, cmap='Blues', annot=True)
plt.show()