Ejemplo n.º 1
0
category_id -产品的类别ID
category_code -产品的类别分类法(代码名称)
brand -品牌名称
price -产品价格
user_id -用户ID
'''

# %%
# 1. Load the archive and review basic information about the data
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\archive.zip'

# The helper's result is indexed by name and only the 'kz' entry is kept
# (presumably a mapping of names to DataFrames -- confirm in common.utils).
df = u.unpack_file_to_df(file_path)['kz']
u.show_info(df)
print(df.shape)
u.show_na(df)
u.show_duplicated(df)
print(df.describe())

# %%
# Clean up the fields
# 1. Remove duplicate records and renumber the index from zero
df = df.drop_duplicates(ignore_index=True)
# %%
# 2. Parse event_time as datetime, then derive a month column from it.
#    assign() evaluates the keyword arguments in order, so `month` reads
#    the already-converted event_time column.
df = df.assign(
    event_time=lambda frame: pd.to_datetime(frame['event_time']),
    month=lambda frame: frame['event_time'].dt.month,
)

# %%
df.info()
print(df.head())
Ejemplo n.º 2
0
import pandas as pd
from matplotlib import pyplot as plt
from common import utils as u
import numpy as np

# %%
file_path = r'D:\file\DataAnalysis\douyin.zip'
df = pd.read_csv(file_path)
# %%
# Size of the data set, followed by the project's summary helper
print(df.shape)
print(u.show_info(df))

# %%
# Missing-value report
print(u.show_na(df))

# %%
# Duplicate-row report
print(u.show_duplicated(df))

# %%
print(df.columns)
# %%
# Give the unnamed leading column a proper name.
df = df.rename(columns={'Unnamed: 0': 'id'})
# Parse the timestamp column using its explicit format.
df['real_time'] = pd.to_datetime(df['real_time'], format='%Y-%m-%d %H:%M:%S')
# Parse 'date' and keep only the calendar-date part of each value.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date


# %%
# 日播放量,日用户量,日作者量,日作品量
Ejemplo n.º 3
0
import pyecharts.options as opts
import seaborn as sns
from matplotlib import pyplot as plt
from pyecharts.charts import *
from pyecharts.faker import Faker

from common import utils

# %%
# Load the data
file_path = r'数据分析项目/database/bicycle_sharing.csv'
df = pd.read_csv(file_path)
# Print the preview explicitly: a bare `df.head()` that is not the last
# expression of a cell (or that runs as a script) displays nothing.
print(df.head())

# Data cleaning / outlier checks (trailing notes are the original author's findings)
print(utils.show_na(df))  # no NA values in any column
print(utils.show_info(df))  # no outliers, but the datetime column needs a dtype change
print(utils.show_duplicated(df))  # no duplicate records
# Conclusion of the checks above: the source data needs no cleaning.

# Feature extraction: convert the datetime column, then split it into
# separate calendar/time component columns.
df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute

# Plot styling and figure size for the charts that follow
sns.set_style('ticks')
plt.figure(figsize=(12, 6))
# 利用kdeplot查看数据的分布特征
Ejemplo n.º 4
0
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\2019B1631.zip'
# %%
# Unpack the archive and pull out the three tables by name
df_dict = u.unpack_file_to_df(file_path)
data1, data2, data3 = (df_dict[key] for key in ('data1', 'data2', 'data3'))

# %%
# Basic inspection of each table
for table in (data1, data2, data3):
    print(u.show_info(table))

# %%
# For each table in turn: NA report first, then duplicate report
for table in (data1, data2, data3):
    print(u.show_na(table))
    print(u.show_duplicated(table))

# %%
# Rename the columns of data1
data1.columns = ['序号', '校园卡号', '性别', '专业名称', '门禁卡号']
print(data1.head())

# %%
# 处理data2的数据

# 列重命名
Ejemplo n.º 5
0
import pandas as pd
import common.utils as utils

# %%

file_path = r'数据分析项目/date_file/ab_data.csv'

df = pd.read_csv(file_path)
# Run the project's checks in order: NA values, summary info (value
# extremes), duplicate rows.
for check in (utils.show_na, utils.show_info, utils.show_duplicated):
    print(check(df))

# %%
# Conversion rate of the old page (control group)
# Rows where the control group was shown the old page.
control_rows = df[(df['group'] == 'control')
                  & (df['landing_page'] == 'old_page')]
# NOTE(review): conversions are counted as distinct user_ids (nunique) while
# views are counted as non-null rows (count); confirm the mismatch is intended.
control_isconverted_num = control_rows.loc[
    control_rows['converted'] == 1, 'user_id'].nunique()
control_view_num = control_rows['user_id'].count()
# Percentage, rounded to four decimal places.
control_tr = round(control_isconverted_num * 100 / control_view_num, 4)

control = {
    '转换数': control_isconverted_num,
    '浏览数': control_view_num,
    '转化率': str(control_tr) + '%'
}

treatment_isconverted_num = df[(df['landing_page'] == 'new_page')
                               & (df['converted'] == 1) &
                               (df['group']
Ejemplo n.º 6
0
import seaborn as sns
from pyecharts.charts import *
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score  # 导入轮廓系数指标
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import common.utils as utils

file_path = r'数据分析项目/database/ad_performance.csv'
pd.set_option('display.max_columns', None)  # show every column when printing
df = pd.read_csv(file_path)
df.drop(columns=['Unnamed: 0'], inplace=True)
utils.fix_matplotlib_error()  # per the original note: fixes garbled Chinese text in matplotlib

# Check NA values, descriptive statistics and duplicates
print(utils.show_na(df))
print(utils.show_info(df))  # original author's finding: '平均停留时间' has 2 missing values
print(utils.show_duplicated(df))

# %%
# Show the rows with a missing value. The result must be printed: a bare
# expression that is not the last statement of a cell displays nothing.
print(df[df['平均停留时间'].isna()])

# Drop the records that contain the missing value and renumber the index
df = df[~df['平均停留时间'].isna()].reset_index(drop=True)

# Correlation between fields.
# Restrict to numeric/bool columns first: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns by default and raises on them instead.
corr = df.select_dtypes(include=['number', 'bool']).corr().round(2)
sns.heatmap(corr, cmap='Blues', annot=True)
plt.show()