import pandas as pd import pathhelper as path # 读取csv txt students1 = pd.read_csv(path.join_curr('Students.csv'), index_col='ID') print(students1) # 无论是tsv,csv,txt 都用 read_csv 读取,但是要指定 分隔字符 students2 = pd.read_csv(path.join_curr('Students.txt'), # 分隔字符 sep='\t', index_col='ID' ) print(students2)
import pandas as pd import pathhelper as path # 多表查询 students = pd.read_excel(path.join_curr('Students4.xlsx'), sheet_name='Students') scores = pd.read_excel(path.join_curr('Students4.xlsx'), sheet_name='Scores') # 拿 ID 列 连接成一张表 # 使用 merge 函数 table = students.merge( scores, how='left', # 保留左表(左连接 ; 默认是 innerJoin # 联合的列的指定 # on='ID', # 以ID列 join # 当作有两张表要联合的列名字不一样时,可以这样 left_on='ID', right_on='ID' # 也可以指定这一列的数据 # left_on=students.index, right_on=scores.index # 当你什么都不指定时,merge函数会自己找左右两张表名字一样的 Series ).fillna(0) # 也可以使用 join 函数,用法和 merge 相似 # join 函数默认 联合的列 就是 index 列 students = students.set_index('ID') scores = scores.set_index('ID')
# 数据校验 def score_validation(row): """ 验证一行数据的乘成绩是否合格 :param row: """ # 使用assert语句 # try: # assert 0 <= row.Score <= 100 # except: # print(f'#{row.ID} student {row.Student} has as invalid score {row.Score}.') # 使用if语句 if not 0 <= row.Score <= 100: print( f'#{row.ID} student {row.Student} has as invalid score {row.Score}.' ) students = pd.read_excel(path.join_curr('Students5.xlsx')) print(students) students.apply( score_validation, # DataFrame 有两个方向, 从上到下 轴是1 从左到右 轴是2 axis=1)
import pandas as pd import matplotlib.pyplot as plt import pathhelper as path # 分组柱图 students = pd.read_excel(path.join_curr('Students2.xlsx'), index_col='ID') students.sort_values(by='Score2018', inplace=True, ascending=False) print(students) students.plot.bar( x='Student', # 分组数据放在集合中 y=['Score2017', 'Score2018'], # 2017orange 2018red color=['orange', 'red']) plt.title('StudentByScore', fontsize=16, fontweight='bold') plt.xlabel('Student', fontweight='bold') plt.ylabel('Score', fontweight='bold') # 修改轴信息 ax = plt.gca() ax.set_xticklabels( students['Student'], rotation=45, # 水平对齐 ha='right') # 修改图形信息 f = plt.gcf() f.subplots_adjust(left=0.2, bottom=0.42)
import pandas as pd import matplotlib.pyplot as plt from scipy.stats import linregress import pathhelper as path # 线性回归 sales = pd.read_excel(path.join_curr('Sales.xlsx'), dtype={'Month': str}) print(sales) # 回归方程 # slope 斜率 intercept y轴截距 slope, intercept, r, p, std_err = linregress(sales.index, sales['Revenue']) exp = sales.index * slope + intercept # 柱状图 plt.scatter(sales.index, sales['Revenue']) plt.plot(sales.index, exp, color='orange') plt.title(f"y={slope}*x+{intercept}") plt.xticks(sales.index, sales['Month'], rotation=90) plt.tight_layout() plt.show()
""" # 要加上的月份是否超过1年 year_delta = month_delta // 12 # 月份要加上不足一年的部分,注意这里可能大于12,需要再判断一次 month = g_date.month + month_delta % 12 if month != 12: year_delta += month // 12 month = month % 12 return date(g_date.year + year_delta, month, g_date.day) # 打开Excel文件夹 books = pd.read_excel( path.join_curr('Books.xlsx'), # 跳过空行 skiprows=9, # 取哪几列,不分大小写 usecols="j:p", # index index_col=None, # 设置列数据的默认类型,大小写敏感 dtype={ 'ID': str, 'InStore': str, 'Date': str }) # 循环填充单元格
import pandas as pd import pathhelper as path # 排序 多重排序 products = pd.read_excel(path.join_curr('List.xlsx'), index_col='ID') # 按价格排序 products.sort_values( # 需要排序的列名 by='Price', # 为True时,仅在当前 DataFrame 中排序,不会生成新的 inplace=True, # 默认为True,表示小到大 (boolean类型,给0 1 也行) ascending=False) # 如果需要同时以两个维度排序,不能连续用两次 sort_values # 只能在一次 sort_values 按两个维度排 products.sort_values( # 先按值不值排序,再按价格排序 by=['Worthy', 'Price'], inplace=True, # ascending 参数与上面 by 对应 ascending=[True, False]) print(products)
import pandas as pd import matplotlib.pyplot as plt import pathhelper as path # 散点图 直方图 # 显示最大列数 pd.options.display.max_columns = 10 homes = pd.read_excel(path.join_curr('Home_datas.xlsx')) print(homes.head()) # 散点图 # homes.plot.scatter(x='price', y='sqft_living') # plt.show() # 直方图 # homes['sqft_living'].plot.hist( # # bins=100 # 桶的个数 # # ) # # # 设置横轴范围(从0到最大值,步长500) # # plt.xticks(range(0, max(homes['sqft_living']), 500), fontsize=8, rotation=90) # # plt.show() # 密度图 homes['sqft_living'].plot.kde() plt.xticks(range(0, max(homes['sqft_living']), 500), fontsize=8, rotation=90) plt.show() # 其他 pandas 的数据分析功能
import pandas as pd import numpy as np import pathhelper as path # 数据透视表 pd.options.display.max_columns = 999 orders = pd.read_excel(path.join_curr('Orders.xlsx')) # 新增一列 orders['Year'] = pd.DatetimeIndex(orders['Date']).year print(orders) print(orders['Date'].dtype) # 透视表 # pivot_table print('-' * 50) pt1 = orders.pivot_table(index='Category', columns='Year', values='Total', aggfunc=np.sum) print(pt1) # group by print('-' * 50) groups = orders.groupby(['Category', 'Year']) s = groups['Total'].sum() c = groups['ID'].count() pt2 = pd.DataFrame({'Sum': s, 'Count': c}) print(pt2)
import pandas as pd import matplotlib.pyplot as plt import pathhelper as path # 叠加柱状图 横向柱状图 users = pd.read_excel(path.join_curr('Users.xlsx'), index_col='ID') # 排序,用增加一列(三个月之和)的数据 users['Total'] = users['Oct'] + users['Nov'] + users['Dec'] users.sort_values(by='Total', inplace=True, ascending=True) print(users) users.plot.barh(x='Name', y=['Oct', 'Nov', 'Dec'], stacked=True, # 叠加柱状图 title='User Behavior' ) plt.tight_layout() plt.show()
import pandas as pd import pathhelper as path # 行列转换 pd.options.display.max_columns = 12 videos = pd.read_excel(path.join_curr('Videos.xlsx'), index_col='Mouth') table = videos.transpose() print(table)
import pandas as pd import matplotlib.pyplot as plt import pathhelper as path # 饼图 students = pd.read_excel(path.join_curr('Students3.xlsx'), index_col='From') print(students) print(students.columns) students['2017'].plot.pie(fontsize=8, counterclock=False, # 顺时针排 startangle=-270 # 开始角度 ) plt.title('Score of Students', fontsize=16, fontweight='bold') plt.ylabel('2017', fontsize=16, fontweight='bold') plt.show()
import pandas as pd import matplotlib.pyplot as plt import pathhelper as path # 折线图 叠加区域图 products = pd.read_excel(path.join_curr('Products.xlsx'), index_col='Week') print(products) products.plot.area(y=['Part1', 'Part2', 'Part3', 'Part4']) plt.title('Sales Weekly Trend', fontsize=16, fontweight='bold') plt.ylabel('Total', fontsize=12, fontweight='bold') plt.xticks(products.index, fontsize=8) plt.show()
import pandas as pd import pathhelper as path # 将一行数据分为两行 employees = pd.read_excel(path.join_curr('Employees.xlsx')) # 分两列 df = employees['Full Name'].str.split( # 用什么字符串切割字符串,默认为空 ' ', # 最多保留多少个切出来的子字符串(如果是-1 或 0 ,意思是全部保留)(本例就是只能切出两个) n=2, # 拆分的时候直接就分为两列,默认为False expand=True) print(df) # 讲df加到 最初的表格中 employees['First Name'] = df[0] employees['Last Name'] = df[1].str.upper() print(employees)