Ejemplo n.º 1
0
import pandas as pd
import pathhelper as path

# 读取csv txt

students1 = pd.read_csv(path.join_curr('Students.csv'), index_col='ID')
print(students1)

# 无论是tsv,csv,txt 都用 read_csv 读取,但是要指定 分隔字符
students2 = pd.read_csv(path.join_curr('Students.txt'),
                        # 分隔字符
                        sep='\t',
                        index_col='ID'
                        )
print(students2)
Ejemplo n.º 2
0
import pandas as pd
import pathhelper as path

# 多表查询

students = pd.read_excel(path.join_curr('Students4.xlsx'),
                         sheet_name='Students')
scores = pd.read_excel(path.join_curr('Students4.xlsx'), sheet_name='Scores')

# 拿 ID 列 连接成一张表

# 使用 merge 函数
table = students.merge(
    scores,
    how='left',  # 保留左表(左连接 ; 默认是 innerJoin

    # 联合的列的指定

    # on='ID',  # 以ID列 join
    # 当作有两张表要联合的列名字不一样时,可以这样
    left_on='ID',
    right_on='ID'
    # 也可以指定这一列的数据
    # left_on=students.index, right_on=scores.index
    # 当你什么都不指定时,merge函数会自己找左右两张表名字一样的 Series
).fillna(0)

# 也可以使用 join 函数,用法和 merge 相似
# join 函数默认 联合的列 就是 index 列
students = students.set_index('ID')
scores = scores.set_index('ID')
Ejemplo n.º 3
0
# 数据校验


def score_validation(row):
    """
    验证一行数据的乘成绩是否合格
    :param row:
    """

    # 使用assert语句
    # try:
    #     assert 0 <= row.Score <= 100
    # except:
    #     print(f'#{row.ID} student {row.Student} has as invalid score {row.Score}.')

    # 使用if语句
    if not 0 <= row.Score <= 100:
        print(
            f'#{row.ID} student {row.Student} has as invalid score {row.Score}.'
        )


students = pd.read_excel(path.join_curr('Students5.xlsx'))
print(students)

students.apply(
    score_validation,
    # DataFrame 有两个方向, 从上到下 轴是1 从左到右 轴是2
    axis=1)
Ejemplo n.º 4
0
import pandas as pd
import matplotlib.pyplot as plt
import pathhelper as path

# 分组柱图
students = pd.read_excel(path.join_curr('Students2.xlsx'), index_col='ID')
students.sort_values(by='Score2018', inplace=True, ascending=False)
print(students)

students.plot.bar(
    x='Student',
    # 分组数据放在集合中
    y=['Score2017', 'Score2018'],
    # 2017orange 2018red
    color=['orange', 'red'])

plt.title('StudentByScore', fontsize=16, fontweight='bold')
plt.xlabel('Student', fontweight='bold')
plt.ylabel('Score', fontweight='bold')

# 修改轴信息
ax = plt.gca()
ax.set_xticklabels(
    students['Student'],
    rotation=45,
    # 水平对齐
    ha='right')

# 修改图形信息
f = plt.gcf()
f.subplots_adjust(left=0.2, bottom=0.42)
Ejemplo n.º 5
0
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pathhelper as path

# 线性回归

sales = pd.read_excel(path.join_curr('Sales.xlsx'), dtype={'Month': str})
print(sales)

# 回归方程
# slope 斜率  intercept y轴截距
slope, intercept, r, p, std_err = linregress(sales.index, sales['Revenue'])

exp = sales.index * slope + intercept

# 柱状图
plt.scatter(sales.index, sales['Revenue'])
plt.plot(sales.index, exp, color='orange')
plt.title(f"y={slope}*x+{intercept}")
plt.xticks(sales.index, sales['Month'], rotation=90)
plt.tight_layout()
plt.show()
Ejemplo n.º 6
0
    """
    # 要加上的月份是否超过1年
    year_delta = month_delta // 12

    # 月份要加上不足一年的部分,注意这里可能大于12,需要再判断一次
    month = g_date.month + month_delta % 12
    if month != 12:
        year_delta += month // 12
        month = month % 12

    return date(g_date.year + year_delta, month, g_date.day)


# 打开Excel文件夹
books = pd.read_excel(
    path.join_curr('Books.xlsx'),
    # 跳过空行
    skiprows=9,
    # 取哪几列,不分大小写
    usecols="j:p",
    # index
    index_col=None,
    # 设置列数据的默认类型,大小写敏感
    dtype={
        'ID': str,
        'InStore': str,
        'Date': str
    })

# 循环填充单元格
Ejemplo n.º 7
0
import pandas as pd
import pathhelper as path

# 排序 多重排序

products = pd.read_excel(path.join_curr('List.xlsx'), index_col='ID')

# 按价格排序
products.sort_values(
    # 需要排序的列名
    by='Price',
    # 为True时,仅在当前 DataFrame 中排序,不会生成新的
    inplace=True,
    # 默认为True,表示小到大 (boolean类型,给0 1 也行)
    ascending=False)

# 如果需要同时以两个维度排序,不能连续用两次 sort_values
# 只能在一次 sort_values 按两个维度排
products.sort_values(
    # 先按值不值排序,再按价格排序
    by=['Worthy', 'Price'],
    inplace=True,
    # ascending 参数与上面 by 对应
    ascending=[True, False])

print(products)
Ejemplo n.º 8
0
import pandas as pd
import matplotlib.pyplot as plt
import pathhelper as path

# 散点图 直方图

# 显示最大列数
pd.options.display.max_columns = 10

homes = pd.read_excel(path.join_curr('Home_datas.xlsx'))
print(homes.head())

# 散点图
# homes.plot.scatter(x='price', y='sqft_living')
# plt.show()

# 直方图
# homes['sqft_living'].plot.hist(
# #     bins=100  # 桶的个数
# # )
# # # 设置横轴范围(从0到最大值,步长500)
# # plt.xticks(range(0, max(homes['sqft_living']), 500), fontsize=8, rotation=90)
# # plt.show()

# 密度图
homes['sqft_living'].plot.kde()
plt.xticks(range(0, max(homes['sqft_living']), 500), fontsize=8, rotation=90)
plt.show()


# 其他 pandas 的数据分析功能
Ejemplo n.º 9
0
import pandas as pd
import numpy as np
import pathhelper as path

# 数据透视表

pd.options.display.max_columns = 999
orders = pd.read_excel(path.join_curr('Orders.xlsx'))
# 新增一列
orders['Year'] = pd.DatetimeIndex(orders['Date']).year
print(orders)
print(orders['Date'].dtype)

# 透视表

# pivot_table
print('-' * 50)
pt1 = orders.pivot_table(index='Category',
                         columns='Year',
                         values='Total',
                         aggfunc=np.sum)
print(pt1)

# group by
print('-' * 50)
groups = orders.groupby(['Category', 'Year'])
s = groups['Total'].sum()
c = groups['ID'].count()

pt2 = pd.DataFrame({'Sum': s, 'Count': c})
print(pt2)
Ejemplo n.º 10
0
import pandas as pd
import matplotlib.pyplot as plt
import pathhelper as path

# 叠加柱状图 横向柱状图
users = pd.read_excel(path.join_curr('Users.xlsx'), index_col='ID')
# 排序,用增加一列(三个月之和)的数据
users['Total'] = users['Oct'] + users['Nov'] + users['Dec']
users.sort_values(by='Total', inplace=True, ascending=True)
print(users)

users.plot.barh(x='Name', y=['Oct', 'Nov', 'Dec'],
                stacked=True,  # 叠加柱状图
                title='User Behavior'
                )

plt.tight_layout()
plt.show()
Ejemplo n.º 11
0
import pandas as pd
import pathhelper as path

# 行列转换

pd.options.display.max_columns = 12
videos = pd.read_excel(path.join_curr('Videos.xlsx'), index_col='Mouth')
table = videos.transpose()
print(table)
Ejemplo n.º 12
0
import pandas as pd
import matplotlib.pyplot as plt
import pathhelper as path

# 饼图
students = pd.read_excel(path.join_curr('Students3.xlsx'), index_col='From')
print(students)
print(students.columns)

students['2017'].plot.pie(fontsize=8,
                          counterclock=False,  # 顺时针排
                          startangle=-270  # 开始角度
                          )
plt.title('Score of Students', fontsize=16, fontweight='bold')
plt.ylabel('2017', fontsize=16, fontweight='bold')
plt.show()
Ejemplo n.º 13
0
import pandas as pd
import matplotlib.pyplot as plt
import pathhelper as path

# 折线图 叠加区域图
products = pd.read_excel(path.join_curr('Products.xlsx'), index_col='Week')
print(products)

products.plot.area(y=['Part1', 'Part2', 'Part3', 'Part4'])
plt.title('Sales Weekly Trend', fontsize=16, fontweight='bold')
plt.ylabel('Total', fontsize=12, fontweight='bold')
plt.xticks(products.index, fontsize=8)
plt.show()
Ejemplo n.º 14
0
import pandas as pd
import pathhelper as path

# 将一行数据分为两行

employees = pd.read_excel(path.join_curr('Employees.xlsx'))
# 分两列
df = employees['Full Name'].str.split(
    # 用什么字符串切割字符串,默认为空
    ' ',
    # 最多保留多少个切出来的子字符串(如果是-1 或 0 ,意思是全部保留)(本例就是只能切出两个)
    n=2,
    # 拆分的时候直接就分为两列,默认为False
    expand=True)
print(df)
# 讲df加到 最初的表格中
employees['First Name'] = df[0]
employees['Last Name'] = df[1].str.upper()

print(employees)