Esempio n. 1
0
import pandas as pd
import matplotlib.pyplot as plt

from src.utils.utils import utils

# `header` picks the row that supplies the column names; `index_col` picks the
# column that becomes the index.
# https://blog.csdn.net/weixin_38546295/article/details/83537558
pages_xlsx = utils.getDataDir("/pandas/头马助手使用分析_页面分析.xlsx")
df_pages = pd.read_excel(pages_xlsx, header=2, index_col=0)
print(df_pages.dtypes)
print(df_pages)

# Turn the percentage strings into floats so they can be sorted numerically.
quit_rate_text = df_pages["退出率"].str.strip('%')
df_pages_quit = quit_rate_text.astype(float) / 100
print(df_pages_quit.sort_values(ascending=False))
Esempio n. 2
0
    pb = NSPasteboard.generalPasteboard()
    pb.clearContents()
    # TODO 如果传入图片
    a = NSArray.arrayWithObject_(text)
    return pb.writeObjects_(a)


# Rebuild an image from raw bytes.
def image_from_bytes(raw, mode=None, size=None):
    """Rebuild a PIL image from raw pixel bytes.

    Args:
        raw: Pixel data, e.g. as produced by ``Image.tobytes()``.
        mode: Image mode to pass to ``Image.frombytes``. When omitted, the
            mode of the module-level ``img`` is used (the original behavior).
        size: Image size to pass to ``Image.frombytes``. When omitted, the
            size of the module-level ``img`` is used (the original behavior).

    Returns:
        The image created by ``Image.frombytes(mode, size, raw)``.

    Raises:
        NameError: If ``mode`` or ``size`` is omitted and no module-level
            ``img`` has been defined.
    """
    # Only fall back to the global `img` when the caller did not say what the
    # bytes represent; passing both makes the function usable on import.
    if mode is None:
        mode = img.mode
    if size is None:
        size = img.size
    return Image.frombytes(mode, size, raw)


# Manual demo, run only when this file is executed as a script: put a string
# on the clipboard, read the clipboard back, then round-trip an image file
# through raw bytes.
if __name__ == '__main__':
    text = "激战"
    setClip(text)
    # bytes objects print with a leading b.
    # encode() will result in a sequence of bytes. decode() decodes a stream of bytes to a string object
    # bytes([source[, encoding[, errors]]]): source to initialize the array of bytes. if source is a string, the encoding of the string.
    # NOTE(review): this assumes the clipboard payload is UTF-8 text - confirm
    # what get_paste_img_file() returns.
    print(bytes(get_paste_img_file()).decode("utf-8"))

    data = get_paste_img_file()
    writeToFile(data)

    # Restore an image from raw bytes.
    # TODO: work out how to convert the data read from the clipboard into suitable bytes
    # `img` must stay a module-level name: image_from_bytes() reads its mode and size.
    img = Image.open(utils.getDataDir("/images/gushi.jpeg"))
    raw = img.tobytes()

    img_out = image_from_bytes(raw)
    img_out.show()
Esempio n. 3
0
import pandas as pd

from src.utils.utils import utils

# Tutorial followed here:
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html#min-tut-03-subset

titanic = pd.read_csv(utils.getDataDir("/pandas/titanic.csv"))

# Square brackets with a single column name select that one column.
age_column = titanic["Age"]
print(age_column)
print(age_column.max())
print(age_column.min())

# `shape` is an attribute of a pandas Series, so it takes no parentheses.
print(age_column.shape)

# A list of column names inside the brackets selects several columns at once.
age_sex_frame = titanic[["Age", "Sex"]]
print(age_sex_frame)
print(age_sex_frame.shape)

# A conditional expression inside the brackets selects the matching rows.
over_35_mask = titanic["Age"] > 35
print(titanic[over_35_mask])

# Another conditional expression: membership in a list of values.
pclass_isin_mask = titanic["Pclass"].isin([2, 3])
print(titanic[pclass_isin_mask])
# When combining conditions, wrap each one in parentheses and use the
# operators | and & instead of the keywords or/and.
pclass_or_mask = (titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)
print(titanic[pclass_or_mask])

# To select rows if Age is not NA
Esempio n. 4
0
import pandas as pd

from src.utils.utils import utils
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 200)  # number of columns to display
pd.set_option('display.max_rows', 10)  # number of rows to display

query_csv = utils.getDataDir("/pandas/content_search_query2020-04-08.csv")
df = pd.read_csv(query_csv)
print(df)

# Count rows per predicted industry.
# 'DataFrame' object has no attribute 'value_counts', only 'Series' has.
industry_column = df["预测行业"]
print(industry_column.value_counts())

# Keep only the rows whose buyer count is > 0
# print(df[df["买家数"] > 0])

# Convert the click-through-rate percentage strings to floats so the column
# can be compared against 0.3 below.
ctr_text = df["曝光点击率"].str.strip('%')
df["曝光点击率"] = ctr_text.astype(float) / 100
# print(df["曝光点击率"])
# print(df[df["曝光点击率"] > 0.3])

# Average uv of the search terms
# print(df["搜索uv"].mean())

# Queries whose click-through rate exceeds 0.3: print them, then plot them.
high_ctr_rows = df[df["曝光点击率"] > 0.3]
print(high_ctr_rows)
high_ctr_rows.plot()
plt.show()

column_renames = {"搜索uv": "search uv", "搜索pv": "search pv"}
df = df.rename(columns=column_renames)
Esempio n. 5
0
import pandas as pd
import matplotlib.pyplot as plt

from src.utils.utils import utils

# Tutorial followed here:
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/04_plotting.html

# read_csv options used below:
# - index_col: column(s) to use as the row labels, given as a name or a
#   column index; index_col=0 makes the first (0th) column the index, and
#   index_col=False forces pandas not to use the first column as the index.
# - parse_dates: convert the dates in the column to Timestamp objects.

# air_quality = pd.read_csv(utils.getDataDir("/pandas/air_quality_no2.csv"), parse_dates=True)
no2_csv = utils.getDataDir("/pandas/air_quality_no2.csv")
air_quality = pd.read_csv(no2_csv, index_col=0, parse_dates=True)
print(air_quality)
print(air_quality.dtypes)

# Quick visual check of the whole table.
air_quality.plot()
plt.show()

# Plot only the column holding the Paris data.
paris_column = air_quality["station_paris"]
paris_column.plot()
plt.show()

# Visually compare the NO2 values measured in London versus Paris with a
# scatter plot.
scatter_kwargs = {"x": "station_london", "y": "station_paris", "alpha": 0.5}
air_quality.plot.scatter(**scatter_kwargs)
Esempio n. 6
0
import pandas as pd
import matplotlib.pyplot as plt

from src.utils.utils import utils

# Load the income export and print one summary figure per revenue source.
df_income = pd.read_csv(
    utils.getDataDir("/pandas/头马助手收入明细2020-03-25至2020-04-24.csv"))
print(df_income)

# Column keys may contain spaces; print df_income.keys() to check them.
print("近31天总收入")
print(format(df_income["总收入(元)"].sum(), ".2f"))

print("近31天平均每天收入")
# Bound to a descriptive name rather than `all`, which would shadow the builtin.
daily_average = format(df_income["总收入(元)"].mean(), ".2f")
print(daily_average)

print("近31天原生视频总收入")
# format() returns a str
原生视频 = format(df_income["原生视频"].sum(), ".2f")
print(原生视频)

print("近31天格子广告总收入")
格子广告 = format(df_income["格子广告"].sum(), ".2f")
# Print the computed total; a bare print() here only emitted an empty line.
print(格子广告)

print("近31天banner总收入")
banner = format(df_income["banner"].sum(), ".2f")
print(banner)

# series = pd.Series([原生视频,格子广告,banner], index=["原生视频", "格子广告", "banner"], name="总收入")
Esempio n. 7
0
import pytesseract

# https://tesseract-ocr.github.io/tessdoc/Data-Files
# Error opening data file /usr/local/share/tessdata/chi_sim.traineddata
from src.utils.utils import utils

try:
    import Image
except ImportError:
    from PIL import Image

# Path of the sample image used by the OCR experiments below.
image_path = utils.getDataDir("/images/gushi.png")

# CASE 1
# image = Image.open(image_path)

# Apply a threshold filter to the image, then save it
# image = image.point(lambda x: 0 if x<143 else 255)
# image.save('./new.png')

# lang selects Simplified Chinese
# text = pytesseract.image_to_string(image, lang='chi_sim')
# print(text)

# CASE 2
# Process images in batch through a text file
# text = pytesseract.image_to_string('../../../../data/images/images.txt', lang='chi_sim')
# print(text)

# CASE 3
# Process the image directly from its path, skipping Image.open
Esempio n. 8
0
import pandas as pd

from src.utils.utils import utils

# Read data from CSV file
# pd_csv.csv
# 1,2,3
# 2,3,4
# 7,8,9
df = pd.read_csv(utils.getDataDir("/pandas/pd_csv.csv"))
print(df)
# The first line of the file is consumed as the column header:
#    1  2  3
# 0  2  3  4
# 1  7  8  9

# Read a large amount of data from CSV file
df = pd.read_csv(utils.getDataDir("/pandas/pandas.csv"))
print(df)

print(df.head(3))

# A check on how pandas interpreted each of the column data types
# When asking for the dtypes, no brackets are used! dtypes is an attribute of a DataFrame and Series.
print(df.dtypes)

# Read data from Excel
# pip3.7 install xlrd if ImportError: Missing optional dependency 'xlrd'.
df = pd.read_excel(utils.getDataDir("/pandas/simple.xlsx"),
                   sheet_name="Sheet1")
print(df)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()