import pandas as pd
import matplotlib.pyplot as plt
from src.utils.utils import utils

# Page-analysis report: load the workbook, show it, then rank pages by
# exit rate.
#
# `header` picks the row that supplies the column names; `index_col` picks
# the column that becomes the index.
# https://blog.csdn.net/weixin_38546295/article/details/83537558
pages_workbook = utils.getDataDir("/pandas/头马助手使用分析_页面分析.xlsx")
df_pages = pd.read_excel(pages_workbook, header=2, index_col=0)
print(df_pages.dtypes)
print(df_pages)

# The exit-rate column holds percentage strings; drop the '%' sign and turn
# the values into float fractions so that they sort numerically.
exit_rate_percent = df_pages["退出率"].str.strip("%").astype(float)
df_pages_quit = exit_rate_percent / 100
print(df_pages_quit.sort_values(ascending=False))
pb = NSPasteboard.generalPasteboard() pb.clearContents() # TODO 如果传入图片 a = NSArray.arrayWithObject_(text) return pb.writeObjects_(a) # 将bytes数据还原为图片 def image_from_bytes(raw): from_img = Image.frombytes(img.mode, img.size, raw) return from_img if __name__ == '__main__': text = "激战" setClip(text) # bytes字节符,打印以b开头。 # encode() will result in a sequence of bytes. decode() decodes a stream of bytes to a string object # bytes([source[, encoding[, errors]]]): source to initialize the array of bytes. if source is a string, the encoding of the string. print(bytes(get_paste_img_file()).decode("utf-8")) data = get_paste_img_file() writeToFile(data) # 将bytes数据还原为图片 # TODO 如何将剪切板获取的数据转成合适的bytes img = Image.open(utils.getDataDir("/images/gushi.jpeg")) raw = img.tobytes() img_out = image_from_bytes(raw) img_out.show()
import pandas as pd
from src.utils.utils import utils

# Walk-through of column and row selection on the Titanic data set.
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html#min-tut-03-subset
titanic = pd.read_csv(utils.getDataDir("/pandas/titanic.csv"))

# Square brackets with one column name select a single column.
ages = titanic["Age"]
print(ages)
print(ages.max())
print(ages.min())

# `shape` is an attribute of the selected column, not a method, so it is
# read without round brackets.
print(ages.shape)

# A list of column names inside the selection brackets picks several columns
# at once.
age_and_sex = titanic[["Age", "Sex"]]
print(age_and_sex)
print(age_and_sex.shape)

# A conditional expression inside the selection brackets filters rows.
older_than_35 = ages > 35
print(titanic[older_than_35])

# Another conditional expression: membership in a list of values.
passenger_class = titanic["Pclass"]
print(titanic[passenger_class.isin([2, 3])])

# When combining conditions, wrap each one in parentheses and use the
# operators | (or) and & (and); the keywords `or` / `and` cannot be used.
second_or_third = (passenger_class == 2) | (passenger_class == 3)
print(titanic[second_or_third])

# To select rows if Age is not NA
import pandas as pd
from src.utils.utils import utils
import matplotlib.pyplot as plt

# Console display limits for printed DataFrames.
pd.set_option("display.max_columns", 200)  # number of columns shown
pd.set_option("display.max_rows", 10)  # number of rows shown

query_csv = utils.getDataDir("/pandas/content_search_query2020-04-08.csv")
df = pd.read_csv(query_csv)
print(df)

# Count the rows per predicted industry.
# value_counts is called on a single column here; the original author noted
# that, on their pandas version, only Series (not DataFrame) offered it.
predicted_industry = df["预测行业"]
print(predicted_industry.value_counts())

# Keep only the rows whose buyer count is > 0:
# print(df[df["买家数"] > 0])

# Find the query terms whose exposure click-through rate is > 0.3. The column
# holds percentage strings, so strip the '%' and convert to float fractions
# first.
ctr_percent = df["曝光点击率"].str.strip("%").astype(float)
df["曝光点击率"] = ctr_percent / 100
# print(df["曝光点击率"])
# print(df[df["曝光点击率"] > 0.3])

# Average UV of the search terms:
# print(df["搜索uv"].mean())

high_ctr = df[df["曝光点击率"] > 0.3]
print(high_ctr)
high_ctr.plot()
plt.show()

# Give the search UV / PV columns English names.
english_names = {"搜索uv": "search uv", "搜索pv": "search pv"}
df = df.rename(columns=english_names)
import pandas as pd
import matplotlib.pyplot as plt
from src.utils.utils import utils

# Plotting walk-through on the NO2 air-quality data set.
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/04_plotting.html
#
# index_col: column(s) to use as the row labels of the DataFrame, given
#   either as a string name or as a column index.
#   Note: index_col=False forces pandas NOT to use the first column as the
#   index, while index_col=0 makes the first (0th) column the index.
# parse_dates: convert the dates in the column to Timestamp objects.
#
# Variant without an explicit index column:
# air_quality = pd.read_csv(utils.getDataDir("/pandas/air_quality_no2.csv"), parse_dates=True)
no2_csv = utils.getDataDir("/pandas/air_quality_no2.csv")
air_quality = pd.read_csv(no2_csv, index_col=0, parse_dates=True)
# print(air_quality)
print(air_quality)
print(air_quality.dtypes)

# A quick visual check of the whole table.
air_quality.plot()
plt.show()

# Plot only the column holding the Paris data.
paris = air_quality["station_paris"]
paris.plot()
plt.show()

# Visually compare the NO2 values measured in London versus Paris with a
# scatter plot.
scatter_options = {"x": "station_london", "y": "station_paris", "alpha": 0.5}
air_quality.plot.scatter(**scatter_options)
import pandas as pd
import matplotlib.pyplot as plt
from src.utils.utils import utils

# Income report: load the income-detail CSV and print the total, the daily
# average and the per-ad-type totals, each formatted with two decimals.
df_income = pd.read_csv(
    utils.getDataDir("/pandas/头马助手收入明细2020-03-25至2020-04-24.csv"))
print(df_income)

# Column keys may contain spaces; print df_income.keys() to check them.
print("近31天总收入")
print(format(df_income["总收入(元)"].sum(), ".2f"))

print("近31天平均每天收入")
# Previously bound to the name `all`, which shadowed the builtin all().
average_daily_income = format(df_income["总收入(元)"].mean(), ".2f")
print(average_daily_income)

print("近31天原生视频总收入")
# format() produces a str, not a number.
原生视频 = format(df_income["原生视频"].sum(), ".2f")
print(原生视频)

print("近31天格子广告总收入")
格子广告 = format(df_income["格子广告"].sum(), ".2f")
# Fix: the original called print() with no argument here, so a blank line
# was printed instead of the computed total.
print(格子广告)

print("近31天banner总收入")
banner = format(df_income["banner"].sum(), ".2f")
print(banner)

# series = pd.Series([原生视频,格子广告,banner], index=["原生视频", "格子广告", "banner"], name="总收入")
import pytesseract

# Language data files: https://tesseract-ocr.github.io/tessdoc/Data-Files
# Error noted by the author:
#   Error opening data file /usr/local/share/tessdata/chi_sim.traineddata
from src.utils.utils import utils

# Try the top-level `Image` module first and fall back to the one shipped
# inside the PIL package.
try:
    import Image
except ImportError:
    from PIL import Image

image_path = utils.getDataDir("/images/gushi.png")

# CASE 1
# image = Image.open(image_path)
# Apply a threshold filter to the image, then save it:
# image = image.point(lambda x: 0 if x<143 else 255)
# image.save('./new.png')
# `lang` selects Simplified Chinese:
# text = pytesseract.image_to_string(image, lang='chi_sim')
# print(text)

# CASE 2
# Process a batch of images through a text file:
# text = pytesseract.image_to_string('../../../../data/images/images.txt', lang='chi_sim')
# print(text)

# CASE 3
# Process the image directly from its path, skipping Image.open
import pandas as pd
from src.utils.utils import utils

# Read data from a CSV file.
# Contents of pd_csv.csv:
# 1,2,3
# 2,3,4
# 7,8,9
df = pd.read_csv(utils.getDataDir("/pandas/pd_csv.csv"))
print(df)
# Printed result (the first CSV row ends up as the column header):
#    1  2  3
# 0  2  3  4
# 1  7  8  9

# Read a large amount of data from a CSV file.
df = pd.read_csv(utils.getDataDir("/pandas/pandas.csv"))
print(df)
print(df.head(3))

# Check how pandas interpreted each column's data type.
# `dtypes` is an attribute of a DataFrame and Series, so no brackets are used.
print(df.dtypes)

# Read data from Excel.
# Original author's note: pip3.7 install xlrd if you hit
# "ImportError: Missing optional dependency 'xlrd'".
df = pd.read_excel(utils.getDataDir("/pandas/simple.xlsx"), sheet_name="Sheet1")
print(df)

# Fix: DataFrame.info() writes its report itself and returns None, so the
# original print(df.info()) emitted a stray "None" line after the report.
df.info()