def fearture_engineering():
    """Load the Titanic data, impute missing ages and one-hot encode features.

    The ``pclass``, ``age`` and ``sex`` columns are used as features and
    ``survived`` as the target. Missing ``age`` values are replaced by the
    column mean, the data is split 75/25 into train/test sets, and the
    feature rows are encoded with a ``DictVectorizer``.

    :return: (x_train, x_test, y_train, y_test) where the two ``x`` entries
        are dense arrays produced by the vectorizer.
    """
    # Load the data.
    titan = pd.read_csv(
        "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Select the feature columns and the target column. The copy makes the
    # imputation below operate on an independent frame instead of a view.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']

    # Fill missing ages with the mean age (plain assignment instead of a
    # chained in-place fillna, which may silently modify a temporary).
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)

    # Convert the frames to lists of row dicts. The vectorizer is fitted on
    # the training rows only; the test rows are merely transformed so both
    # arrays share the same column layout.
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))

    # Inspect the encoded data. ``feature_names_`` is the fitted attribute
    # holding the column names.
    print(vectorizer.feature_names_)
    print(x_train)

    return (x_train, x_test, y_train, y_test)
def decision():
    """Predict Titanic survival with a decision tree and export the tree.

    Loads the data, imputes missing ages with the mean, splits 75/25 into
    train/test sets, one-hot encodes the features, fits a
    ``DecisionTreeClassifier``, prints its accuracy on the test set and
    writes the tree structure to ``./tree.dot``.

    :return: None
    """
    # Imported locally so the corrected function name is in scope even if
    # the file's import block only provides the misspelled original.
    from sklearn.tree import export_graphviz

    # Load the data.
    titan = pd.read_csv(
        "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Select the feature columns and the target column. The copy makes the
    # imputation below operate on an independent frame instead of a view.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']

    # Fill missing ages with the mean age.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)

    # Fit the encoder on the training rows only, then reuse it for the test
    # rows so both arrays have identical columns.
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    print(vectorizer.feature_names_)
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))
    print(x_train)

    # Fit the decision tree.
    dec = DecisionTreeClassifier()
    dec.fit(x_train, y_train)

    # Accuracy on the held-out test set.
    print("预测的准确率:", dec.score(x_test, y_test))

    # Export the tree structure. The feature names come from the fitted
    # vectorizer so they match the encoded columns one-to-one.
    export_graphviz(
        dec,
        out_file="./tree.dot",
        feature_names=vectorizer.feature_names_,
    )
def test_datadistribution():
    """Check the title and axis labels drawn by ``datadistribution``.

    Reads the descriptors dataset, plots the ``e_gap_alpha`` column via
    ``data_distribution.datadistribution`` and asserts on the title and the
    x/y labels of the current matplotlib axes.
    """
    data = pd.read_csv('../../documentation/data/DescriptorsDataset.csv')
    data_distribution.datadistribution(data['e_gap_alpha'])

    # Inspect the axes the plotting call left as current.
    axes = plt.gca()
    xname = axes.get_xlabel()
    yname = axes.get_ylabel()
    title = axes.get_title()

    assert title == 'The Distribution of Bandgap', 'Error: The title of figure is wrong'
    assert yname == '$Counts$', 'Error: The y label is not Counts'
    assert xname == '$<E_g> \\ [eV]$', 'Error: The x label is not right'
#### IMPORT THE DATA SETS ####

## I. SUMMARY OF COVID-19 CASES IN ONTARIO SCHOOLS
url = "https://data.ontario.ca/dataset/b1fef838-8784-4338-8ef9-ae7cfd405b41/resource/7fbdbb48-d074-45d9-93cb-f7de58950418/download/schoolcovidsummary.csv"
df_summary = pd.read_csv(url)

#### Change the reported_date column to a datetime object and drop collected_date
df_summary["reported_date"] = pd.to_datetime(df_summary["reported_date"])
df_summary.drop("collected_date", axis=1, inplace=True)

#-------------------------------------------------------------------------------#

## II. ONTARIO SCHOOLS with COVID-19 CASES by DATE
url = "https://data.ontario.ca/dataset/b1fef838-8784-4338-8ef9-ae7cfd405b41/resource/8b6d22e2-7065-4b0f-966f-02640be366f2/download/schoolsactivecovid.csv"

# Try the latin-1 read first and fall back to the default encoding only for
# decoding/parsing failures. Other errors (e.g. the download failing)
# propagate instead of being hidden by a bare ``except``.
try:
    df_active = pd.read_csv(url, encoding='latin-1')
except (UnicodeError, pd.errors.ParserError):
    df_active = pd.read_csv(url)

### Change 'reported_date' to date time and drop collected_date
df_active["reported_date"] = pd.to_datetime(df_active["reported_date"])
df_active.drop("collected_date", axis=1, inplace=True)

## Change the name of column 'school', to 'School' and 'total_confirmed_cases' to 'Active Cases'
df_active = df_active.rename(
    {
        "school": "School",
        "total_confirmed_cases": "Active Cases",
    },
    axis=1)

#### CLEAN UP THE ACTIVE DATA SET #####
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#data_folder = Path("Documents/corona")

# Load the coordinate table; the code below reads its ``longitude`` and
# ``latitude`` columns.
df = pd.read_csv('Documents/corona/coordinates.csv')

# Preview the first rows of the loaded frame (``head`` is a DataFrame
# method, not a pandas module function).
print(df.head())

# Bounding box of the data as (min longitude, max longitude,
# min latitude, max latitude).
BBox = (
    df.longitude.min(),
    df.longitude.max(),
    df.latitude.min(),
    df.latitude.max(),
)