Exemple #1
0
def fearture_engineering():
    """Load the Titanic data and build vectorized train/test feature sets.

    Missing ``age`` values are filled with the column mean, the data is split
    into training (75%) and test (25%) sets, and the ``pclass``/``age``/``sex``
    columns are converted to a dense array with ``DictVectorizer`` (which
    one-hot encodes categorical values).

    :return: (x_train, x_test, y_train, y_test)
    """
    # Load the data.
    titan = pd.read_csv(
        "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    # Select the feature columns and the target column.  Copy the slice so the
    # fill below modifies an independent frame rather than a view of `titan`.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']

    # Fill missing ages with the mean age.  Assigning the result back is
    # reliable, unlike an in-place fillna on a column selected from a slice.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)
    # Convert each DataFrame to a list of per-row dicts for the vectorizer.
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    # Only transform the test set: re-fitting could produce a different column
    # layout than the one learned from the training data.
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))

    # Inspect the data.  Newer scikit-learn releases only provide
    # get_feature_names_out(); older ones only get_feature_names().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print(x_train)
    return (x_train, x_test, y_train, y_test)
Exemple #2
0
def decision():
    """Predict Titanic passenger survival with a decision tree.

    Loads the Titanic data, fills missing ages with the mean age, splits the
    data 75/25, vectorizes the ``pclass``/``age``/``sex`` features, fits a
    ``DecisionTreeClassifier``, prints its accuracy on the test set and
    exports the fitted tree to ``./tree.dot``.

    :return: None
    """
    # Imported locally so the corrected function name is guaranteed in scope.
    from sklearn.tree import export_graphviz

    # Load the data.
    titan = pd.read_csv(
        "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    # Select the feature columns and the target column.  Copy the slice so the
    # fill below modifies an independent frame rather than a view of `titan`.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']

    # Fill missing ages with the mean age.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)
    # Convert each DataFrame to a list of per-row dicts for the vectorizer.
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only get_feature_names().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    # Only transform the test set so its columns match the training columns.
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))
    print(x_train)

    # Fit the decision tree.
    dec = DecisionTreeClassifier()
    dec.fit(x_train, y_train)
    # Accuracy on the held-out test set.
    print("预测的准确率:", dec.score(x_test, y_test))

    # Export the tree structure.  The feature names must line up one-to-one
    # with the vectorized columns, so take them from the vectorizer.
    export_graphviz(dec, out_file="./tree.dot", feature_names=feature_names)
def test_datadistribution():
    """Check the title and axis labels of the plot made by ``datadistribution``.

    Reads the descriptors data set, passes its ``e_gap_alpha`` column to
    ``data_distribution.datadistribution`` and asserts on the title and the
    x/y labels of the current matplotlib axes.
    """
    data = pd.read_csv('../../documentation/data/DescriptorsDataset.csv')
    data_distribution.datadistribution(data['e_gap_alpha'])
    # The assertions inspect whatever axes are current after the call above.
    axes = plt.gca()
    xname = axes.get_xlabel()
    yname = axes.get_ylabel()
    title = axes.get_title()
    assert title == 'The Distribution of Bandgap', 'Error: The title of figure is wrong'
    assert yname == '$Counts$', 'Error: The y label is not Counts'
    assert xname == '$<E_g> \\ [eV]$', 'Error: The x label is not right'
Exemple #4
0
#### IMPORT THE DATA SETS ####

## I. SUMMARY OF COVID-19 CASES IN ONTARIO SCHOOLS
url = "https://data.ontario.ca/dataset/b1fef838-8784-4338-8ef9-ae7cfd405b41/resource/7fbdbb48-d074-45d9-93cb-f7de58950418/download/schoolcovidsummary.csv"
df_summary = pd.read_csv(url)

#### Change the reported_date column to a datetime object and drop collected_date
df_summary["reported_date"] = pd.to_datetime(df_summary["reported_date"])
df_summary.drop("collected_date", axis=1, inplace=True)

#-------------------------------------------------------------------------------#

## II. ONTARIO SCHOOLS with COVID-19 CASES by DATE
url = "https://data.ontario.ca/dataset/b1fef838-8784-4338-8ef9-ae7cfd405b41/resource/8b6d22e2-7065-4b0f-966f-02640be366f2/download/schoolsactivecovid.csv"
# Read as Latin-1 first and fall back to pandas' default encoding only when
# that read fails to decode or parse (UnicodeDecodeError and pandas'
# ParserError are both ValueError subclasses).  Other failures, such as
# network errors, propagate instead of being silently swallowed.
try:
    df_active = pd.read_csv(url, encoding='latin-1')
except ValueError:
    df_active = pd.read_csv(url)

### Change 'reported_date' to date time and drop collected_date
df_active["reported_date"] = pd.to_datetime(df_active["reported_date"])
df_active.drop("collected_date", axis=1, inplace=True)

## Change the name of column 'school', to 'School' and 'total_confirmed_cases' to 'Active Cases'
df_active = df_active.rename(
    {
        "school": "School",
        "total_confirmed_cases": "Active Cases"
    }, axis=1)

#### CLEAN UP THE ACTIVE DATA SET #####
Exemple #5
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

#data_folder = Path("Documents/corona")
# Load the coordinates table; the code below reads its `longitude` and
# `latitude` columns.
df = pd.read_csv('Documents/corona/coordinates.csv')
# Preview the first rows (only displayed in an interactive session/notebook).
df.head()

# Bounding box of the data as (min longitude, max longitude,
# min latitude, max latitude).
BBox = (df.longitude.min(), df.longitude.max(),
        df.latitude.min(), df.latitude.max())