import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer


def read_file():
    file_content = pd.read_csv('train.csv')
    exc_cols = [u'Id', u'Response']
    cols = [c for c in file_content.columns if c not in exc_cols]
    train_datas = file_content.loc[:, cols]
    train_labels = file_content['Response'].values
    
    test_file = pd.read_csv('test.csv')
    test_ids = test_file['Id'].values
    test_datas = test_file.loc[:, [c for c in test_file.columns if c != u'Id']]

    # Fill missing values with a -1 sentinel
    test_datas = test_datas.fillna(-1)
    train_datas = train_datas.fillna(-1)
    all_datas = pd.concat([train_datas, test_datas], axis=0)
    
    # Split the columns into numeric and categorical groups
    categoricalVariables = ["Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5", "Product_Info_6", "Product_Info_7", "Employment_Info_2", "Employment_Info_3", "Employment_Info_5", "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4", "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7", "Insurance_History_1", "Insurance_History_2", "Insurance_History_3", "Insurance_History_4", "Insurance_History_7", "Insurance_History_8", "Insurance_History_9", "Family_Hist_1", "Medical_History_2", "Medical_History_3", "Medical_History_4", "Medical_History_5", "Medical_History_6", "Medical_History_7", "Medical_History_8", "Medical_History_9", "Medical_History_10", "Medical_History_11", "Medical_History_12", "Medical_History_13", "Medical_History_14", "Medical_History_16", "Medical_History_17", "Medical_History_18", "Medical_History_19", "Medical_History_20", "Medical_History_21", "Medical_History_22", "Medical_History_23", "Medical_History_25", "Medical_History_26", "Medical_History_27", "Medical_History_28", "Medical_History_29", "Medical_History_30", "Medical_History_31", "Medical_History_33", "Medical_History_34", "Medical_History_35", "Medical_History_36", "Medical_History_37", "Medical_History_38", "Medical_History_39", "Medical_History_40", "Medical_History_41"]
    all_file_data = all_datas.loc[:, [c for c in all_datas.columns if c not in categoricalVariables]]
    # Cast the categoricals to str so DictVectorizer one-hot encodes them later
    all_file_cate = all_datas.loc[:, categoricalVariables].astype(str)
 
    # Standardize the numeric features; fit_transform returns an ndarray,
    # so write the result back into the DataFrame
    scaler = StandardScaler()
    all_file_data.loc[:, :] = scaler.fit_transform(all_file_data)
    
    # Recombine the numeric and categorical parts, then split back into train and test
    n_train = file_content.shape[0]
    train_datas = pd.concat([all_file_data.iloc[:n_train], all_file_cate.iloc[:n_train]], axis=1)
    test_datas = pd.concat([all_file_data.iloc[n_train:], all_file_cate.iloc[n_train:]], axis=1)
    
    # One-hot encode with a single DictVectorizer so that train and test
    # share the same feature space
    vectorizer = DictVectorizer()
    train_datas = vectorizer.fit_transform(train_datas.to_dict('records')).toarray()
    test_datas = vectorizer.transform(test_datas.to_dict('records')).toarray()
    
    return (train_datas, train_labels, test_ids, test_datas)
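
A minimal usage sketch, not part of the original snippet: it assumes train.csv and test.csv sit in the working directory and plugs the returned arrays into a stock classifier. The RandomForestClassifier is an illustrative placeholder, not a model the original specifies.

from sklearn.ensemble import RandomForestClassifier

train_datas, train_labels, test_ids, test_datas = read_file()
clf = RandomForestClassifier(n_estimators=100, random_state=0)  # placeholder model choice
clf.fit(train_datas, train_labels)
predictions = clf.predict(test_datas)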
Example #2

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer

# Load the data
df = pd.read_csv("C:\\Users\\fukazu\\Documents\\IPython Notebooks\\deepanAlytics\\train.csv", header=None, nrows=10000)

# Data cleaning: drop every row that contains at least one NaN
df = df.dropna()

# Split into explanatory variables x and target variable y
# (header=None above, so the columns are labelled 0, 1, 2, ...)
x = df.loc[:, 2:]
y = df[1]

# Convert the categorical variables to strings, so DictVectorizer treats
# them as categories rather than numbers
x.loc[:, 4:9] = x.loc[:, 4:9].astype(str)

# Quantify the categorical variables: strings become one-hot indicator
# columns, numeric values pass through unchanged
x = DictVectorizer(sparse=False).fit_transform(x.to_dict('records'))

# Train a GradientBoostingClassifier on the first 5000 rows
clf = GradientBoostingClassifier(max_depth=5)
clf.fit(x[:5000], y.iloc[:5000])

# Score on the held-out rows from index 5000 onward
print(clf.score(x[5000:], y.iloc[5000:]))
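
Both snippets hinge on the same DictVectorizer encoding rule, so a minimal self-contained sketch may help (the records below are made up for illustration): string values expand into one indicator column per category, while numeric values pass through as-is.

from sklearn.feature_extraction import DictVectorizer

records = [{'age': 33.0, 'city': 'Tokyo'},
           {'age': 41.0, 'city': 'Osaka'}]
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(records)
print(vec.get_feature_names_out())  # ['age' 'city=Osaka' 'city=Tokyo'] (scikit-learn >= 1.0)
print(X)  # [[33.  0.  1.]
          #  [41.  1.  0.]]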