コード例 #1
0
ファイル: plot.py プロジェクト: jgeiman/DataScienceIntensive
    # Plot the data and curve fit    
    fig, ax = plt.subplots(1,1)
    ax.plot(sorted_data, cprob, c='0.75', alpha=0.75, linestyle='-', 
                linewidth=2.)
    ax.set_ylabel('Cumulative probability')
    ax.set_ylim((0, 1.))
    if data_label:
        ax.set_xlabel(data_label)
    return ax.figure
    

if __name__ == '__main__':
    # Load the training set (indexed by 'Id') and encode its categorical
    # columns in one step.  The test set is not needed for these plots; it
    # would be read with: pd.read_csv('./data/test.csv', index_col='Id')
    train = encode_categorical(
        pd.read_csv('./data/train.csv', index_col='Id'))

    # Joint plots of each variable with Hazard.
    # To show only the high-hazard plots instead:
    #     threshold = math.exp(3)
    #     hazard_jointplot(train, './plots/high/', threshold=threshold)
    hazard_jointplot(train)

    # Histogram of the hazard, drawn with fit=expon and the KDE disabled.
    hazard_hist_ax = sns.distplot(train.Hazard, fit=expon, kde=False)
    hazard_hist_ax.figure.savefig('./plots/hazard_hist.png')

    # Cumulative distribution of the hazard plus the exponential curve fit;
    # plot_exp_fit hands back the figure, which is then written to disk.
    exp_fit_fig = plot_exp_fit(train.Hazard)
    exp_fit_fig.savefig('./plots/hazard_exp_fit.png')
コード例 #2
0
import xgboost as xgb

from gini import Gini
from preprocess import encode_categorical

# function used to binarize the hazards
def binarizer(x, threshold):
    """Return 1 if ``x`` is strictly greater than ``threshold``, else 0.

    The comparison result is passed through ``int`` so callers receive a
    plain integer flag rather than a bool.
    """
    exceeds = x > threshold
    return int(exceeds)
     
# Load the train and test sets, each indexed by its 'Id' column.
train = pd.read_csv('./data/train.csv', index_col='Id')
test = pd.read_csv('./data/test.csv', index_col='Id')

# Column labels of the training frame with 'Hazard' removed; captured
# before the categorical encoding below is applied.
columns = train.drop('Hazard', axis=1).columns

# Encode categorical variables as numbers.
train, test = encode_categorical(train), encode_categorical(test)

code_id = 105

# Model settings.
# NOTE(review): the original comment here read "train a random forest",
# but the training code is outside this chunk -- confirm which model
# `n` and `n_split` actually configure.
n = 500
n_split = 2

# Bin values 2..70 (inclusive), each with a weight of 1.
# An alternative scheme kept from the original, currently disabled:
#     high = np.arange(35, 75, 5)
#     haz_bins = np.concatenate((low, high))
#     weights = np.concatenate((np.tile([1], len(low)), np.tile([5], len(high))))
low = np.arange(2, 71)
haz_bins = low
weights = np.ones(len(low), dtype=int)

# Zero-filled buffer: one row per test row, one column per bin.
haz_pred_i = np.zeros((len(test), len(haz_bins)))