# Plot the data and curve fit fig, ax = plt.subplots(1,1) ax.plot(sorted_data, cprob, c='0.75', alpha=0.75, linestyle='-', linewidth=2.) ax.set_ylabel('Cumulative probability') ax.set_ylim((0, 1.)) if data_label: ax.set_xlabel(data_label) return ax.figure if __name__ == '__main__': train = pd.read_csv('./data/train.csv', index_col='Id') #test = pd.read_csv('./data/test.csv', index_col='Id') train = encode_categorical(train) #plot joint plots of each variable with Hazard hazard_jointplot(train) # Usage - to show only high hazard plots: #threshold = math.exp(3) #hazard_jointplot(train, './plots/high/', threshold=threshold) # plot a histogram of the hazard ax = sns.distplot(train.Hazard, fit=expon, kde=False) ax.figure.savefig('./plots/hazard_hist.png') #plot the cumulative distibution & fit exponential curve g = plot_exp_fit(train.Hazard) g.savefig('./plots/hazard_exp_fit.png')
import xgboost as xgb

from gini import Gini
from preprocess import encode_categorical

# NOTE(review): `pd` (pandas) and `np` (numpy) are used below but their imports
# are not visible in this fragment — presumably imported earlier in the file;
# confirm, and add `import pandas as pd` / `import numpy as np` if absent.


# function used to binarize the hazards
def binarizer(x, threshold):
    """Return 1 if x is strictly greater than threshold, else 0."""
    return int(x > threshold)


# Load train/test sets, indexing rows by the 'Id' column.
train = pd.read_csv('./data/train.csv', index_col='Id')
test = pd.read_csv('./data/test.csv', index_col='Id')
# Feature column names: everything except the target 'Hazard'.
columns = train.drop(['Hazard'], axis=1).columns

# encode categorical variables as numbers
train = encode_categorical(train)
test = encode_categorical(test)

# Identifier for this experiment/run — used downstream (not visible here).
code_id = 105

# train a random forest
# NOTE(review): n / n_split look like model hyper-parameters consumed further
# down the file — their exact use is not visible in this fragment.
n = 500
n_split = 2

# Hazard thresholds at which the target will be binarized (one binary
# classification problem per threshold value 2..70).
low = np.arange(2,71)
#high = np.arange(35,75,5)
haz_bins = low #np.concatenate((low, high))
#weights = np.concatenate((np.tile([1], len(low)), np.tile([5], len(high))))
# Uniform weight of 1 per bin (the weighted variant above is disabled).
weights = np.tile([1], len(low))
# Accumulator for per-bin predictions: one column per hazard threshold,
# one row per test sample.
haz_pred_i = np.zeros((test.shape[0], haz_bins.shape[0]))