# Simulate right-censored survival data from an unusual hazard function.
import numpy
from samplers import HazardSampler

# Fix the RNG seed so the run is reproducible, and pick the sample size.
numpy.random.seed(1)
m = 1000


# The deliberately odd, periodic hazard used for the simulation.
def hazard(t):
    """Return the hazard rate exp(sin(t) - 2) evaluated at time ``t``."""
    return numpy.exp(numpy.sin(t) - 2.0)


# Draw m failure times, one sampler call per observation.
sampler = HazardSampler(hazard)
draws = [sampler.draw() for _ in range(m)]
failure_times = numpy.array(draws)

# Apply non-informative right censoring, just to demonstrate how it's done:
# each observation gets an independent uniform censoring time on [0, 25).
censor_times = numpy.random.uniform(0.0, 25.0, size=m)
# Observed time is whichever of failure/censoring comes first.
y = numpy.minimum(failure_times, censor_times)
# c is 1.0 where the failure happened before the censoring time, else 0.0.
c = (failure_times < censor_times).astype(float)

# Plotting tools for inspecting the simulated data.
from matplotlib import pyplot
from statsmodels.distributions import ECDF

# Histogram of the raw (uncensored) failure times, saved to disk and shown.
pyplot.hist(failure_times, bins=50)
pyplot.title('Uncensored Failure Times')
pyplot.savefig('uncensored_hist.png', transparent=True)
pyplot.show()

# Plot a histogram of censored failure times from this hazard function
# (the plotting code for this step is not present in this section).
import time

# Raise FloatingPointError on any floating-point problem (overflow,
# divide-by-zero, invalid, underflow) instead of only warning.
numpy.seterr(all='raise')

# Fix the RNG seed for reproducibility and choose the sample size.
numpy.random.seed(1)
m = 1000

# Cache files for the simulated data and the fitted model, keyed by m.
data_filename = 'log_hazard_data' + str(m) + '.pickle'
modelfilename = 'log_hazard_model' + str(m) + '.pickle'

# Set to True to regenerate the data even when a cached file exists.
redo = False

if os.path.exists(data_filename) and not redo:
    # Reuse the cached simulation. Pickle streams are bytes, so the file
    # must be opened in binary mode (text mode fails on Python 3).
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that this script itself produced.
    with open(data_filename, 'rb') as infile:
        m, y, c, censor_times, failure_times = pickle.load(infile)
else:
    # Censoring times are drawn first, uniformly on [0, 100).
    censor_times = numpy.random.uniform(0.0, 100.0, size=m)
    baseline_hazard = lambda t: numpy.exp(numpy.sin(t) - 2.0)
    # NOTE(review): the meaning of the 10.0 and 20.0 arguments is defined
    # by HazardSampler, which is not visible here -- confirm before tuning.
    sampler = HazardSampler(baseline_hazard, 10.0, 20.0)
    failure_times = numpy.array([sampler.draw() for _ in range(m)])
    # Observed time is the earlier of failure and censoring.
    y = numpy.minimum(failure_times, censor_times)
    # c is 1.0 where the failure preceded the censoring time, else 0.0.
    c = 1.0 * (censor_times > failure_times)
    # Binary mode for the same reason as the read above.
    with open(data_filename, 'wb') as outfile:
        pickle.dump((m, y, c, censor_times, failure_times), outfile)

# Quick visual check of the observed (possibly censored) times.
pyplot.hist(y, bins=50)
pyplot.show()

# Fit the model, timing only the fit itself.
t0 = time.time()
model = GeneralizedRegressor(
    base_regressor=Earth(thresh=1e-7, max_terms=100, smooth=True,
                         allow_linear=False, penalty=0),
    loss_function=MidpointLogHazardLossFunction(10))
model.fit(X=None, y=y, c=c)
# Stop the clock before serializing so disk I/O is not counted as fit time.
t1 = time.time()

# Persist the fitted model (binary mode, as required for pickle streams).
with open(modelfilename, 'wb') as outfile:
    pickle.dump(model, outfile)

# Single-argument call form prints identically on Python 2 and Python 3.
print('Total fitting time: %f seconds' % (t1 - t0))

# Time grid from 0.0 up to (but excluding) 30.0 in steps of 0.1.
t = numpy.arange(0.0, 30.0, .1)