def execute():
    """Fit and plot survival curves against search duration for one cohort.

    Steps, in order:

    1. Load age, sex, search hours and survival via ``read_data`` and keep
       rows with positive search hours, ``21 <= age < 30`` and ``sex == 1``
       (the plot title labels this cohort "Male Subjects").
    2. Drop rows whose search time exceeds ``time_max`` *and* whose local
       outlier factor exceeds an arbitrary threshold.
    3. Call ``fit`` once per cutoff (1000 h and no cutoff), print the Brier
       score of the mean-parameter prediction, and plot the mean curve with
       a 2.5%-97.5% quantile band.
    4. Overlay the empirical survival fraction in ten equal-width bins.

    The function takes no arguments, returns nothing, and ends by showing
    the figure with ``plt.show()``.
    """
    data = read_data(
        "sqlite:///../data/isrid-master.db",
        Subject.age,
        Subject.sex,
        Incident.search_hours,
        Subject.survived,
    )
    data = data[data.search_hours > 0]

    time_max = 1000
    # Column vector of integer hour ticks 0..time_max used for plotting.
    time_ticks = np.linspace(0, time_max, time_max + 1)[:, None]
    data = data[(21 <= data.age) & (data.age < 30) & (data.sex == 1)]

    # Preprocessing
    # The anomaly detection is slow and memory-intensive.
    # There is probably a smarter way to calculate the distances.
    # `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
    times = data["search_hours"].values
    lof = local_outlier_factors(times)
    outlier_positions = []
    threshold = 100  # Threshold is arbitrary
    # There is an extension to map the LOF on a closed range from 0 to 1.
    for position, hours in enumerate(times):
        # Only searches longer than the plotted range are outlier candidates,
        # so the LOF is evaluated for those alone.
        if hours <= time_max:
            continue
        factor = lof(position)
        if factor > threshold:
            print("t = {:.3f} h, LOF = {:.3f}".format(hours, factor))
            outlier_positions.append(position)
    # Reassign instead of dropping in place: `data` is the result of boolean
    # filtering, and in-place mutation of it raises SettingWithCopyWarning.
    data = data.drop(data.index[outlier_positions])

    # Fitting
    lines = []
    for cutoff in [1000, float("inf")]:
        label = "Cutoff at {} h".format(cutoff)
        alpha, beta_samples = fit(data, label, data.search_hours < cutoff)
        beta_mean = np.mean(beta_samples)

        # Score the point estimate (mean beta) on the full cohort.
        predictions = sigmoid(data["search_hours"], alpha, beta_mean)
        bs = brier_score(data["survived"], predictions)
        print(" BS = {:.3f}".format(bs))

        # One curve per beta sample; plot their mean and a quantile band.
        y = sigmoid(time_ticks.T, alpha, beta_samples)
        line, *_ = plt.plot(
            time_ticks,
            y.mean(axis=0),
            label=label + " ({:.3f})".format(bs),
            alpha=0.8,
        )
        lines.append(line)
        quantiles = mquantiles(y, [0.025, 0.975], axis=0)
        plt.fill_between(
            time_ticks[:, 0], *quantiles, alpha=0.6, color=line.get_color()
        )

    # Empirical Distribution
    start, end, width = 0, time_max, time_max // 10
    bin_lefts = range(start, end, width)
    probabilities = []
    for left in bin_lefts:
        right = left + width
        cases = data[(data.search_hours >= left) & (data.search_hours < right)]
        print(len(cases))
        if len(cases) > 0:
            probabilities.append(sum(cases["survived"]) / len(cases))
        else:
            # Empty bin: NaN leaves a gap in the scatter plot.
            probabilities.append(float("nan"))
    # Place each marker at the centre of its bin.
    plt.scatter([left + width / 2 for left in bin_lefts], probabilities)

    plt.legend(handles=lines)
    plt.title("Survival Curves Over Time (Male Subjects, 21 - 30 Years Old)")
    plt.xlabel("Search Duration (hours)")
    plt.ylabel("Probability of Survival")
    plt.xlim(0, time_max)
    plt.ylim(0, 1)
    plt.grid(True)
    # plt.savefig('../doc/figures/survival-curves-male-empirical.svg',
    #             transparent=True)
    plt.show()
for column in range(3): category = categories[row*3 + column] ax = axes[row, column] print(category) subquery = query.filter(func.upper(Group.category) == category) subquery = subquery.filter(Incident.total_hours > 0) subquery = subquery.order_by(Incident.total_hours.asc()) times = subquery.from_self(Incident.total_hours).all() times = list(map(lambda t: t[0].total_seconds()/3600, times)) survivals = subquery.from_self(Subject.survived).all() survivals = [survival for survival, *_ in survivals] indices = [] lof = local_outlier_factors(times) for index, time in enumerate(times): if time > 21*24: indices.append(index) elif lof(index) > 100: indices.append(index) print('Removing {} outliers'.format(len(indices))) times = [time for index, time in enumerate(times) if index not in indices] survivals = [survival for index, survival in enumerate(survivals) if index not in indices] alpha = np.log(1/0.99 - 1) beta = pm.Beta('beta', 1, 2, 1e-3) prob = pm.Lambda('prob', lambda t=times, a=alpha, b=beta: