Example #1
0
def execute():
    """Fit and plot survival-vs-search-duration curves for male subjects
    aged 21-29.

    Loads (age, sex, search_hours, survived) from the ISRID database,
    removes extreme-duration outliers via local outlier factors, fits a
    sigmoid survival model twice (with and without a 1000 h cutoff), and
    overlays the fitted curves (with 95% credible bands) on the empirical
    survival rate computed per 100 h bin.
    """
    data = read_data(
        "sqlite:///../data/isrid-master.db", Subject.age, Subject.sex, Incident.search_hours, Subject.survived
    )
    data = data[data.search_hours > 0]

    time_max = 1000
    # Column vector of hour ticks 0..time_max used to evaluate/plot the fits.
    time_ticks = np.linspace(0, time_max, time_max + 1)[:, None]
    # Restrict to ages [21, 30) with sex == 1
    # (sex == 1 presumably codes male, per the plot title — confirm).
    data = data[(21 <= data.age) & (data.age < 30) & (data.sex == 1)]

    # Preprocessing
    # The anomaly detection is slow and memory-intensive
    # There is probably a smarter way to calculate the distances
    # FIX: Series.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement (available since pandas 0.24).
    times = data["search_hours"].to_numpy()
    lof = local_outlier_factors(times)

    indices, threshold = [], 100  # Threshold is arbitrary
    # There is an extension to map the LOF on a
    # closed range from 0 to 1
    # Only durations beyond the plotting window are candidates for removal;
    # among those, drop the ones the LOF flags as anomalous.
    for index, time in enumerate(times):
        if time > time_max:
            p = lof(index)
            if p > threshold:
                print("t = {:.3f} h, LOF = {:.3f}".format(time, p))
                indices.append(index)

    data.drop(data.index[indices], inplace=True)

    # Fitting: once truncated at 1000 h, once on the full range,
    # to show how the censoring choice changes the fitted curve.
    lines = []
    for cutoff in [1000, float("inf")]:
        label = "Cutoff at {} h".format(cutoff)
        alpha, beta_samples = fit(data, label, data.search_hours < cutoff)
        beta_mean = np.mean(beta_samples)

        data_ = data.copy()
        predictions = sigmoid(data_["search_hours"], alpha, beta_mean)
        # Brier score: mean squared error of the predicted survival
        # probabilities (lower is better).
        bs = brier_score(data_["survived"], predictions)
        print("  BS = {:.3f}".format(bs))

        # One curve per posterior beta sample; plot the posterior mean line.
        y = sigmoid(time_ticks.T, alpha, beta_samples)
        line, *_ = plt.plot(time_ticks, y.mean(axis=0), label=label + " ({:.3f})".format(bs), alpha=0.8)
        lines.append(line)

        # Shade the central 95% band across the posterior samples.
        quantiles = mquantiles(y, [0.025, 0.975], axis=0)
        plt.fill_between(time_ticks[:, 0], *quantiles, alpha=0.6, color=line.get_color())

    # Empirical Distribution: survival rate per 100 h bin, NaN for empty bins
    # so matplotlib simply skips them.
    start, end, width = 0, time_max, time_max // 10
    probabilities = []
    for left in range(start, end, width):
        right = left + width
        cases = data[(data.search_hours >= left) & (data.search_hours < right)]

        print(len(cases))
        if len(cases) > 0:
            probabilities.append(sum(cases["survived"]) / len(cases))
        else:
            probabilities.append(float("nan"))

    # Plot each empirical rate at its bin midpoint.
    plt.scatter([x + width / 2 for x in range(start, end, width)], probabilities)

    plt.legend(handles=lines)
    plt.title("Survival Curves Over Time (Male Subjects, 21 - 30 Years Old)")
    plt.xlabel("Search Duration (hours)")
    plt.ylabel("Probability of Survival")
    plt.xlim(0, time_max)
    plt.ylim(0, 1)
    plt.grid(True)
    # plt.savefig('../doc/figures/survival-curves-male-empirical.svg',
    #             transparent=True)
    plt.show()
    for column in range(3):
        category = categories[row*3 + column]
        ax = axes[row, column]
        print(category)

        subquery = query.filter(func.upper(Group.category) == category)
        subquery = subquery.filter(Incident.total_hours > 0)
        subquery = subquery.order_by(Incident.total_hours.asc())

        times = subquery.from_self(Incident.total_hours).all()
        times = list(map(lambda t: t[0].total_seconds()/3600, times))
        survivals = subquery.from_self(Subject.survived).all()
        survivals = [survival for survival, *_ in survivals]

        indices = []
        lof = local_outlier_factors(times)
        for index, time in enumerate(times):
            if time > 21*24:
                indices.append(index)
            elif lof(index) > 100:
                indices.append(index)

        print('Removing {} outliers'.format(len(indices)))
        times = [time for index, time in enumerate(times)
                 if index not in indices]
        survivals = [survival for index, survival in enumerate(survivals)
                     if index not in indices]

        alpha = np.log(1/0.99 - 1)
        beta = pm.Beta('beta', 1, 2, 1e-3)
        prob = pm.Lambda('prob', lambda t=times, a=alpha, b=beta: