def read_time_data(url):
    """
    Load the time of day at which each incident occurred.

    Arguments:
        url: A string representing the path to the database.

    Returns:
        A `pandas` dataframe built from `Incident.datetime` with two
        derived columns: `time` and `hour`. `time` holds the Python
        `datetime.time` part of each timestamp; rows whose time is exactly
        midnight are dropped (most of these indicate a date was available,
        but not a time). `hour` is `time` expressed in hours as a float
        (hours + minutes/60 + seconds/3600).
    """
    engine, session = database.initialize(url)
    timestamps = session.query(Incident.datetime)
    df = tabulate(timestamps)
    database.terminate(engine, session)

    def as_hours(clock):
        # Fractional hours since midnight for a `datetime.time` value.
        return clock.hour + clock.minute/60 + clock.second/3600

    midnight = datetime.time(0)
    df = df.assign(time=[moment.time() for moment in df.datetime])
    has_time_of_day = df.time != midnight
    df = df[has_time_of_day]
    df = df.assign(hour=[as_hours(clock) for clock in df.time])
    return df
def test_tabulation(self):
    """
    Tabulating a two-column query should yield those two columns, one row
    per `Subject`, and no `None` values in any cell.
    """
    df = tabulate(self.session.query(Subject.survived, Subject.age))

    # Column names follow the queried attributes, in query order.
    self.assertEqual(list(df.columns), ['survived', 'age'])

    # Row count is compared against the total number of subjects.
    self.assertEqual(len(df), self.session.query(Subject).count())

    # Every cell of every row must be populated.
    for row in df.itertuples(False):
        for cell in row:
            self.assertIsNotNone(cell)
def read_data(url):
    """
    Read incident duration, survival, and category data.

    Arguments:
        url: A string representing the database URL to connect to.

    Returns:
        The dataframe produced by `tabulate` for a query of
        `Incident.total_hours`, `Subject.survived`, and `Group.category`,
        joined through `Group` and `Subject`.
    """
    engine, session = database.initialize(url)
    wanted = (Incident.total_hours, Subject.survived, Group.category)
    query = session.query(*wanted)
    query = query.join(Group, Subject)
    table = tabulate(query)
    # The connection is released before the dataframe is handed back.
    database.terminate(engine, session)
    return table
def read_time_data(url):
    """
    Read the time of day of each incident from the database at `url`.

    Returns a dataframe derived from `Incident.datetime` with a `time`
    column (`datetime.time` values, rows at exactly midnight removed) and
    an `hour` column (the same time as fractional hours).
    """
    engine, session = database.initialize(url)
    df = tabulate(session.query(Incident.datetime))
    database.terminate(engine, session)

    times_of_day = [stamp.time() for stamp in df.datetime]
    df = df.assign(time=times_of_day)

    # Drop rows whose time component is exactly 00:00:00.
    df = df[df.time != datetime.time(0)]

    hours = []
    for t in df.time:
        hours.append(t.hour + t.minute/60 + t.second/3600)
    return df.assign(hour=hours)
def read_simple_data(url, exclude_singles=False, exclude_groups=False):
    """
    Read incident duration, survival, and category data.

    A useful shorthand.

    Arguments:
        url: A string representing the database URL to connect to.
        exclude_singles: A boolean indicating whether the query should
                         exclude subjects from groups with exactly one
                         member.
        exclude_groups: A boolean indicating whether the query should
                        exclude subjects from groups with more than one
                        member.

    Returns:
        A pandas dataframe containing the lost person data.

        The columns include `total_hours`, `survived`, `category`, `days`
        (the incident duration in days, as taken from `total_hours`), and
        `doa` (a boolean that is `True` if the subject did not survive).

        Cases with a negative timedelta `Incident.total_hours` are filtered
        out.

    Warning:
        If `exclude_singles` is `True` or `exclude_groups` is `True`, the
        function also needs to query the size of each `Group`, which may
        take a while (perhaps a minute).
    """
    engine, session = database.initialize(url)
    columns = Incident.total_hours, Subject.survived, Group.category, Group.id
    query = session.query(*columns).join(Group, Subject)
    df = tabulate(query)
    database.terminate(engine, session)

    if exclude_singles or exclude_groups:
        # HACK: looks up each group's size one row at a time.
        df['size'] = [Group.query.get(int(id_)).size for id_ in df.id]
        if exclude_singles:
            df = df[df['size'] > 1]
        if exclude_groups:
            df = df[df['size'] == 1]

    # Drop the helper columns. `axis` is passed by keyword because pandas 2.0
    # no longer accepts it positionally, and the drop is not done in place
    # because `df` may be a filtered slice of the tabulated frame.
    helper_columns = ['id']
    if 'size' in df:
        helper_columns.append('size')
    df = df.drop(helper_columns, axis=1)

    df = df.assign(
        days=[total_hours.total_seconds()/3600/24
              for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    # Negative durations are treated as invalid and discarded.
    df = df[0 <= df.days]
    return df
#!/usr/bin/env python3

import matplotlib.pyplot as plt
import numpy as np
import Orange
from pomegranate import *

import database
from database.models import Subject, Group, Incident, Weather
from database.processing import survival_rate, tabulate, export_as_orange


# Pull survival, group size, and average temperature for every subject that
# can be joined through its group and incident to a weather record.
engine, session = database.initialize('sqlite:///../data/isrid-master.db')
query = (
    session.query(Subject.survived, Group.size, Weather.avg_temp)
    .join(Group, Incident, Weather)
)
# NOTE(review): the second argument is presumably one not-null flag per
# queried column -- confirm against `database.processing.tabulate`.
df = tabulate(query, [True, True, True])
database.terminate(engine, session)

# Overall fraction of tabulated subjects that survived.
survivors = sum(df.survived)
print(survivors/len(df))
import pandas as pd

import database
from database.models import Incident, Group, Subject
from database.processing import tabulate


# Get data
# Path may vary based on your current working directory
engine, session = database.initialize('sqlite:///../../data/isrid-master.db')
query = session.query(
    Subject.survived,
    Incident.total_hours,
    Group.category,
    Group.id,
    Subject.age,
    Subject.sex,
).join(Group, Incident)
df = tabulate(query)
# Bad hack: the group size is looked up one row at a time.
df['size'] = [Group.query.get(int(group_id)).size for group_id in df.id]
# Incident duration converted from a timedelta to fractional days.
df['days'] = [elapsed.total_seconds()/3600/24 for elapsed in df.total_hours]
database.terminate(engine, session)


# Build UI
plot = figure(
    y_range=Range1d(bounds='auto', start=0, end=1 + 1e-3),
    plot_width=1000,
    title='Lost Person Survival Over Time',
)
# Used for notifying the user when the constraints exclude all cases,
# rather than failing silently.
status = Paragraph()

# Create a list of checkboxes for enabling each category
def execute():
    """
    Plot Kaplan-Meier survival curves for the most common categories.

    Reads incident duration, survival, category, and group size from the
    database, discards cases with a negative duration, and then:

    * draws a 2x2 grid with one panel per category (the four most common),
      each showing separate curves for subjects in groups (size > 1) and
      singles (size == 1), saved to
      `../doc/figures/kaplan-meier/km-grid-large.svg`;
    * draws one combined figure from the fitters kept per category, saved
      to `../doc/figures/kaplan-meier/km-combined-large.svg`.

    Both figures are then shown interactively.
    """
    matplotlib.rc("font", size=20)

    engine, session = database.initialize("sqlite:///../data/isrid-master.db")
    # Query with Group.size may take awhile, at least for Charles
    # Not sure why
    query = session.query(
        Incident.total_hours, Subject.survived, Group.category, Group.size
    ).join(Group, Subject)
    print("Tabulating query... may take awhile for unknown reasons.")
    df = tabulate(query)
    print("Done tabulating.")
    print(df.describe())
    database.terminate(engine, session)

    df = df.assign(
        days=[total_hours.total_seconds() / 3600 / 24 for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    # Negative durations are treated as invalid and discarded.
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))
    categories = Counter(df.category)
    kmfs = []
    options = {
        "show_censors": True,
        "censor_styles": {"marker": "|", "ms": 6},
        "censor_ci_force_lines": False,
    }

    # One panel per category, filled row by row.
    most_common = categories.most_common(rows * columns)
    for ax, (category, _) in zip(axes.flat, most_common):
        print("Category:", category)
        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100 * (1 - Ndoa / N)

        # The group-size column must be read with brackets: attribute access
        # (`df_.size`) resolves to `DataFrame.size`, the total element count,
        # which would turn the mask into a single boolean.
        group_size = df_["size"]
        grp = df_[group_size > 1]
        sng = df_[group_size == 1]

        # NOTE(review): the same fitter is fitted twice, so the object kept
        # in `kmfs` holds only the "Singles" fit and that is what the
        # combined figure shows -- confirm this is intended.
        kmf = KaplanMeierFitter()
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        kmfs.append(kmf)

        # Cap the time axis at 30 days, or slightly past the data if shorter.
        ax.set_xlim(0, min(30, 1.05 * ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    # Leave room above the panels for the figure title.
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg", transparent=True)

    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs[: rows * columns]:
        kmf.plot(ci_show=False, show_censors=True, censor_styles={"marker": "|", "ms": 6}, ax=ax)
    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg", transparent=True)

    plt.show()
def main():
    """
    Plot the profile (size and survival rate) of the most common categories.

    Subjects from the ten most common categories are bucketed into ten-year
    age bins covering 0 to 100 years. For every (age bin, category) subgroup
    with more than `min_subgroup_size` subjects, the survival rate is drawn
    as a colored cell annotated with the subgroup size. The figure is saved
    to `../doc/figures/subject-data/category-profiles.svg` and then shown.
    """
    ## Read data
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')
    query = session.query(Subject.age, Group.category, Subject.survived)
    query = query.join(Group)
    df = tabulate(query)
    database.terminate(engine, session)

    ## Process subjects by category and age
    selected_categories = df.category.value_counts()[:10].index.tolist()
    df = df[df.category.isin(selected_categories)]

    age_bins = np.linspace(0, 100, 11)
    survival_rates = np.full((10, 10), np.nan, dtype=np.float64)
    subgroup_sizes = np.full((10, 10), 0, dtype=np.float64)
    min_subgroup_size = 10

    for category, group in df.groupby('category'):
        # `assign` returns a new frame, so the groupby slice is not mutated.
        group = group.assign(age_bin=np.digitize(group.age, age_bins))
        for age_index, subgroup in group.groupby('age_bin'):
            # `np.digitize` returns 0 for ages below the first edge and
            # `len(age_bins)` for ages at or above the last one. Those
            # subjects have no row in the matrices, so skip them instead of
            # wrapping around or indexing out of bounds.
            if not 1 <= age_index < len(age_bins):
                continue

            survivals = subgroup.survived.values.tolist()
            key = age_index - 1, selected_categories.index(category)
            if len(survivals) > min_subgroup_size:
                survival_rates[key] = sum(survivals)/len(survivals)
                subgroup_sizes[key] = len(survivals)

                # Debugging
                lower, upper = age_bins[age_index - 1], age_bins[age_index]
                print('{}, {} - {} years old'.format(category,
                      int(lower), int(upper)))
                print(' Survival rate: {:.3f}%'.format(
                      100*survival_rates[key]))
                print(' Number of subjects: {}'.format(
                      int(subgroup_sizes[key])))

    ## Plot survival rates and subgroup sizes
    canvas = plt.matshow(survival_rates, fignum=False, cmap='RdYlGn',
                         origin='lower')
    colorbar = plt.colorbar(canvas)
    colorbar.solids.set_edgecolor('face')
    colorbar.set_label('Survival Rate')

    # Annotate each cell with its subgroup size; empty cells get no label.
    x_positions = y_positions = np.arange(0, 10)
    for x in x_positions:
        for y in y_positions:
            plt.text(x, y, int(subgroup_sizes[y, x]) or '',
                     horizontalalignment='center',
                     verticalalignment='center')

    plt.title('Lost Person Category Profiles')
    plt.ylabel('Age (years)')
    plt.xlabel('Category')
    ax = plt.gca()
    ax.xaxis.tick_bottom()
    # Ticks sit on the cell edges so the labels mark the bin boundaries.
    # The builtin `int` replaces the `np.int` alias removed from NumPy.
    plt.yticks(np.linspace(0, 10, 11) - 0.5, age_bins.astype(int))
    # Only label as many columns as there are categories, so positions and
    # labels always have the same length.
    plt.xticks(np.arange(len(selected_categories)), selected_categories,
               rotation=60)
    plt.subplots_adjust(bottom=0.2)
    plt.tight_layout()
    plt.savefig('../doc/figures/subject-data/category-profiles.svg',
                transparent=True)
    plt.show()
""" plots -- Plots of subject data """ import matplotlib.pyplot as plt import database from database.models import Subject from database.processing import tabulate ## Fetch data engine, session = database.initialize('sqlite:///../data/isrid-master.db') query = session.query(Subject.age, Subject.weight, Subject.height) query = query.filter(Subject.age != None) df = tabulate(query, not_null=False) database.terminate(engine, session) ## Make weight vs. age plot color = '#177788' df_filtered = df[df.weight.notnull()] plt.figure(1) plt.scatter(df_filtered.age, df_filtered.weight, c=color, alpha=0.5) plt.xlim(0, df_filtered.age.max() + 5) plt.ylim(0, df_filtered.weight.max() + 5) plt.title('Weight vs. Age') plt.xlabel('Age (year)') plt.ylabel('Weight (kg)') plt.tight_layout() plt.savefig('../doc/figures/subject-data/weight-vs-age-plot.svg',