def main():
    """
    Bootstrap the update process by wrapping the initialization and
    termination of logging and database access. Errors raised by tasks are
    caught here and logged, and the script is immediately killed.
    """
    initialize_logging('../logs/update.log')
    logger = logging.getLogger()
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')

    tasks = [augment_weather_instances]
    for task in tasks:
        try:
            task_name = task.__name__.replace('_', ' ')
            logger.info('Starting task: {}'.format(task_name))
            task(session)
        except KeyboardInterrupt:
            print()
            logger.info('Terminating update ... ')
            break
        except Exception as error:
            logger.error('{}: {}'.format(type(error).__name__, error))
            break

    logging.shutdown()  # Flush files
    database.terminate(engine, session)
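# Each task is a callable that accepts the SQLAlchemy session as its only
# argument; `main` handles logging and error recovery around it. A minimal
# sketch of an additional task (the name `log_incident_count` and its body are
# hypothetical; only the `task(session)` calling convention comes from `main`,
# and `Incident` would need to be imported from `database.models`):
def log_incident_count(session):
    """Example task: report how many incidents the database currently holds."""
    count = session.query(Incident).count()
    logging.getLogger().info('Incident count: {}'.format(count))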
def execute():
    warnings.filterwarnings("ignore")
    initialize_logging("../logs/merge.log", "a+")
    logger = logging.getLogger()
    engine, session = database.initialize("sqlite:///../data/isrid-master.db")

    with open("../data/mappings.yaml") as mappings_file:
        mappings = yaml.load(mappings_file.read())

    for filename in os.listdir("../data/"):
        if filename.endswith(".xlsx"):
            for title, rows in read_excel(os.path.join("../data/", filename)):
                procedure = Registry.retrieve(filename, title)
                procedure = procedure or Registry.retrieve(filename)
                mapping = mappings.get(filename, {}).get(title, {})

                if procedure:
                    message = "Merging '{}' from '{}' ... "
                    logger.info(message.format(title, filename))

                    labels = list(next(rows))
                    if labels.count("Equipment4") > 1:
                        # Rename the last duplicate "Equipment4" column so the
                        # labels stay unique
                        index = labels[::-1].index("Equipment4")
                        labels[-index - 1] = "Equipment5"

                    for index, row in enumerate(rows):
                        labeled_row = dict(zip(labels, row))
                        for model in procedure(index, labeled_row, mapping):
                            session.add(model)

                    session.commit()

    logging.shutdown()
    database.terminate(engine, session)
def read_time_data(url):
    """
    Read the time of day each incident occurred at.

    Arguments:
        url: A string representing the path to the database.

    Returns:
        A `pandas` dataframe with two columns: `time` and `hour`. `time`
        contains Python `datetime.time` objects, with times at midnight
        filtered out (most of these indicate a date was available, but not a
        time). `hour` is `time` expressed in hours (a float between 0 and 24,
        exclusive). `time` is derived from `Incident.datetime`.
    """
    engine, session = database.initialize(url)
    df = tabulate(session.query(Incident.datetime))
    database.terminate(engine, session)

    df = df.assign(time=[datetime.time() for datetime in df.datetime])
    df = df[df.time != datetime.time(0)]
    df = df.assign(hour=[time.hour + time.minute/60 + time.second/3600
                         for time in df.time])
    return df
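# Usage sketch (assumes the relative path to the master ISRID database used by
# the other scripts in this repository):
if __name__ == '__main__':
    times = read_time_data('sqlite:///../data/isrid-master.db')
    print(times.hour.describe())  # Summary of the hour-of-day distribution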
def read_data(url):
    """Read incident duration, survival, and category data into a dataframe."""
    engine, session = database.initialize(url)
    query = session.query(Incident.total_hours, Subject.survived,
                          Group.category).join(Group, Subject)
    df = tabulate(query)
    database.terminate(engine, session)
    return df
def read_time_data(url):
    engine, session = database.initialize(url)
    df = tabulate(session.query(Incident.datetime))
    database.terminate(engine, session)

    df = df.assign(time=[datetime.time() for datetime in df.datetime])
    df = df[df.time != datetime.time(0)]
    df = df.assign(hour=[time.hour + time.minute/60 + time.second/3600
                         for time in df.time])
    return df
def read_simple_data(url, exclude_singles=False, exclude_groups=False):
    """
    Read incident duration, survival, and category data. A useful shorthand.

    Arguments:
        url: A string representing the database URL to connect to.
        exclude_singles: A boolean indicating whether the query should exclude
            subjects from groups with exactly one member.
        exclude_groups: A boolean indicating whether the query should exclude
            subjects from groups with more than one member.

    Returns:
        A pandas dataframe containing the lost person data. The columns
        include `total_hours`, `survived`, `category`, `days` (the incident
        duration in days, as taken from `total_hours`), and `doa` (a boolean
        that is `True` if the subject did not survive). Cases with a negative
        timedelta `Incident.total_hours` are filtered out.

    Warning:
        If `exclude_singles` or `exclude_groups` is `True`, the function also
        needs to query the size of each `Group`, which may take a while
        (perhaps a minute).
    """
    engine, session = database.initialize(url)
    columns = Incident.total_hours, Subject.survived, Group.category, Group.id
    query = session.query(*columns).join(Group, Subject)
    df = tabulate(query)
    database.terminate(engine, session)

    if exclude_singles or exclude_groups:
        df['size'] = [Group.query.get(int(id_)).size for id_ in df.id]  # Hack
    if exclude_singles:
        df = df[df['size'] > 1]
    if exclude_groups:
        df = df[df['size'] == 1]
    if 'size' in df:
        df.drop('size', 1, inplace=True)
    df.drop('id', 1, inplace=True)

    df = df.assign(days=[total_hours.total_seconds()/3600/24
                         for total_hours in df.total_hours],
                   doa=[not survived for survived in df.survived])
    df = df[0 <= df.days]
    return df
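# Usage sketch: compare survival between single subjects and group members
# (the database URL matches the one used elsewhere in this repository; note
# the warning above about the extra `Group` size queries being slow):
if __name__ == '__main__':
    url = 'sqlite:///../data/isrid-master.db'
    singles = read_simple_data(url, exclude_groups=True)
    groups = read_simple_data(url, exclude_singles=True)
    print('Survival rate (singles): {:.1%}'.format(singles.survived.mean()))
    print('Survival rate (groups):  {:.1%}'.format(groups.survived.mean()))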
def read_data(url, *columns, not_null=True):
    engine, session = database.initialize(url)
    query = session.query(*columns).join(Group, Incident)
    if not_null:
        # SQLAlchemy needs `!= None` (not `is not None`) to emit IS NOT NULL
        query = query.filter(*map(lambda column: column != None, columns))

    data = pd.DataFrame()
    for column in columns:
        name, datatype = str(column).split(".")[-1], column.type.python_type
        values = (value for value, *empty in query.from_self(column))
        if datatype == datetime.timedelta:
            # Convert timedeltas to a float number of hours
            datatype = float
            values = map(lambda value: value.total_seconds() / 3600, values)
        data[name] = np.fromiter(values, np.dtype(datatype))

    database.terminate(engine, session)
    return data
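# Usage sketch: fetch incident durations (converted from timedeltas to hours,
# as handled above) alongside group sizes. The columns and URL here are only
# examples; any ORM columns reachable through the `Group`/`Incident` join with
# numpy-compatible Python types should work the same way.
if __name__ == '__main__':
    df = read_data('sqlite:///../data/isrid-master.db',
                   Incident.total_hours, Group.size)
    print(df.describe())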
def loop():
    """
    Read and evaluate expressions provided by the user.
    """
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')
    print('Shell initialized at: {}'.format(datetime.datetime.now()))
    cmd = 1  # You can change your prompt to include the command number

    while True:
        try:
            expression = input('[!] ').strip()
            if len(expression) == 0:
                continue
            print(' =>', eval(expression))
        except (KeyboardInterrupt, EOFError):
            print()
            break
        except Exception as error:
            print(' => {}: {}'.format(type(error).__name__, error))
        finally:
            cmd += 1

    database.terminate(engine, session)  # Cleanly shut down SQLAlchemy
def execute():
    initialize_logging('../logs/update.log')
    logger = logging.getLogger()
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')

    tasks = [augment_weather_instances]
    for task in tasks:
        try:
            task_name = task.__name__.replace('_', ' ')
            logger.info('Starting task: {}'.format(task_name))
            task(session)
        except KeyboardInterrupt:
            print()
            logger.info('Terminating update ... ')
            break
        except Exception as error:
            logger.error('{}: {}'.format(type(error).__name__, error))
            break

    logging.shutdown()
    database.terminate(engine, session)
def tearDown(self):
    database.terminate(self.engine, self.session)
#!/usr/bin/env python3

import matplotlib.pyplot as plt
import numpy as np
import Orange
from pomegranate import *

import database
from database.models import Subject, Group, Incident, Weather
from database.processing import survival_rate, tabulate, export_as_orange

engine, session = database.initialize('sqlite:///../data/isrid-master.db')

query = session.query(Subject.survived, Group.size, Weather.avg_temp)
query = query.join(Group, Incident, Weather)
df = tabulate(query, [True, True, True])

database.terminate(engine, session)

# Fraction of tabulated subjects who survived
print(sum(df.survived)/len(df))
def terminate_session(error):
    if database_initialized():
        app.logger.info('Database terminated')
        database.terminate(g.engine, g.session)
def execute():
    matplotlib.rc("font", size=20)
    engine, session = database.initialize("sqlite:///../data/isrid-master.db")

    # Querying Group.size may take a while; the cause is unclear
    query = session.query(Incident.total_hours, Subject.survived,
                          Group.category, Group.size).join(Group, Subject)
    print("Tabulating query ... this may take a while.")
    df = tabulate(query)
    print("Done tabulating.")
    print(df.describe())
    database.terminate(engine, session)

    df = df.assign(
        days=[total_hours.total_seconds()/3600/24
              for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))
    categories = Counter(df.category)
    plot = 0
    kmfs = []
    options = {"show_censors": True,
               "censor_styles": {"marker": "|", "ms": 6},
               "censor_ci_force_lines": False}

    for category, count in categories.most_common()[:rows * columns]:
        print("Category:", category)
        ax = axes[plot // columns, plot % columns]
        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100*(1 - Ndoa/N)

        # Use bracket indexing: `df_.size` is the DataFrame's element count,
        # not the `size` column
        grp = df_[df_['size'] > 1]
        sng = df_[df_['size'] == 1]

        kmf = KaplanMeierFitter()
        # kmf.fit(df_.days, event_observed=df_.doa, label=category)
        # kmf.plot(ax=ax, ci_force_lines=True)
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        kmfs.append(kmf)

        ax.set_xlim(0, min(30, 1.05*ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(
            category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")
        # ax.legend_.remove()
        # ax.grid(True)
        plot += 1

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg",
                 transparent=True)

    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs[:rows * columns]:
        kmf.plot(ci_show=False, show_censors=True,
                 censor_styles={"marker": "|", "ms": 6}, ax=ax)
    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg",
                     transparent=True)

    plt.show()
def main():
    """
    Plot the profile (size and survival rate) of the most common categories.
    """
    ## Read data
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')
    query = session.query(Subject.age, Group.category, Subject.survived)
    query = query.join(Group)
    df = tabulate(query)
    database.terminate(engine, session)

    ## Process subjects by category and age
    selected_categories = df.category.value_counts()[:10].index.tolist()
    df = df[df.category.isin(selected_categories)]

    age_bins = np.linspace(0, 100, 11)
    survival_rates = np.full((10, 10), np.nan, dtype=np.float64)
    subgroup_sizes = np.full((10, 10), 0, dtype=np.float64)
    min_subgroup_size = 10

    for category, group in df.groupby('category'):
        group.insert(len(group.columns), 'age_bin',
                     np.digitize(group.age, age_bins))
        for age_index, subgroup in group.groupby('age_bin'):
            survivals = subgroup.survived.values.tolist()
            key = age_index - 1, selected_categories.index(category)
            if len(survivals) > min_subgroup_size:
                survival_rates[key] = sum(survivals)/len(survivals)
                subgroup_sizes[key] = len(survivals)

                # Debugging
                lower, upper = age_bins[age_index - 1], age_bins[age_index]
                print('{}, {} - {} years old'.format(category, int(lower),
                                                     int(upper)))
                print('  Survival rate: {:.3f}%'.format(
                    100*survival_rates[key]))
                print('  Number of subjects: {}'.format(
                    int(subgroup_sizes[key])))

    ## Plot survival rates and subgroup sizes
    canvas = plt.matshow(survival_rates, fignum=False, cmap='RdYlGn',
                         origin='lower')
    colorbar = plt.colorbar(canvas)
    colorbar.solids.set_edgecolor('face')
    colorbar.set_label('Survival Rate')

    x_positions = y_positions = np.arange(0, 10)
    for x in x_positions:
        for y in y_positions:
            plt.text(x, y, int(subgroup_sizes[y, x]) or '',
                     horizontalalignment='center',
                     verticalalignment='center')

    plt.title('Lost Person Category Profiles')
    plt.ylabel('Age (years)')
    plt.xlabel('Category')

    ax = plt.gca()
    ax.xaxis.tick_bottom()
    plt.yticks(np.linspace(0, 10, 11) - 0.5, age_bins.astype(int))
    plt.xticks(x_positions, selected_categories, rotation=60)
    plt.subplots_adjust(bottom=0.2)
    plt.tight_layout()

    plt.savefig('../doc/figures/subject-data/category-profiles.svg',
                transparent=True)
    plt.show()