def get_data(key):
    """Collect RLOS values of the rows in ``all_data`` matching a triplet.

    A row contributes ``int(row[5])`` when all of the following hold:
    ``row[14]`` (service line) is not None; ``row[2]`` (admit date),
    ``row[5]`` (RLOS) and ``row[8]`` are non-empty; the triplet
    ``(int(row[10]), fp.split_age(int(row[9])), row[14])`` equals the
    requested one; and ``int(row[8])`` is at most 2.

    Args:
        key: ``(sex, age, sline)`` triple, passed as a single positional
            argument exactly as before.

    Returns:
        List of int RLOS values in the order the rows appear in ``all_data``.
    """
    # The signature used to unpack the tuple in the parameter list, which is
    # Python 2-only syntax; unpacking here keeps every existing call working.
    sex, age, sline = key
    wanted = (sex, age, sline)

    result = []
    for row in all_data:
        admit_date = row[2]
        # Converted before the emptiness checks, as before, so a non-numeric
        # age or sex still raises regardless of the other fields.
        row_age = fp.split_age(int(row[9]))
        row_sex = int(row[10])
        row_sline = row[14]
        soi = row[8]
        rlos = row[5]

        if row_sline is None:
            continue
        if not admit_date or not rlos or not soi:
            continue
        if wanted != (row_sex, row_age, row_sline):
            continue
        if int(soi) > 2:
            continue

        # NOTE(review): the parsed value was never used (it was bound to a
        # local named ``datetime``). The call is kept only in case
        # fp.parse_datetime rejects malformed dates by raising -- confirm,
        # and drop the call if it has no such effect.
        fp.parse_datetime(admit_date)
        result.append(int(rlos))
    return result
def print_freq(data):
    """Print the share of each ``fp.split_age`` category found in *data*.

    Every value is mapped to a category with ``fp.split_age``. The categories
    are written in ascending order as ``"<category>: <share>"`` cells on one
    space-separated line, each share being the category count divided by
    ``len(data)`` and rounded to two decimals.
    """
    total = float(len(data))

    counts = {}
    for value in data:
        category = fp.split_age(value)
        counts[category] = counts.get(category, 0) + 1

    cells = ["%d: %.2f" % (category, round(counts[category] / total, 2))
             for category in sorted(counts)]
    # One space between cells and a single newline at the end, also when
    # there are no cells at all.
    print(" ".join(cells))
def train_rlos(data, show_chart=False):
    """Fit one RLOS density estimator per (sex, age, sline) triplet.

    Rows are grouped by ``(int(row["sex"]), fp.split_age(int(row["age"])),
    row["sline"])`` and the ``int(row["rlos"])`` values of each group form its
    training set. Groups with fewer than ``training_threshold`` values (a name
    defined outside this function) are reported and skipped. A message is
    also printed for every row whose RLOS is zero; such rows are still used.

    Args:
        data: iterable of mappings with "sex", "age", "sline" and "rlos" keys.
        show_chart: when true, show for every trained group a histogram of
            the training values above a histogram of values sampled from the
            fitted estimator (at most 500 samples).

    Returns:
        Dict mapping each trained triplet to its fitted ``KernelDensity``
        (tophat kernel, bandwidth 0.5).
    """
    rlos_by_key = {}
    for record in data:
        sex = int(record["sex"])
        age = fp.split_age(int(record["age"]))
        sline = record["sline"]
        rlos = int(record["rlos"])
        if rlos == 0:
            print("RLOS equals zero for sex %d, age %d, SL %s" % (sex, age, sline))
        rlos_by_key.setdefault((sex, age, sline), []).append(rlos)

    estimators = {}
    for key, train_data in rlos_by_key.items():
        sex, age, sline = key
        if len(train_data) < training_threshold:
            print("Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " %
                  (training_threshold, sex, age, sline))
            continue

        # One observation per row, single feature column.
        column = np.array([train_data]).transpose()
        kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(column)
        estimators[key] = kde

        if not show_chart:
            continue

        # Draw as many samples as there are training values, capped at 500.
        drawn = kde.sample(min(len(train_data), 500)).tolist()
        samples = [round(point[0]) for point in drawn]

        # Histogram of the training data.
        plt.subplot(211)
        plt.title("RLOS train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
        plt.ylabel('freq')
        plt.xlabel('RLOS')
        plt.hist(train_data)

        # Histogram of values sampled from the estimated density.
        plt.subplot(212)
        plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
        plt.ylabel('freq')
        plt.xlabel('RLOS')
        plt.hist(samples)
        plt.show()
    return estimators
def train_admit_count(data, show_chart=False):
    """Fit one admittance-count density estimator per (sex, age, sline).

    Rows are grouped by ``(int(row["sex"]), fp.split_age(int(row["age"])),
    row["sline"])``. Within a group the rows are counted per distinct
    ``row["admit"]`` value, and those counts form the training set. Groups
    with fewer than ``training_threshold`` counts (a name defined outside
    this function) are reported and skipped.

    Args:
        data: iterable of mappings with "sex", "age", "sline" and "admit" keys.
        show_chart: when true, show for every trained group a histogram of
            the training counts above a histogram of counts sampled from the
            fitted estimator (at most 500 samples).

    Returns:
        Dict mapping each trained triplet to its fitted ``KernelDensity``
        (tophat kernel, bandwidth 0.5).
    """
    admits_by_key = {}
    for record in data:
        sex = int(record["sex"])
        age = fp.split_age(int(record["age"]))
        sline = record["sline"]
        admit = record["admit"]
        per_admit = admits_by_key.setdefault((sex, age, sline), {})
        per_admit[admit] = per_admit.get(admit, 0) + 1

    estimators = {}
    for key, per_admit in admits_by_key.items():
        sex, age, sline = key
        train_data = list(per_admit.values())
        if len(train_data) < training_threshold:
            print("Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " %
                  (training_threshold, sex, age, sline))
            continue

        # One observation per row, single feature column.
        column = np.array([train_data]).transpose()
        kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(column)
        estimators[key] = kde

        if not show_chart:
            continue

        # Draw as many samples as there are training values, capped at 500.
        drawn = kde.sample(min(len(train_data), 500)).tolist()
        samples = [int(round(point[0])) for point in drawn]

        # Histogram of the training data.
        plt.subplot(211)
        plt.title("Admit count train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
        plt.ylabel('freq')
        plt.xlabel('admittance count')
        plt.hist(train_data)

        # Histogram of values sampled from the estimated density.
        plt.subplot(212)
        plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
        plt.ylabel('freq')
        plt.xlabel('admittance count')
        plt.hist(samples)
        plt.show()
    return estimators
def _average_rlos_freqs(counts):
    """Average the per-dataset row counts for every (key, RLOS) pair.

    ``counts`` maps key -> RLOS -> dataset id -> number of rows. The result
    maps key -> RLOS -> mean number of rows over the dataset ids that have
    at least one row with that RLOS.
    """
    averaged = {}
    for key, by_rlos in counts.items():
        per_rlos = {}
        for rlos, by_dataset in by_rlos.items():
            per_rlos[rlos] = sum(by_dataset.values()) / float(len(by_dataset))
        averaged[key] = per_rlos
    return averaged


def build_chart(generated_data):
    """Plot the distances between averaged model and historic RLOS frequencies.

    Rows are grouped by ``(row[2], fp.split_age(row[3]), row[4])`` (sex, age
    category, service line). A row whose id (``row[0]``) starts with ``'M'``
    counts as model data and any other row as historic data. For each group
    the number of rows per RLOS value (``row[5]``) is averaged over the
    dataset ids, the model and historic averages are compared with
    ``calculate_distance``, and the sorted distances are plotted.

    Groups that exist only in the model data are reported and left out of
    the chart. Previously the lookup of their missing history raised
    ``KeyError`` right after the message was printed.

    Args:
        generated_data: iterable of rows indexable as
            ``(id, <unused>, sex, age, sline, rlos)``.
    """
    model_counts = {}
    history_counts = {}
    for row in generated_data:
        dataset_id = row[0]
        sex = row[2]
        age = row[3]
        sline = row[4]
        rlos = row[5]
        key = (sex, fp.split_age(age), sline)

        target = model_counts if dataset_id[0] == 'M' else history_counts
        by_dataset = target.setdefault(key, {}).setdefault(rlos, {})
        by_dataset[dataset_id] = by_dataset.get(dataset_id, 0) + 1

    avg_model = _average_rlos_freqs(model_counts)
    avg_history = _average_rlos_freqs(history_counts)

    plot_data = {}
    for key, model_freqs in avg_model.items():
        if key not in avg_history:
            print("Cannot find history data to compare with model for sex: %d, age %d, sline %s" % key)
            # Nothing to compare against: skip instead of failing below.
            continue
        plot_data[key] = calculate_distance(model_freqs, avg_history[key])

    plt.title("Difference between average modeled and historic data")
    plt.plot(sorted(plot_data.values()), 'ro')
    plt.show()
def predict_patient_flow(ages_estimator, admit_count_estimator, rlos_estimator, day_patients_prob,
                         model_count=1, history_count=1, sline_list=None, days=30):
    """Generate historic and modelled patient rows for every known triplet.

    For each service line, each sex in ``[2, 3]`` and each age category in
    ``[2, 3, 4, 5]`` the function appends ``history_count`` batches of
    ``historic_data((sex, age, sline), days)`` and then simulates
    ``model_count`` model runs of ``days`` days each. Triplets missing from
    any of the three triplet-keyed estimators are reported and skipped.

    Args:
        ages_estimator: mapping sline -> callable; ``callable(500)`` yields
            ages, of which only those in the requested age category are used.
        admit_count_estimator: mapping (sex, age, sline) -> estimator whose
            ``sample(n).tolist()`` rows hold the admittance count in item 0.
        rlos_estimator: mapping (sex, age, sline) -> estimator whose
            ``sample(n).tolist()`` rows hold the RLOS in item 0.
        day_patients_prob: mapping (sex, age, sline) -> probability that a
            simulated day has admittances at all.
        model_count: number of model runs per triplet.
        history_count: number of historic batches per triplet.
        sline_list: service lines to process. When None, every key of
            ``ages_estimator`` that occurs in at least one triplet of each of
            the other three mappings is used.
        days: number of simulated days per run.

    Returns:
        List containing the rows returned by ``historic_data`` followed, per
        triplet, by model rows ``(id, day, sex, age, sline, rlos)``, where
        ``id`` is ``"M<run> (<sex>, <age>, <sline>)"`` and ``day`` is the
        1-based day number as a string.
    """
    def covers(estimator, wanted_sline):
        # True when any (sex, age, sline) key of the mapping uses the line.
        return any(sline == wanted_sline for _sex, _age, sline in estimator.keys())

    if sline_list is None:
        sline_list = [
            common_sline for common_sline in ages_estimator.keys()
            if covers(admit_count_estimator, common_sline)
            and covers(rlos_estimator, common_sline)
            and covers(day_patients_prob, common_sline)
        ]

    # Running number used to build the identifier of each model run.
    model_index = 1
    result = []
    for sline in sline_list:
        for sex in [2, 3]:
            for age in [2, 3, 4, 5]:
                key = (sex, age, sline)
                if key not in admit_count_estimator \
                        or key not in rlos_estimator \
                        or key not in day_patients_prob:
                    print("Cannot find all estimations for sex %d, age %d, SL %s" % key)
                    continue

                # Historic data.
                for _ in xrange(history_count):
                    result.extend(historic_data(key, days))

                # Samplers for this triplet; each call draws a fresh pool.
                def draw_rlos():
                    return [int(round(v[0]))
                            for v in rlos_estimator[key].sample(100).tolist()]

                def draw_ages():
                    return [a for a in ages_estimator[sline](500)
                            if fp.split_age(a) == age]

                def draw_admit_counts():
                    return [int(round(v[0]))
                            for v in admit_count_estimator[key].sample(100).tolist()]

                day_prob = day_patients_prob[key]

                # Modelled patient flow.
                for _ in xrange(model_count):
                    dataset_id = "M%02d (%d, %d, %s)" % (model_index, sex, age, sline)
                    rlos_flow = draw_rlos()
                    age_flow = recall_if_empty(draw_ages)
                    admit_flow = draw_admit_counts()

                    for iday in xrange(days):
                        # The random draw is skipped when the day is certain.
                        if not (day_prob == 1.0 or random.random() <= day_prob):
                            continue

                        pat_count = admit_flow.pop()
                        if not admit_flow:
                            admit_flow = draw_admit_counts()

                        for _patient in xrange(pat_count):
                            result.append(
                                (dataset_id, str(iday + 1), sex, age_flow.pop(),
                                 sline, rlos_flow.pop()))
                            if not rlos_flow:
                                rlos_flow = draw_rlos()
                            if not age_flow:
                                # The age filter can leave nothing, so refill
                                # the same way as the initial fill. The bare
                                # sampler used here before could hand back an
                                # empty pool and make the next pop() fail.
                                # NOTE(review): presumably recall_if_empty
                                # retries until the pool is non-empty --
                                # confirm against its definition.
                                age_flow = recall_if_empty(draw_ages)
                    model_index += 1
    return result
alert_count = 50 raw_data, missed_drg = fp.load_data_with_sline() data = fp.change_to_dict(fp.filter_incomplete_data(raw_data)) print "Rows with following DRG were skipped:", print missed_drg print "Filter out %d of %d" % (len(raw_data) - len(data), len(raw_data)) def history(all_data, (sex, age, sline), days=30): """Return historical data for selected combination of (sex, age, sline)""" start_date = None end_date = None hist_data = {} for row in all_data: admit_date = row[2] agef_in_years = int(row[9]) agef = fp.split_age(agef_in_years) sexf = int(row[10]) slinef = row[14] rlos = row[5] if slinef is None: continue if len(admit_date) == 0: continue if len(rlos) == 0: continue if (sex, age, sline) != (sexf, agef, slinef): continue datetime = fp.parse_datetime(admit_date)