def test(zpoints, data, cdf, rvs, ddof = 1): obs_freq = fp.calculate_data_freq(zpoints, data) exp_prob = fp.calculate_exp_prob(zpoints, cdf) exp_freq = fp.convert_prob_2_freq(exp_prob, len(data)) print "Observations freq", obs_freq print "Expected freq", exp_freq print "Run Pearson test" chisq, p = stats.chisquare(obs_freq, exp_freq, ddof) print "p", p print "chisq", chisq chi2val = stats.chi2.ppf(0.95, len(obs_freq) - 1 - ddof) print "chi2 border value", chi2val print "H0 is accepted" if chisq < chi2val else "H0 is rejected " obs_data = data obs_data.sort() rand_data = rvs(len(obs_data)) rand_data = [x if x > 1 else 1 for x in rand_data] rand_data.sort() x_values = xrange(len(obs_data)) plt.ylabel("age") plt.plot(obs_data, 'b-') plt.plot(rand_data, 'ro') plt.show()
def get_data((sex, age, sline)): result = [] for row in all_data: admit_date = row[2] agef = fp.split_age(int(row[9])) sexf = int(row[10]) slinef = row[14] soif = row[8] rlos = row[5] if slinef is None: continue if len(admit_date) == 0: continue if len(rlos) == 0: continue if len(soif) == 0: continue if (sex, age, sline) != (sexf, agef, slinef): continue if int(soif) > 2: continue datetime = fp.parse_datetime(admit_date) result.append(int(rlos)) return result
def print_freq(data): freq = {} length = float(len(data)) for x in data: xcat = fp.split_age(x) freq.setdefault(xcat, 0) freq[xcat] += 1 for x in sorted(freq.keys()): print "%d: %.2f" % (x, round(freq[x]/length, 2)), print
def train_rlos(data, show_chart=False): """Train LOS estimator""" """Train patient LOS for triplet (sex, age, sline)""" freq = {} for row in data: sex = int(row["sex"]) age = fp.split_age(int(row["age"])) sline = row["sline"] rlos = int(row["rlos"]) if rlos == 0: print "RLOS equals zero for sex %d, age %d, SL %s" % (sex, age, sline) tuple = (sex, age, sline) freq.setdefault(tuple, []) freq[tuple].append(rlos) result = {} for tuple, train_data in freq.items(): (sex, age, sline) = tuple if len(train_data) < training_threshold: print "Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " % \ (training_threshold, sex, age, sline) continue X = np.array([train_data]).transpose() kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(X) kdef = lambda size: [round(l[0]) for l in kde.sample(size).tolist()] result[tuple] = kde if show_chart: # print "Sex=%d, Age=%d, SL=%s" % (sex, age, sline) # print_freq(ages) samples = kdef(len(train_data)) if len(train_data) < 500 else kdef(500) # print_freq(samples) # hist for train data plt.subplot(211) plt.title("RLOS train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline)) plt.ylabel('freq') plt.xlabel('RLOS') plt.hist(train_data) # estimated density plt.subplot(212) plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline)) plt.ylabel('freq') plt.xlabel('RLOS') plt.hist(samples) plt.show() return result
def train_admit_count(data, show_chart=False): """Train patient admittance number for triplet (sex, age, sline)""" freq = {} for row in data: sex = int(row["sex"]) age = fp.split_age(int(row["age"])) sline = row["sline"] admit = row["admit"] tuple = (sex, age, sline) freq.setdefault(tuple, {}) freq[tuple].setdefault(admit, 0) freq[tuple][admit] += 1 result = {} for tuple, days in freq.items(): (sex, age, sline) = tuple train_data = days.values() if len(train_data) < training_threshold: print "Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " % \ (training_threshold, sex, age, sline) continue X = np.array([train_data]).transpose() kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(X) kdef = lambda size: [int(round(l[0])) for l in kde.sample(size).tolist()] result[tuple] = kde if show_chart: # print "Sex=%d, Age=%d, SL=%s" % (sex, age, sline) # print_freq(ages) samples = kdef(len(train_data)) if len(train_data) < 500 else kdef(500) # print_freq(samples) # hist for train data plt.subplot(211) plt.title("Admit count train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline)) plt.ylabel('freq') plt.xlabel('admittance count') plt.hist(train_data) # estimated density plt.subplot(212) plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline)) plt.ylabel('freq') plt.xlabel('admittance count') plt.hist(samples) plt.show() return result
def build_chart(generated_data): """Builds charts of freq differences between average model and historical data""" freqs_model = {} freqs_history = {} for row in generated_data: id = row[0] sex = row[2] age = row[3] sline = row[4] rlos = row[5] tuple = (sex, fp.split_age(age), sline) if id[0] == 'M': freqs_model.setdefault(tuple, {}) freqs_model[tuple].setdefault(rlos, {}) freqs_model[tuple][rlos].setdefault(id, 0) freqs_model[tuple][rlos][id] += 1 else: freqs_history.setdefault(tuple, {}) freqs_history[tuple].setdefault(rlos, {}) freqs_history[tuple][rlos].setdefault(id, 0) freqs_history[tuple][rlos][id] += 1 # calculate average freqs freqs_avg_model = {} freqs_avg_history = {} for tuple in freqs_model.keys(): rt = {} for rlos in freqs_model[tuple].keys(): d = freqs_model[tuple][rlos] rt[rlos] = sum(d.values()) / float(len(d)) freqs_avg_model[tuple] = rt for tuple in freqs_history.keys(): rt = {} for rlos in freqs_history[tuple].keys(): d = freqs_history[tuple][rlos] rt[rlos] = sum(d.values()) / float(len(d)) freqs_avg_history[tuple] = rt plot_data = {} for tuple in freqs_avg_model.keys(): fm = freqs_avg_model[tuple] if tuple not in freqs_avg_history: print "Cannot find history data to compare with model for sex: %d, age %d, sline %s" % tuple fh = freqs_avg_history[tuple] plot_data[tuple] = calculate_distance(fm, fh) plt.title("Difference between average modeled and historic data") plt.plot(sorted(plot_data.values()), 'ro') plt.show()
__author__ = 'Andrew' import FakePatients as fp import time import matplotlib.pyplot as plt import scipy.stats as stats import math from FakePatients import split_age all_data = fp.load_data_with_sline() # Here are sline which are presented almost every day (at least 360 days in year ) fit_sline = ['276', '070', '090', '390', '274', '135', '129', '250', '050', '255', '280', '283', '145', '065', '085' , '245', '262', '267', '125', '387', '165', '132', '296'] YEAR = 2012 FULL_YEARS = [2011, 2012, 2013] def show_sline_freq(year=YEAR): """Prints sline counts for a year""" sline_freq = {} for tuple in all_data: sline = tuple[14] if sline is None: continue admit_date = tuple[2] if len(admit_date) == 0: continue datetime = time.strptime(admit_date, "%Y-%m-%d") if int(datetime.tm_year) != year: continue
__author__ = 'Andrew'

import FakePatients as fp
import matplotlib.pyplot as plt
import math
import scipy.stats as stats

# Script: describe the values of the fourth season returned by
# fp.get_season_data (labelled "Autumn" in the output) and compute the
# probabilities expected under a normal distribution fitted to them.
print "Autumn"

# Only values strictly above this threshold are analysed.
low_filter = 180
# Passed to the fp frequency/probability helpers
# (presumably the interval boundaries -- confirm against FakePatients).
zpoints = [216, 238, 261]

workdays, holidays = fp.load_data()
print "Workdays loaded:", len(workdays)
print "Holidays loaded:", len(holidays)
print

# Only the workday series is split into seasons; s4 is the one used below.
s1, s2, s3, s4 = fp.get_season_data(workdays)
#fp.plot_seasons_data(s1, s2, s3, s4)

alldata = s4
data = [x for x in alldata if x > low_filter]

obs_freq = fp.calculate_data_freq(zpoints, data)
print "Observations freq", obs_freq

# NOTE(review): this unpacking rebinds the builtins `min` and `max` at module
# level for the rest of the script.
nobs, (min, max), mean, variance, s, k = stats.describe(data)
std = math.sqrt(variance)
print "Nobs", nobs
print "Mean", mean
print "Variance", variance
print

# Expected probabilities under a normal CDF with the sample mean and the
# standard deviation derived from the sample variance.
exp_prob = fp.calculate_exp_prob(zpoints, lambda x: stats.norm.cdf(x, mean, std))
def predict_patient_flow(ages_estimator, admit_count_estimator, rlos_estimator, day_patients_prob, model_count=1, history_count=1, sline_list=None, days=30): if sline_list is None: sline_list = [] for common_sline in ages_estimator.keys(): found = False for sex, age, sline in admit_count_estimator.keys(): if sline == common_sline: found = True break if not found: continue found = False for sex, age, sline in rlos_estimator.keys(): if sline == common_sline: found = True break if not found: continue found = False for sex, age, sline in day_patients_prob.keys(): if sline == common_sline: found = True break if not found: continue sline_list.append(sline) # dataset indexes to make dataset identifiers model_index = 1 history_index = 1 result = [] for sline in sline_list: for sex in [2, 3]: for age in [2, 3, 4, 5]: tuple = (sex, age, sline) if tuple not in admit_count_estimator \ or tuple not in rlos_estimator \ or tuple not in day_patients_prob: print "Cannot find all estimations for sex %d, age %d, SL %s" % tuple continue # add historic data for it in xrange(history_count): result.extend(historic_data(tuple, days)) history_index += 1 # model patient flow for it in xrange(model_count): rlos_flow_func = lambda: [int(round(l[0])) for l in rlos_estimator[tuple].sample(100).tolist()] rlos_flow = rlos_flow_func() age_flow_func = lambda: [a for a in ages_estimator[sline](500) if fp.split_age(a) == age] age_flow = recall_if_empty(age_flow_func) admit_count_func = lambda: [int(round(l[0])) for l in admit_count_estimator[tuple].sample(100).tolist()] admit_flow = admit_count_func() for iday in xrange(days): if day_patients_prob[tuple] == 1.0 or random.random() <= day_patients_prob[tuple]: pat_count = admit_flow.pop() if len(admit_flow) == 0: admit_flow = admit_count_func() for p in xrange(pat_count): id = "M%02d (%d, %d, %s)" % (model_index, sex, age, sline) result.append( (id, str(iday+1), sex, age_flow.pop(), sline, rlos_flow.pop())) if len(rlos_flow) == 0: rlos_flow = 
rlos_flow_func() if len(age_flow) == 0: age_flow = age_flow_func() model_index += 1 return result
def calc_day_patients_prob():
    """Return ``fp.get_patients_freq`` applied to the module-level ``raw_data``.

    NOTE(review): judging by the name, the result is the per-day admittance
    probability table -- confirm against FakePatients.
    """
    patients_freq = fp.get_patients_freq(raw_data)
    return patients_freq
__author__ = 'Andrew' import FakePatients as fp import matplotlib.pyplot as plt from sklearn.neighbors.kde import KernelDensity import numpy as np import random import csv from datetime import timedelta import math training_threshold = 10 alert_count = 50 raw_data, missed_drg = fp.load_data_with_sline() data = fp.change_to_dict(fp.filter_incomplete_data(raw_data)) print "Rows with following DRG were skipped:", print missed_drg print "Filter out %d of %d" % (len(raw_data) - len(data), len(raw_data)) def history(all_data, (sex, age, sline), days=30): """Return historical data for selected combination of (sex, age, sline)""" start_date = None end_date = None hist_data = {} for row in all_data: admit_date = row[2] agef_in_years = int(row[9]) agef = fp.split_age(agef_in_years) sexf = int(row[10]) slinef = row[14] rlos = row[5]