예제 #1
0
파일: Age.py 프로젝트: andrewshir/CollIntel
def test(zpoints, data, cdf, rvs, ddof = 1):
    obs_freq = fp.calculate_data_freq(zpoints, data)
    exp_prob = fp.calculate_exp_prob(zpoints, cdf)
    exp_freq = fp.convert_prob_2_freq(exp_prob, len(data))
    print "Observations freq", obs_freq
    print "Expected freq", exp_freq
    print "Run Pearson test"
    chisq, p = stats.chisquare(obs_freq, exp_freq, ddof)
    print "p", p
    print "chisq", chisq
    chi2val = stats.chi2.ppf(0.95, len(obs_freq) - 1 - ddof)
    print "chi2 border value", chi2val
    print "H0 is accepted" if chisq < chi2val else "H0 is rejected "

    obs_data = data
    obs_data.sort()
    rand_data = rvs(len(obs_data))
    rand_data = [x if x > 1 else 1 for x in rand_data]
    rand_data.sort()

    x_values = xrange(len(obs_data))
    plt.ylabel("age")
    plt.plot(obs_data, 'b-')
    plt.plot(rand_data, 'ro')
    plt.show()
예제 #2
0
파일: Los.py 프로젝트: andrewshir/CollIntel
def get_data((sex, age, sline)):
    result = []
    for row in all_data:
        admit_date = row[2]
        agef = fp.split_age(int(row[9]))
        sexf = int(row[10])
        slinef = row[14]
        soif = row[8]
        rlos = row[5]

        if slinef is None:
            continue
        if len(admit_date) == 0:
            continue
        if len(rlos) == 0:
            continue
        if len(soif) == 0:
            continue

        if (sex, age, sline) != (sexf, agef, slinef):
            continue

        if int(soif) > 2:
            continue
        datetime = fp.parse_datetime(admit_date)

        result.append(int(rlos))
    return result
예제 #3
0
 def print_freq(data):
     freq = {}
     length = float(len(data))
     for x in data:
         xcat = fp.split_age(x)
         freq.setdefault(xcat, 0)
         freq[xcat] += 1
     for x in sorted(freq.keys()):
         print "%d: %.2f" % (x, round(freq[x]/length, 2)),
     print
예제 #4
0
def train_rlos(data, show_chart=False):
    """Train LOS estimator"""
    """Train patient LOS for triplet (sex, age, sline)"""
    freq = {}
    for row in data:
        sex = int(row["sex"])
        age = fp.split_age(int(row["age"]))
        sline = row["sline"]
        rlos = int(row["rlos"])

        if rlos == 0:
            print "RLOS equals zero for sex %d, age %d, SL %s" % (sex, age, sline)

        tuple = (sex, age, sline)
        freq.setdefault(tuple, [])
        freq[tuple].append(rlos)

    result = {}
    for tuple, train_data in freq.items():
        (sex, age, sline) = tuple
        if len(train_data) < training_threshold:
            print "Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " % \
                  (training_threshold, sex, age, sline)
            continue

        X = np.array([train_data]).transpose()
        kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(X)
        kdef = lambda size: [round(l[0]) for l in kde.sample(size).tolist()]
        result[tuple] = kde

        if show_chart:
            # print "Sex=%d, Age=%d, SL=%s" % (sex, age, sline)
            # print_freq(ages)
            samples = kdef(len(train_data)) if len(train_data) < 500 else kdef(500)
            # print_freq(samples)

            # hist for train data
            plt.subplot(211)
            plt.title("RLOS train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('RLOS')
            plt.hist(train_data)

            # estimated density
            plt.subplot(212)
            plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('RLOS')
            plt.hist(samples)

            plt.show()

    return result
예제 #5
0
def train_admit_count(data, show_chart=False):
    """Train patient admittance number for triplet (sex, age, sline)"""
    freq = {}
    for row in data:
        sex = int(row["sex"])
        age = fp.split_age(int(row["age"]))
        sline = row["sline"]
        admit = row["admit"]

        tuple = (sex, age, sline)
        freq.setdefault(tuple, {})
        freq[tuple].setdefault(admit, 0)
        freq[tuple][admit] += 1

    result = {}
    for tuple, days in freq.items():
        (sex, age, sline) = tuple
        train_data = days.values()
        if len(train_data) < training_threshold:
            print "Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " % \
                  (training_threshold, sex, age, sline)
            continue

        X = np.array([train_data]).transpose()
        kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(X)
        kdef = lambda size: [int(round(l[0])) for l in kde.sample(size).tolist()]
        result[tuple] = kde

        if show_chart:
            # print "Sex=%d, Age=%d, SL=%s" % (sex, age, sline)
            # print_freq(ages)
            samples = kdef(len(train_data)) if len(train_data) < 500 else kdef(500)
            # print_freq(samples)

            # hist for train data
            plt.subplot(211)
            plt.title("Admit count train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('admittance count')
            plt.hist(train_data)

            # estimated density
            plt.subplot(212)
            plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('admittance count')
            plt.hist(samples)

            plt.show()

    return result
예제 #6
0
def build_chart(generated_data):
    """Builds charts of freq differences between average model and historical data"""
    freqs_model = {}
    freqs_history = {}
    for row in generated_data:
        id = row[0]
        sex = row[2]
        age = row[3]
        sline = row[4]
        rlos = row[5]
        tuple = (sex, fp.split_age(age), sline)

        if id[0] == 'M':
            freqs_model.setdefault(tuple, {})
            freqs_model[tuple].setdefault(rlos, {})
            freqs_model[tuple][rlos].setdefault(id, 0)
            freqs_model[tuple][rlos][id] += 1
        else:
            freqs_history.setdefault(tuple, {})
            freqs_history[tuple].setdefault(rlos, {})
            freqs_history[tuple][rlos].setdefault(id, 0)
            freqs_history[tuple][rlos][id] += 1

    # calculate average freqs
    freqs_avg_model = {}
    freqs_avg_history = {}
    for tuple in freqs_model.keys():
        rt = {}
        for rlos in freqs_model[tuple].keys():
            d = freqs_model[tuple][rlos]
            rt[rlos] = sum(d.values()) / float(len(d))
        freqs_avg_model[tuple] = rt
    for tuple in freqs_history.keys():
        rt = {}
        for rlos in freqs_history[tuple].keys():
            d = freqs_history[tuple][rlos]
            rt[rlos] = sum(d.values()) / float(len(d))
        freqs_avg_history[tuple] = rt

    plot_data = {}
    for tuple in freqs_avg_model.keys():
        fm = freqs_avg_model[tuple]
        if tuple not in freqs_avg_history:
            print "Cannot find history data to compare with model for sex: %d, age %d, sline %s" % tuple
        fh = freqs_avg_history[tuple]
        plot_data[tuple] = calculate_distance(fm, fh)

    plt.title("Difference between average modeled and historic data")
    plt.plot(sorted(plot_data.values()), 'ro')
    plt.show()
예제 #7
0
__author__ = 'Andrew'
import FakePatients as fp
import time
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
from FakePatients import split_age

# Rows returned by fp.load_data_with_sline(), loaded once at import time.
all_data = fp.load_data_with_sline()

# Service lines that are present almost every day (on at least 360 days of a year).
fit_sline = [
    '276', '070', '090', '390', '274', '135', '129', '250', '050', '255',
    '280', '283', '145', '065', '085', '245', '262', '267', '125', '387',
    '165', '132', '296',
]
# Default year used by show_sline_freq.
YEAR = 2012
FULL_YEARS = [2011, 2012, 2013]

def show_sline_freq(year=YEAR):
    """Prints sline counts for a year

    Args:
        year: calendar year a row's admit date must fall in to be considered.

    NOTE(review): the statements visible here only filter the rows of
    ``all_data``; ``sline_freq`` is created but not filled or printed in
    this part of the function -- confirm against the complete source.
    """
    sline_freq = {}
    for tuple in all_data:
        # Column 14 holds the service line; rows without one are ignored.
        sline = tuple[14]
        if sline is None:
            continue

        # Column 2 holds the admit date as text; empty dates are ignored.
        admit_date = tuple[2]
        if len(admit_date) == 0:
            continue

        # Keep only rows admitted in the requested year.
        datetime = time.strptime(admit_date, "%Y-%m-%d")
        if int(datetime.tm_year) != year:
            continue
예제 #8
0
__author__ = 'Andrew'
import FakePatients as fp
import matplotlib.pyplot as plt
import math
import scipy.stats as stats

# Script: compare the observed frequencies of one season's data with the
# probabilities expected under a normal distribution fitted to that data.
print "Autumn"
# Only values strictly greater than this are kept for the analysis.
low_filter = 180
# Points handed to the fp frequency/probability helpers
# (presumably interval boundaries -- confirm against fp).
zpoints = [216, 238, 261]

workdays, holidays = fp.load_data()
print "Workdays loaded:", len(workdays)
print "Holidays loaded:", len(holidays)
print

s1, s2, s3, s4 = fp.get_season_data(workdays)

# Optional plot of the four seasons, currently disabled.
#fp.plot_seasons_data(s1, s2, s3, s4)
# The fourth season is analysed (presumably autumn, per the banner above).
alldata = s4
data = [x for x in alldata if x > low_filter]
obs_freq = fp.calculate_data_freq(zpoints, data)

print "Observations freq", obs_freq
# stats.describe returns (nobs, (min, max), mean, variance, skewness, kurtosis).
# NOTE(review): this rebinds the builtins min and max at module level.
nobs, (min, max), mean, variance, s, k = stats.describe(data)
std = math.sqrt(variance)
print "Nobs", nobs
print "Mean", mean
print "Variance", variance
print

# Expected probabilities under a normal distribution with the sample mean and std.
exp_prob = fp.calculate_exp_prob(zpoints, lambda x: stats.norm.cdf(x, mean, std))
예제 #9
0
def predict_patient_flow(ages_estimator, admit_count_estimator, rlos_estimator, day_patients_prob,
                         model_count=1, history_count=1, sline_list=None, days=30):
    if sline_list is None:
        sline_list = []
        for common_sline in ages_estimator.keys():
            found = False
            for sex, age, sline in admit_count_estimator.keys():
                if sline == common_sline:
                    found = True
                    break
            if not found:
                continue
            found = False
            for sex, age, sline in rlos_estimator.keys():
                if sline == common_sline:
                    found = True
                    break
            if not found:
                continue
            found = False
            for sex, age, sline in day_patients_prob.keys():
                if sline == common_sline:
                    found = True
                    break
            if not found:
                continue
            sline_list.append(sline)

    # dataset indexes to make dataset identifiers
    model_index = 1
    history_index = 1

    result = []
    for sline in sline_list:
        for sex in [2, 3]:
            for age in [2, 3, 4, 5]:
                tuple = (sex, age, sline)

                if tuple not in admit_count_estimator \
                        or tuple not in rlos_estimator \
                        or tuple not in day_patients_prob:
                    print "Cannot find all estimations for sex %d, age %d, SL %s" % tuple
                    continue

                # add historic data
                for it in xrange(history_count):
                    result.extend(historic_data(tuple, days))
                    history_index += 1

                # model patient flow
                for it in xrange(model_count):
                    rlos_flow_func = lambda: [int(round(l[0]))
                                              for l in rlos_estimator[tuple].sample(100).tolist()]
                    rlos_flow = rlos_flow_func()
                    age_flow_func = lambda: [a for a in ages_estimator[sline](500) if fp.split_age(a) == age]
                    age_flow = recall_if_empty(age_flow_func)
                    admit_count_func = lambda: [int(round(l[0]))
                                                for l in admit_count_estimator[tuple].sample(100).tolist()]
                    admit_flow = admit_count_func()

                    for iday in xrange(days):
                        if day_patients_prob[tuple] == 1.0 or random.random() <= day_patients_prob[tuple]:
                            pat_count = admit_flow.pop()
                            if len(admit_flow) == 0:
                                admit_flow = admit_count_func()
                            for p in xrange(pat_count):
                                id = "M%02d (%d, %d, %s)" % (model_index, sex, age, sline)
                                result.append(
                                    (id, str(iday+1), sex, age_flow.pop(), sline, rlos_flow.pop()))
                                if len(rlos_flow) == 0:
                                    rlos_flow = rlos_flow_func()
                                if len(age_flow) == 0:
                                    age_flow = age_flow_func()

                    model_index += 1
    return result
예제 #10
0
def calc_day_patients_prob():
    """Return ``fp.get_patients_freq`` evaluated on the module-level ``raw_data``."""
    day_patients_prob = fp.get_patients_freq(raw_data)
    return day_patients_prob
예제 #11
0
__author__ = 'Andrew'
import FakePatients as fp
import matplotlib.pyplot as plt
from sklearn.neighbors.kde import KernelDensity
import numpy as np
import random
import csv
from datetime import timedelta
import math

training_threshold = 10
alert_count = 50
raw_data, missed_drg = fp.load_data_with_sline()
data = fp.change_to_dict(fp.filter_incomplete_data(raw_data))
print "Rows with following DRG were skipped:",
print missed_drg
print "Filter out %d of %d" % (len(raw_data) - len(data), len(raw_data))

def history(all_data, (sex, age, sline), days=30):
    """Return historical data for selected combination of (sex, age, sline)"""
    start_date = None
    end_date = None
    hist_data = {}
    for row in all_data:
        admit_date = row[2]
        agef_in_years = int(row[9])
        agef = fp.split_age(agef_in_years)
        sexf = int(row[10])
        slinef = row[14]
        rlos = row[5]