コード例 #1
0
def test_surgery_usage_regression_df():
    """Build the surgery/usage regression frame for cardiac cases and extract features.

    Loads the surgery and usage frames through ``ScmAnalytics``, restricts
    them to the "Cardiac Surgery" service, builds the regression frame for two
    hard-coded item ids and extracts a fixed list of features and interactions.

    NOTE(review): the statements after ``extract_features`` read ``day_df``,
    ``default_day_df`` and ``initial_inventory``, none of which are defined in
    this function. They look like the tail of a different (inventory
    simulation) function that was spliced in - confirm against the original
    source before relying on the return value.
    """
    case_service = "Cardiac Surgery"

    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    surgery_df = analytics.surgery_df
    usage_df = analytics.usage_df
    item_ids = ["38242", "129636"]

    # Restrict both frames to the chosen service, then keep only the surgeries
    # whose event_id also appears in the (filtered) usage records.
    surgery_df = surgery_df[surgery_df["case_service"] == case_service]
    usage_df = usage_df[usage_df["case_service"] == case_service]
    surgery_df = surgery_df[surgery_df["event_id"].isin(
        set(usage_df["event_id"]))]

    # Union of the per-surgery procedure sets (raises TypeError when the
    # filtered frame is empty because set.union then receives no arguments).
    all_procedures = set.union(*surgery_df["procedures"])
    r_df = SURegressionModel.surgery_usage_regression_df(surgery_df,
                                                         usage_df,
                                                         item_ids=item_ids)

    # print(r_df.iloc[0])

    # Hand-picked procedure pairs and single procedures used as features.
    interactions = [("cabg double", "esvh"), ("ita", "esvh")]
    features = ["cabg double", "esvh", "ita", "cabg single"]

    # print(all_procedures)
    x, feature_df = SURegressionModel.extract_features(r_df, features,
                                                       all_procedures,
                                                       interactions)
    # NOTE(review): everything below depends on names that are not defined in
    # this function (day_df, default_day_df, initial_inventory) - see docstring.
    usages = list(day_df["real_usage"])
    # Resample one observed usage value (with replacement) per row.
    default_day_df["real_usage"] = list(
        np.random.choice(usages) for i in range(len(default_day_df)))
    # Daily net change = received quantity minus resampled usage; the running
    # sum plus the initial stock gives the inventory level.
    default_day_df[
        "change"] = day_df["received_qty"] - default_day_df["real_usage"]
    default_day_df["inventory_level"] = default_day_df["change"].cumsum()
    default_day_df["inventory_level"] = default_day_df[
        "inventory_level"] + initial_inventory
    return default_day_df["inventory_level"]


# Module-level setup: load the cardiac-surgery data and build the regression
# frame for a single item.
case_service = "Cardiac Surgery"
item_id = "38242"
trials = 1
item_ids = [item_id]

analytics = ScmAnalytics.ScmAnalytics(lhs_config)
surgery_df = analytics.surgery_df
usage_df = analytics.usage_df

# Surgeries of the chosen service, one row per event (last occurrence wins).
surgery_df = surgery_df[surgery_df["case_service"] == case_service] \
    .drop_duplicates("event_id", keep="last")

# Usage rows of the same service; surgeries without any usage row are dropped.
usage_df = usage_df[usage_df["case_service"] == case_service]
surgery_df = surgery_df[surgery_df["event_id"].isin(set(usage_df["event_id"]))]

# Procedure names get underscores instead of spaces.
surgery_df["procedures"] = surgery_df["procedures"].apply(
    lambda names: {name.replace(" ", "_") for name in names})

all_procedures = set.union(*surgery_df["procedures"])
r_df = SURegressionModel.surgery_usage_regression_df(
    surgery_df, usage_df, item_ids=item_ids)
def run(case_service="Cardiac Surgery", item_id="1686"):
    """Summarise per-procedure-set usage of one item and plot rolling usage.

    Loads usage and surgery data for ``case_service``, attaches the used
    quantity of ``item_id`` to every surgery (0 where the item was not used),
    groups surgeries by their exact procedure set and keeps the sets with more
    than 25 occurrences. Saves ``"<item_id>_rolling_usage.png"`` with the
    100- and 50-surgery rolling means and builds one plotly histogram trace
    per retained procedure set. Nothing is returned.
    """
    scm = ScmAnalytics.ScmAnalytics(lhs_config)
    service_filter = [dict(dim="case_service", op="eq", val=case_service)]

    # Usage side: dated rows of the requested service.
    service_usage = scm.usage_df
    service_usage = service_usage[service_usage["start_date"].notna()]
    service_usage = Analytics.process_filters(service_usage,
                                              filters=service_filter)
    events_with_usage = set(service_usage["event_id"])
    item_usage = service_usage[service_usage["item_id"] == item_id]

    # Surgery side: dated rows after 2016-01-01 of the requested service that
    # have at least one usage record.
    cases = pre_process_columns(scm.surgery_df)
    cases = cases[cases["start_date"].notna()]
    cases = cases[cases["start_date"] > datetime.date(2016, 1, 1)]
    cases = Analytics.process_filters(cases, filters=service_filter)
    cases = cases[cases["event_id"].isin(events_with_usage)]

    # Attach the item's used quantity; surgeries without the item get 0.
    qty_by_event = item_usage.set_index("event_id")[["used_qty"]]
    cases = cases.join(qty_by_event, on="event_id", how="left").fillna(0)
    # frozenset makes the procedure set hashable so it can be a group key.
    cases["procedures"] = cases["procedures"].apply(frozenset)

    usage_dist = cases.groupby(["procedures"]) \
        .agg({"used_qty": lambda qty: list(qty)}) \
        .reset_index()
    usage_dist["occurrences"] = usage_dist["used_qty"].apply(len)
    usage_dist = usage_dist[usage_dist["occurrences"] > 25]
    usage_dist["mean"] = usage_dist["used_qty"].apply(
        lambda qty: np.mean(qty))
    usage_dist["variance"] = usage_dist["used_qty"].apply(
        lambda qty: np.var(qty, ddof=1))
    usage_dist["var/mean"] = usage_dist["variance"] / usage_dist["mean"]

    # Rolling mean usage over the surgeries belonging to the retained sets.
    common_cases = cases[cases["procedures"].isin(
        usage_dist["procedures"])][["start_date", "used_qty"]]
    for window in (100, 50):
        smoothed = common_cases[["used_qty"]].rolling(window).mean()
        plt.plot(list(smoothed["used_qty"]))
    plt.savefig("{}_rolling_usage.png".format(item_id), format="png")

    # One probability histogram trace per retained procedure set.
    traces = []
    x_max = 0
    for case, data in zip(usage_dist["procedures"], usage_dist["used_qty"]):
        end = max(data) + 1
        traces.append(
            go.Histogram(x=data,
                         name=", ".join(case),
                         xbins=dict(start=0, end=end, size=1),
                         histnorm='probability',
                         opacity=0.75))
        if end > x_max:
            x_max = int(end)
def _procedure_indicator_df(surgeries):
    """Return one row per event with a 0/1 column for every procedure.

    ``surgeries`` needs an ``event_id`` column and a ``procedures`` column
    holding an iterable of procedure names per row.
    """
    long_df = pd.concat(
        [pd.Series(row['event_id'], row['procedures'])
         for _, row in surgeries.iterrows()]) \
        .reset_index() \
        .rename(columns={"index": "procedure",
                         0: "event_id"}
                )
    long_df["flag"] = 1
    return long_df \
        .pivot(index="event_id", columns="procedure", values="flag") \
        .fillna(0) \
        .reset_index()


def _empirical_info_rv(samples):
    """Build a pacal discrete distribution from the relative frequencies of ``samples``."""
    info_df = pd.DataFrame({"info": samples, "count": [1] * len(samples)}) \
        .groupby(["info"]) \
        .agg({"count": "count"}) \
        .reset_index()
    info_df["p"] = info_df["count"] / sum(info_df["count"])
    return pacal.DiscreteDistr(info_df["info"], info_df["p"])


def _truncate_upper_tail(info_rv, eps):
    """Cut the upper tail of a discrete pacal distribution.

    Finds the first support point ``a`` with ``1 - cdf(a) < eps`` and rebuilds
    the distribution from the positive-mass points of
    ``pacal.CondLtDistr(info_rv, a)``. 999 is used as the cut-off when no
    support point satisfies the condition.
    """
    max_v = 999
    for d in info_rv.get_piecewise_pdf().getDiracs():
        if 1 - info_rv.cdf(d.a) < eps:
            max_v = d.a
            break
    diracs = pacal.CondLtDistr(info_rv, max_v) \
        .get_piecewise_pdf().getDiracs()
    diracs = [d for d in diracs if d.f > 0]
    return pacal.DiscreteDistr([d.a for d in diracs], [d.f for d in diracs])


def boostrap_info_process(item_id="38242"):
    """Bootstrap daily "info state" distributions of one item for cardiac surgery.

    Steps, all driven by data loaded through ``ScmAnalytics``:

    1. Fit a binomial ``(n, p)`` to the weekday elective surgeries-per-day
       data and take the mean of the urgent surgeries-per-day distribution.
    2. Generate 1000 synthetic surgeries (procedure count and procedures are
       sampled from empirical frequencies) and compute each surgery's expected
       usage as ``exp(features . coefficients)`` using the regression results
       stored in ``regression_results/<item_id>``.
    3. Compute the same expected usage for the empirical surgeries and plot
       both per-surgery histograms.
    4. Simulate 100000 days of elective (binomial count) and emergency
       (Poisson count) demand by summing draws of the empirical per-surgery
       distribution, round to ``info_granularity``, plot both histograms.
    5. Turn the samples into discrete distributions, truncate their upper
       tails at ``eps_trunk`` and pickle them under the elective/emergency
       output directories (created if missing).

    Args:
        item_id: item whose regression results are read and whose
            distributions are written.

    Returns:
        Tuple ``(emergency_trace, weekday_elective_trace)`` of plotly
        histogram traces.
    """
    case_service = "Cardiac Surgery"
    info_granularity = 1
    eps_trunk = 1e-3

    elective_outdir = "scm_implementation/ns_info_state_rvs/elective"
    emergency_outdir = "scm_implementation/ns_info_state_rvs/emergency"

    analytics = ScmAnalytics.ScmAnalytics(lhs_config)

    elective_filter = [{
        "dim": "urgent_elective",
        "op": "eq",
        "val": "Elective"
    }]
    emergency_filter = [{
        "dim": "urgent_elective",
        "op": "eq",
        "val": "Urgent"
    }]
    case_service_filter = [{
        "dim": "case_service",
        "op": "eq",
        "val": case_service
    }]

    # --- Weekday elective surgeries per day -> binomial(n, p) -------------
    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=elective_filter +
                                           case_service_filter)
    dist_df = surgeries_per_day_distribution(surgery_df,
                                             day_group_by="is_weekday",
                                             filters=[])
    weekday_counts = dist_df.set_index("is_weekday").loc[True]["data"]
    # n is the largest observed daily count; p matches the observed mean.
    n = int(max(weekday_counts))
    p = np.mean(weekday_counts) / n

    # --- Urgent surgeries per day -> Poisson mean -------------------------
    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=emergency_filter +
                                           case_service_filter)
    dist_df = surgeries_per_day_distribution(surgery_df, filters=[])
    emergency_surgeries_mean = np.mean(dist_df)

    # --- Distribution of the number of procedures per surgery -------------
    surgery_df = Analytics.process_filters(analytics.surgery_df,
                                           filters=case_service_filter)
    surgery_df["procedure_count"] = surgery_df["procedures"].apply(len)
    procedure_count_df = surgery_df.groupby("procedure_count").agg({
        "event_id":
        "count"
    }).reset_index()
    procedure_count_df = procedure_count_df[
        procedure_count_df["procedure_count"] != 6]
    # Weight each procedure count by how many surgeries have it (the
    # "event_id" column holds that count after the aggregation). The previous
    # code divided the procedure_count values themselves, which made
    # P(k) proportional to k and ignored the observed frequencies.
    procedure_count_df["p"] = procedure_count_df["event_id"] / sum(
        procedure_count_df["event_id"])
    procedure_count_rv = pacal.DiscreteDistr(
        procedure_count_df["procedure_count"], procedure_count_df["p"])

    # --- Procedure weights -------------------------------------------------
    usage_events = set(analytics.usage_df["event_id"])
    surgery_df = analytics.surgery_df[analytics.surgery_df["event_id"].isin(
        usage_events)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=case_service_filter)
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: set(e.replace(" ", "_") for e in x))
    flat_procedures = [
        name for names in surgery_df["procedures"] for name in names
    ]
    procedures = pd \
        .DataFrame({"procedure": flat_procedures,
                    "count": [1] * len(flat_procedures)}) \
        .groupby("procedure") \
        .agg({"count": "count"}) \
        .reset_index()

    procedures["p"] = procedures["count"] / sum(procedures["count"])

    def procedure_pick_rv(size):
        # Draw `size` distinct procedures, weighted by empirical frequency.
        return np.random.choice(procedures["procedure"],
                                p=procedures["p"],
                                replace=False,
                                size=size)

    # --- Synthetic surgeries and their expected usage ---------------------
    synthetic_surgeries = pd.DataFrame({"event_id": list(range(1000))})
    synthetic_surgeries["procedure_count"] = procedure_count_rv.rand(1000)
    synthetic_surgeries["procedures"] = synthetic_surgeries[
        "procedure_count"].apply(lambda x: procedure_pick_rv(x))
    synthetic_surgeries_df = _procedure_indicator_df(synthetic_surgeries)

    feature_df = pd.read_csv(os.path.join("regression_results", item_id))
    features = feature_df["feature"]
    # Feature names containing "." are interactions; the rest are procedures.
    featured_procedures = list(
        filter(lambda x: "." not in x, feature_df["feature"]))
    if "other" in featured_procedures:
        featured_procedures.remove("other")
    for fp in featured_procedures:
        if fp not in synthetic_surgeries_df:
            # Procedure never sampled: report its weight and add a zero column.
            print(procedures.set_index("procedure").loc[fp])
            synthetic_surgeries_df[fp] = 0

    interactions = list(filter(lambda x: "." in x, feature_df["feature"]))
    interactions = list(Interaction(i.split(".")) for i in interactions)
    data, _ = SURegressionModel.extract_features_data(synthetic_surgeries_df,
                                                      featured_procedures, [],
                                                      interactions,
                                                      other=True)

    for f in feature_df["feature"]:
        if f not in data:
            print(f)
            data[f] = 0
    synthetic_surgeries_df["feature_vector"] = data[features].values.tolist()
    coeff = np.array(feature_df["estimate"])
    # Log-linear model: expected usage = exp(x . beta).
    synthetic_surgeries_df["expected_usage"] = synthetic_surgeries_df["feature_vector"] \
        .apply(lambda x: np.exp(np.dot(x, coeff)))

    # --- Information rv for empirical surgeries ---------------------------
    surgery_df = surgery_df.drop_duplicates("event_id", keep="last")
    empirical_surgeries_df = _procedure_indicator_df(surgery_df)
    data, _ = SURegressionModel.extract_features_data(empirical_surgeries_df,
                                                      featured_procedures, [],
                                                      interactions,
                                                      other=True)
    empirical_surgeries_df["feature_vector"] = data[features].values.tolist()
    empirical_surgeries_df["expected_usage"] = empirical_surgeries_df["feature_vector"] \
        .apply(lambda x: np.exp(np.dot(x, coeff)))

    # --- Plotly histogram: per-surgery info rv, empirical vs synthetic ----
    s = 0
    e = int(
        max(max(empirical_surgeries_df["expected_usage"]),
            max(synthetic_surgeries_df["expected_usage"])) + 1)
    empirical_trace = go.Histogram(
        x=empirical_surgeries_df["expected_usage"],
        name='Empirical Surgery Info RV (mean={:0.2f})'.format(
            np.mean(empirical_surgeries_df["expected_usage"])),
        xbins=dict(start=s, end=e, size=info_granularity),
        histnorm='probability density',
        opacity=0.75)
    synthetic_trace = go.Histogram(
        x=synthetic_surgeries_df["expected_usage"],
        name='Synthetic Surgery Info RV (mean={:0.2f})'.format(
            np.mean(synthetic_surgeries_df["expected_usage"])),
        xbins=dict(start=s, end=e, size=info_granularity),
        histnorm='probability density',
        opacity=0.75)
    layout = go.Layout(title="Per Surgery Info R.V Item: {0}".format(item_id),
                       xaxis={'title': 'Info [Expected Usage]'},
                       yaxis={'title': 'Probability Density'})
    figure = go.Figure(data=[empirical_trace, synthetic_trace], layout=layout)
    plot(figure, filename="{0}_Per_Surgery_Info_Rv.html".format(item_id))

    # --- Plotly histogram: per-weekday elective info rv -------------------
    empirical_rv_df = empirical_surgeries_df.groupby(["expected_usage"]) \
        .agg({"event_id": "count"}) \
        .rename(columns={"event_id": "count"}) \
        .reset_index()
    empirical_rv_df["p"] = empirical_rv_df["count"] / sum(
        empirical_rv_df["count"])
    emp_surgery_rv = pacal.DiscreteDistr(empirical_rv_df["expected_usage"],
                                         empirical_rv_df["p"])
    days = 100000
    # Each simulated day: binomial number of surgeries, summed per-surgery info.
    elective_samples = [
        sum(emp_surgery_rv.rand(x)) for x in np.random.binomial(n, p, days)
    ]
    elective_samples = [
        round(sample / info_granularity) * info_granularity
        for sample in elective_samples
    ]
    weekday_elective_trace = go.Histogram(
        x=elective_samples,
        name='{} Elective Info RV (mean={:0.2f})'.format(
            item_id, np.mean(elective_samples)),
        xbins=dict(start=0, end=max(elective_samples), size=info_granularity),
        histnorm='probability',
        opacity=0.75)

    # --- Plotly histogram: per-day emergency info rv ----------------------
    emergency_samples = [
        sum(emp_surgery_rv.rand(x))
        for x in np.random.poisson(emergency_surgeries_mean, days)
    ]
    emergency_samples = [
        round(sample / info_granularity) * info_granularity
        for sample in emergency_samples
    ]
    emergency_trace = go.Histogram(
        x=emergency_samples,
        name='{} Emergency Info RV (mean={:0.2f})'.format(
            item_id, np.mean(emergency_samples)),
        xbins=dict(start=0, end=max(emergency_samples), size=info_granularity),
        histnorm='probability',
        opacity=0.75)
    layout = go.Layout(
        title="Weekday Elective Info R.V Item: {0}".format(item_id),
        xaxis={'title': 'Info State (Poisson Usage)]'},
        yaxis={'title': 'Probability'})
    figure = go.Figure(data=[weekday_elective_trace, emergency_trace],
                       layout=layout)
    plot(figure, filename="{0}_Weekday_Elective_Info_Rv.html".format(item_id))

    # --- Discrete distributions, tail truncation, persistence -------------
    elective_info_rv = _truncate_upper_tail(
        _empirical_info_rv(elective_samples), eps_trunk)
    emergency_info_rv = _truncate_upper_tail(
        _empirical_info_rv(emergency_samples), eps_trunk)

    # Make sure the target directories exist before writing the pickles.
    os.makedirs(elective_outdir, exist_ok=True)
    os.makedirs(emergency_outdir, exist_ok=True)

    with open(os.path.join(elective_outdir, "{0}.pickle".format(item_id)),
              "wb") as f:
        pickle.dump(elective_info_rv, f)

    with open(os.path.join(emergency_outdir, "{0}.pickle".format(item_id)),
              "wb") as f:
        pickle.dump(emergency_info_rv, f)

    return emergency_trace, weekday_elective_trace
コード例 #5
0
def test_usage_r_regression_flow():
    """Fit the item-usage regression for item "38242" with iterative feature pruning.

    Builds the regression frame for cardiac surgeries, starts from every
    procedure plus every pairwise interaction, drops features occurring fewer
    than ``occ_thres`` times, then repeatedly fits a gaussian model and removes
    features with ``p.value > pthres`` or too few occurrences until none of
    the checked features fails. The surviving features are refitted with a
    poisson model; the coefficients are written to
    ``regression_results/<item_id>`` and the design matrix to
    ``r_scripts/test_data2.csv``.

    Side effects: changes global pandas display options and disables the
    chained-assignment warning.
    """
    from scm_analytics.model.SurgeryUsageRegressionModel import Interaction
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.options.mode.chained_assignment = None

    case_service = "Cardiac Surgery"
    item_id = "38242"
    pthres = 0.05
    occ_thres = 5

    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    surgery_df = analytics.surgery_df
    usage_df = analytics.usage_df
    item_ids = ["38242", "129636"]

    # Same service on both sides; surgeries must have at least one usage row.
    surgery_df = surgery_df[surgery_df["case_service"] == case_service]
    usage_df = usage_df[usage_df["case_service"] == case_service]
    surgery_df = surgery_df[surgery_df["event_id"].isin(
        set(usage_df["event_id"]))]
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: set(e.replace(" ", "_") for e in x))

    all_procedures = set.union(*surgery_df["procedures"])
    r_df = SURegressionModel.surgery_usage_regression_df(surgery_df,
                                                         usage_df,
                                                         item_ids=item_ids)

    # Candidate features: every procedure and every unordered procedure pair.
    features = sorted(all_procedures)
    interactions = [
        Interaction((p1, p2)) for p1, p2 in combinations(features, 2)
    ]

    data, feature_df = SURegressionModel.extract_features_data(
        r_df,
        features,
        all_procedures,
        interactions,
        other=True,
        sum_others=False)
    print(feature_df)
    feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
    # Build the lookup set once instead of once per candidate feature.
    surviving = set(feature_df["feature"])
    interactions = [i for i in interactions if str(i) in surviving]
    features = [f for f in features if f in surviving]

    while True:
        data, feature_df = SURegressionModel.extract_features_data(
            r_df,
            features,
            all_procedures,
            interactions,
            other=True,
            sum_others=False)
        data["y"] = list(r_df[item_id])
        feature_df, r2, _ = SURegressionModel.run_r_regression(
            data, feature_df, model="gaussian")
        print(feature_df)
        print("r2:", r2)
        # NOTE(review): `interactions` holds Interaction objects while the
        # "feature" column is matched elsewhere via str(interaction); whether
        # isin() matches them here depends on Interaction's __eq__/__hash__ -
        # confirm that interactions take part in this stopping check.
        thres_df = feature_df[feature_df["feature"].isin(features +
                                                         interactions)]
        if thres_df[thres_df["p.value"] > pthres].empty and thres_df[
                thres_df["occurrence"] < occ_thres].empty:
            break

        feature_df = feature_df[feature_df["p.value"] <= pthres]
        feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
        surviving = set(feature_df["feature"])
        interactions = [i for i in interactions if str(i) in surviving]
        features = [f for f in features if f in surviving]

    # Final fit: same design matrix, poisson model.
    feature_df = feature_df[["feature", "occurrence"]]
    feature_df, r2, _ = SURegressionModel.run_r_regression(data,
                                                           feature_df,
                                                           model="poisson")
    feature_df.to_csv(os.path.join("regression_results", item_id), index=False)
    data.to_csv(os.path.join("r_scripts", "test_data2.csv"), index=False)
    print(feature_df)
    print("r2:", r2)
コード例 #6
0
def run(case_service="Cardiac Surgery", item_id="1686", procedure_set=None):
    """Plot the empirical usage distribution of one item for one procedure set.

    Loads usage and surgery data for ``case_service``, attaches the used
    quantity of ``item_id`` to every surgery (0 where the item was not used),
    keeps the surgeries whose procedure set equals ``procedure_set`` and saves
    a density histogram of the used quantity as
    ``Usage_Dist_item_<item_id>_<procedures>.svg`` and ``.eps``.

    Args:
        case_service: value of the ``case_service`` dimension to filter on.
        item_id: item whose used quantity is plotted.
        procedure_set: set of procedure names identifying the surgery type.

    Raises:
        ValueError: if ``procedure_set`` is None or no surgery matches it.
            (Both cases previously failed later with an uninformative
            ValueError from ``max()`` on an empty sequence.)
    """
    import matplotlib

    if procedure_set is None:
        raise ValueError("procedure_set must be a set of procedure names")

    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    case_service_filter = [{
        "dim": "case_service",
        "op": "eq",
        "val": case_service
    }]

    usage_df = analytics.usage_df
    usage_df = usage_df[usage_df["start_date"].notna()]
    usage_df = Analytics.process_filters(usage_df, filters=case_service_filter)
    usage_events = set(usage_df["event_id"])
    item_usage_df = usage_df[usage_df["item_id"] == item_id]

    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=case_service_filter)
    surgery_df = surgery_df[surgery_df["event_id"].isin(usage_events)]

    # Attach the item's used quantity; surgeries without the item get 0.
    surgery_df = surgery_df.join(
        item_usage_df.set_index("event_id")[["used_qty"]],
        on="event_id",
        how="left").fillna(0)
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: frozenset(x))
    surgery_df = surgery_df[surgery_df["procedures"] == procedure_set]
    if surgery_df.empty:
        raise ValueError(
            "no surgeries found for procedure set: {}".format(
                ", ".join(procedure_set)))

    x_max = int(max(surgery_df["used_qty"])) + 1

    data = surgery_df["used_qty"]
    fn = "__".join(procedure_set)
    fn = "Usage_Dist_item_" + item_id + "_" + fn.replace(" ", "_")

    matplotlib.rcParams.update({'font.size': 12})
    fig = plt.figure(figsize=(4, 3.5))
    plt.tight_layout()
    plt.gcf().subplots_adjust(bottom=0.15, left=0.15)

    # Unit-width bins 0..x_max; with density=True the bar heights sum to 1.
    n, _, _ = plt.hist(data,
                       range(x_max + 1),
                       density=True,
                       facecolor='#08306b',
                       rwidth=0.95)

    # About four y ticks. For flat distributions the rounded spacing can be
    # 0.0, which would make np.arange fail, so fall back to 0.1.
    spacing = np.round((max(n) + 0.1) / 4, decimals=1)
    if spacing <= 0:
        spacing = 0.1
    plt.yticks(np.arange(0, max(n) + 0.1, spacing))
    plt.ylabel("Probability")
    plt.xlabel("Used Quantity")
    plt.xticks(range(x_max + 1))
    plt.savefig(fn + ".svg", format='svg')
    plt.savefig(fn + ".eps", format='eps')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
コード例 #7
0
def test_usage_r_regression_flow(item_id=None, save_results=False):
    """Fit the item-usage regression for one item and plot its residuals.

    Builds the regression frame for cardiac surgeries, prunes procedure and
    pairwise-interaction features (fewer than ``occ_thres`` occurrences or
    gaussian ``p.value > pthres``) by refitting until no checked feature
    fails, refits the survivors with a poisson model, writes the coefficients
    to ``regression_results/<item_id>`` and the design matrix to
    ``r_scripts/test_data2.csv``, and plots the residuals with plotly and
    matplotlib.

    Args:
        item_id: item to model; "38242" is used when falsy.
        save_results: when true, the matplotlib residual histogram is saved
            as a PNG.

    Returns:
        dict with ``item_id``, ``r2`` and the usage statistics ``usage_p``,
        ``mean_usage``, ``max_usage``, ``trim_thres`` and ``discard_ratio``.

    Side effects: changes global pandas display options and disables the
    chained-assignment warning.
    """
    # Resolve the default first so the summary records the item that is
    # actually modelled (it used to store None when no item_id was passed).
    item_id = item_id if item_id else "38242"
    summary = {"item_id": item_id}

    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.options.mode.chained_assignment = None

    case_service = "Cardiac Surgery"
    pthres = 0.05
    occ_thres = 5
    tail_trim = 0.01

    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    surgery_df = analytics.surgery_df
    usage_df = analytics.usage_df
    item_ids = [item_id]

    surgery_df = surgery_df[surgery_df["case_service"] == case_service]
    usage_df = usage_df[usage_df["case_service"] == case_service]

    surgery_df = surgery_df[surgery_df["event_id"].isin(
        set(usage_df["event_id"]))]
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: set(e.replace(" ", "_") for e in x))

    all_procedures = set.union(*surgery_df["procedures"])
    r_df = SURegressionModel.surgery_usage_regression_df(surgery_df,
                                                         usage_df,
                                                         item_ids=item_ids)

    if tail_trim:
        # Descriptive usage statistics and the upper-tail threshold.
        # NOTE(review): the trimmed usage_df is not used after this block
        # (r_df was built above), so the trim only affects the reported
        # statistics - confirm whether the regression was meant to use it.
        usage_df = usage_df[usage_df["item_id"] == item_id]
        trim_index = int(len(r_df) * (1 - tail_trim))
        expected_usage = np.mean(r_df[item_id])
        max_usage = max(r_df[item_id])
        usage_prob = len(usage_df) / len(surgery_df)

        trim_thres = r_df.sort_values(by=[item_id])[item_id].iloc[trim_index]
        discard_ratio = len(
            usage_df[usage_df["used_qty"] > trim_thres]) / len(usage_df)
        usage_df = usage_df[usage_df["used_qty"] <= trim_thres]

        print("Usage Probability:", usage_prob)
        print("Mean Usage:", expected_usage)
        print("Max Usage:", max_usage)
        print("Trim Threshold:", trim_thres)
        print("Discard Ratio:", discard_ratio)
        summary["usage_p"] = usage_prob
        summary["mean_usage"] = expected_usage
        summary["max_usage"] = max_usage
        summary["trim_thres"] = trim_thres
        summary["discard_ratio"] = discard_ratio

    # Candidate features: every procedure and every unordered procedure pair.
    features = sorted(all_procedures)
    interactions = [
        Interaction((p1, p2)) for p1, p2 in combinations(features, 2)
    ]

    data, feature_df = SURegressionModel.extract_features_data(
        r_df,
        features,
        all_procedures,
        interactions,
        other=True,
        sum_others=False)

    print(feature_df)
    feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
    # Build the lookup set once instead of once per candidate feature.
    surviving = set(feature_df["feature"])
    interactions = [i for i in interactions if str(i) in surviving]
    features = [f for f in features if f in surviving]

    while True:
        data, feature_df = SURegressionModel.extract_features_data(
            r_df,
            features,
            all_procedures,
            interactions,
            other=True,
            sum_others=False)
        data["y"] = list(r_df[item_id])
        feature_df, r2, _ = SURegressionModel.run_r_regression(
            data, feature_df, model="gaussian")
        print(feature_df)
        print("r2:", r2)
        # NOTE(review): `interactions` holds Interaction objects while the
        # "feature" column is matched elsewhere via str(interaction); whether
        # isin() matches them here depends on Interaction's __eq__/__hash__ -
        # confirm that interactions take part in this stopping check.
        thres_df = feature_df[feature_df["feature"].isin(features +
                                                         interactions)]
        if thres_df[thres_df["p.value"] > pthres].empty and thres_df[
                thres_df["occurrence"] < occ_thres].empty:
            break

        feature_df = feature_df[feature_df["p.value"] <= pthres]
        feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
        surviving = set(feature_df["feature"])
        interactions = [i for i in interactions if str(i) in surviving]
        features = [f for f in features if f in surviving]

    # Final fit: same design matrix, poisson model.
    feature_df = feature_df[["feature", "occurrence"]]
    feature_df, r2, fitted_y = SURegressionModel.run_r_regression(
        data, feature_df, model="poisson")
    residuals = fitted_y - data["y"]
    # Baseline: residuals of a model that always predicts the mean usage.
    constant_residuals = np.mean(data["y"]) - data["y"]
    feature_df.to_csv(os.path.join("regression_results", item_id), index=False)
    data.to_csv(os.path.join("r_scripts", "test_data2.csv"), index=False)
    print(feature_df)
    print("r2:", r2)
    summary["r2"] = r2

    # Histogram grid: bins of width `step` centred on multiples of `step`.
    step = 0.5
    s = np.floor(min(residuals)) - step / 2
    e = np.ceil(max(residuals)) + step / 2

    mu = np.mean(residuals)
    std = np.std(residuals, ddof=1)
    bins = np.arange(s, e, step)
    norm_x = np.arange(s, e, step / 10)

    traces = [
        go.Histogram(x=residuals,
                     name='Poisson Residuals (Fit - Empirical)',
                     xbins=dict(start=s, end=e, size=step),
                     histnorm='probability density',
                     opacity=0.75),
        go.Scatter(
            x=norm_x,
            y=stats.norm.pdf(norm_x, mu, std),
            mode='lines',
            name='Poisson Residuals mu={0:.5f}, sigma={1:.2f}'.format(mu, std),
        ),
        go.Histogram(x=constant_residuals,
                     name='Constant Model Residuals (Fit - Empirical)',
                     xbins=dict(start=s, end=e, size=step),
                     histnorm='probability density',
                     opacity=0.75),
        go.Scatter(
            x=norm_x,
            y=stats.norm.pdf(norm_x, 0, np.std(data["y"], ddof=1)),
            mode='lines',
            name='Constant Residuals, sigma={0:.2f}'.format(
                np.std(data["y"], ddof=1)),
        )
    ]
    layout = go.Layout(title="Residuals",
                       xaxis={'title': 'Residual'},
                       yaxis={'title': 'Probability Density'})
    figure = go.Figure(data=traces, layout=layout)
    plot(figure, filename="{0}_residuals_r2_{1:0.2f}.html".format(item_id, r2))

    plt.hist(residuals,
             bins=bins,
             density=True,
             rwidth=0.96,
             alpha=0.5,
             label="Residuals 'fit - empirical'")
    plt.plot(norm_x,
             stats.norm.pdf(norm_x, mu, std),
             label="mu={0:.5f}, sigma={1:.2f}".format(mu, std))
    plt.title("Residuals Histogram from Regression Model")
    plt.ylabel("Probability Density")
    plt.xlabel("Residual")
    plt.legend()
    if save_results:
        plt.savefig("{0}_surgery_item_usage_residuals_r2_{1:0.2f}.png".format(
            item_id, r2),
                    format="png")
    return summary