Example #1
0
def createFeatures():
    """
    Sample script function showing how widen_on_fields and
    avg_and_max_fields are combined into one feature table.

    Reads the tables stored in "HHP_release3.h5", builds drug/lab counts,
    per-field claim counts and PayDelay aggregates, outer-joins them with
    the days-in-hospital tables, and writes the result back to the same
    store under the "features" key.

    Returns None; the result is the rewritten store.
    """
    data   =  clean.readH5Store("HHP_release3.h5")
    claim  = data["claim"]
    drug   = data["drug"]
    lab    = data["lab"]
    member = data["member"]

    # Tag each days-in-hospital table with the Year label it is joined on.
    # Assigning a scalar to a copy labels every row whatever the frame's
    # index looks like (joining a freshly built frame only lines up with a
    # 0..n-1 index) and leaves the tables held in `data` untouched.
    dih = data["dih"].copy()
    dih["Year"] = "Y1"
    dih_3 = data["dih_y3"].copy()
    dih_3["Year"] = "Y2"
    # pd.concat stacks the two tables; DataFrame.append is gone from
    # current pandas releases.
    days_in_hospital = pd.concat([dih, dih_3])
    days_in_hospital = days_in_hospital.set_index(["MemberID","Year"])
    days_in_hospital.columns =  ["NextYearTruncated","Target"]

    drug   = member.merge(drug,on="MemberID")
    drug_lab = drug.merge(lab,how="outer",on=["MemberID","Year","DSFS"])
    drug_start = datetime.datetime.now()
    drug_lab_count = count_drug_lab(drug_lab)# now indexed by MemberID and Year
    drug_end   = datetime.datetime.now()
    # total_seconds() reports the whole duration; .seconds would silently
    # drop any full days from the elapsed time.
    logging.debug("drug_done.  IT took %d seconds", (drug_end - drug_start).total_seconds())

    # The nested list asks for a joint count over Specialty and PlaceSvc.
    claim_counting_fields = [["Specialty","PlaceSvc"],"LengthOfStay", "PrimaryConditionGroup", "CharlsonIndex","ProcedureGroup"]
    widen_start = datetime.datetime.now()
    claims_counted = widen_on_fields(claim,claim_counting_fields)
    widen_end = datetime.datetime.now()
    logging.debug("widen_done.  it took %d seconds", (widen_end - widen_start).total_seconds())

    avg_fields = ["PayDelay"]
    avg_start = datetime.datetime.now()
    avg_frame = avg_and_max_fields(claim,avg_fields)
    avg_end = datetime.datetime.now()
    logging.debug("avg_done.  it took %d seconds", (avg_end - avg_start).total_seconds())

    # Outer joins keep every (MemberID, Year) key that appears in any of
    # the intermediate tables.
    features_frame = drug_lab_count.join(claims_counted,how="outer").join(avg_frame,how="outer").join(days_in_hospital,how="outer")
    data["features"] = features_frame
    clean.storeAsH5("HHP_release3.h5",data)
Example #2
0
File: test.py Project: szs8/kaggle
import numpy as np
import pandas as pd

import clean

def widen_on_fields(data,fields_counted):
    """
    Count, per member and year, how often each value of the given fields
    occurred.

    data is a DataFrame with "MemberID" and "Year" columns plus every
    column named in fields_counted.  Each entry of fields_counted is either
    a single column name or a list/tuple/ndarray of column names; a group
    of names is cross-tabulated jointly.

    Returns a DataFrame with a MultiIndex on MemberID and Year.  Columns
    are named "<field>_<value>" for a single field and
    "<field1>_<field2>_<value tuple>" for a group; the per-field tables are
    outer-joined.  Returns None when fields_counted is empty.
    """
    rows = [data.MemberID, data.Year]
    res = None
    for field in fields_counted:
        is_group = isinstance(field, (list, tuple, np.ndarray))
        # Plain [] column selection and positional crosstab arguments work
        # on old and new pandas alike; the .ix indexer and the rows=/cols=
        # keywords no longer exist in current releases.
        cols = [data[name] for name in field] if is_group else data[field]
        df = pd.crosstab(rows, cols)
        key = "_".join(field) if is_group else field
        # Prefix every value with its field name so columns coming from
        # different fields cannot collide in the join below.
        df.columns = [key + '_' + str(value) for value in df.columns]
        res = df if res is None else res.join(df, how="outer")
    return res

# Demo run: load the claim table and count every listed field per member
# and year, including the joint Specialty/PlaceSvc combination.
data = clean.readH5Store("HHP_release3.h5")
claim  = data["claim"]
df = widen_on_fields(claim, ["Specialty", "PlaceSvc","LengthOfStay",
                             "PrimaryConditionGroup", "CharlsonIndex",
                             "ProcedureGroup", ["Specialty", "PlaceSvc"]])

# A parenthesised single-argument print gives the same output on Python 2
# and is also valid syntax on Python 3.
print("%d rows" % len(df))
print(df.columns)

#print df[:5]
Example #3
0
def widen_on_fields(data, fields_counted):
    """
    Build a wide table of value counts for each requested field.

    For every entry in fields_counted the rows of data are cross-tabulated
    by (MemberID, Year) against that entry.  An entry may be one column
    name, or a list/tuple/ndarray of column names that are counted jointly.

    The returned DataFrame has a MultiIndex on MemberID and Year.  Its
    columns are "<field>_<value>" for a single field and
    "<field1>_<field2>_<value tuple>" for a group, with the per-field
    tables outer-joined.  None is returned when fields_counted is empty.
    """
    rows = [data.MemberID, data.Year]
    res = None
    for field in fields_counted:
        is_group = isinstance(field, (list, tuple, np.ndarray))
        # [] column selection and positional crosstab arguments are valid
        # across pandas versions, unlike the removed .ix indexer and the
        # removed rows=/cols= keywords.
        if is_group:
            cols = [data[name] for name in field]
            key = "_".join(field)
        else:
            cols = data[field]
            key = field
        df = pd.crosstab(rows, cols)
        # The field-name prefix keeps columns from different fields
        # distinct when the tables are joined.
        df.columns = [key + '_' + str(value) for value in df.columns]
        res = df if res is None else res.join(df, how="outer")
    return res


# Demo run: read the claim table from the store and widen it on each
# listed field, plus the joint Specialty/PlaceSvc combination.
data = clean.readH5Store("HHP_release3.h5")
claim = data["claim"]
df = widen_on_fields(claim, [
    "Specialty", "PlaceSvc", "LengthOfStay", "PrimaryConditionGroup",
    "CharlsonIndex", "ProcedureGroup", ["Specialty", "PlaceSvc"]
])

# Single-argument print calls produce identical output on Python 2 and
# also parse on Python 3.
print("%d rows" % len(df))
print(df.columns)

#print df[:5]