Beispiel #1
0
def findsimilarities(dataframe, checkvars, orig_dataframe):
    """Summarise each variable in ``checkvars`` for a dataframe of outliers.

    Both ``dataframe`` (outliers, as produced by identifyoutliers()) and
    ``orig_dataframe`` are first passed through cleanallerrs(). For every
    name in ``checkvars`` a dict of descriptive statistics of that column
    in the cleaned outlier frame is built, including the difference
    between the column mean and ``samplemean(cleaned original, var)``.

    Returns a list with one dict per variable, sorted in descending order
    of the 'normabsmeandif' entry (absolute mean difference divided by
    the outlier mean).
    """
    outliers = cleanallerrs(dataframe)
    baseline = cleanallerrs(orig_dataframe)

    summaries = []
    for var in checkvars:
        column = outliers[var]

        # Location and spread of the column.
        mean = column.mean()
        std = column.std()
        median = column.median()

        # Count of values strictly between 0.9x and 1.1x the median.
        near_median = column[(column > median * 0.9)
                             & (column < median * 1.1)]

        # Offset of the outlier mean from samplemean() of the cleaned
        # original frame (presumably the original sample's mean of
        # ``var`` -- verify against samplemean()).
        meandif = mean - samplemean(baseline, var)
        absmeandif = abs(meandif)

        # Missing-value bookkeeping.
        nan_count = column.isnull().sum()
        size = len(column)

        # NOTE(review): every "norm*" / "percrange" figure divides by the
        # column mean; a zero or NaN mean propagates inf/NaN into the
        # sort key used below -- confirm this is acceptable.
        summaries.append({
            'name': var,
            'mean': mean,
            'std': std,
            'normstd': std / mean,
            'percrange': column.quantile(.8) / mean
                         - column.quantile(.2) / mean,
            'min': column.min(),
            'max': column.max(),
            'median': median,
            'medianrange': len(near_median),
            'meandif': meandif,
            'absmeandif': absmeandif,
            'normabsmeandif': absmeandif / mean,
            'NaNs': nan_count,
            'nonNaNs': size - nan_count,
            'size': size
        })

    # Largest normalised mean difference first.
    summaries.sort(key=itemgetter('normabsmeandif'), reverse=True)
    return summaries
Beispiel #2
0
def main():
    """Print value counts for the ERCCC1X, ERCCC2X and ERCCC3X columns.

    Loads the 'h152e.pkl' dataset, passes its dataframe through
    cleanallerrs(), and prints one frequency table per column.
    """
    dataset = DataSet('h152e.pkl')
    df = cleanallerrs(dataset.df)

    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 statement or a Python 3 function call; the bare
    # print statement is a syntax error on Python 3.
    for column in ('ERCCC1X', 'ERCCC2X', 'ERCCC3X'):
        print(df[column].value_counts())
Beispiel #3
0
def main():
    """Print ERCCC*X value counts, then plot ERTC12X where ERCCC1X == 14.

    Loads the 'h152e.pkl' dataset, passes its dataframe through
    cleanallerrs(), prints the frequency tables of ERCCC1X, ERCCC2X and
    ERCCC3X, plots the ERTC12X column of the rows whose ERCCC1X equals 14,
    prints how many such rows there are, and shows the figure.
    """
    dataset = DataSet('h152e.pkl')
    df = cleanallerrs(dataset.df)

    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 statement or a Python 3 function call; the bare
    # print statement is a syntax error on Python 3.
    for column in ('ERCCC1X', 'ERCCC2X', 'ERCCC3X'):
        print(df[column].value_counts())

    # Restrict to the rows coded 14 in ERCCC1X.
    colc = df[df['ERCCC1X'] == 14]
    colc.ERTC12X.plot()
    print(len(colc))
    plt.show()
Beispiel #4
0
def main():
    """Call vargraph() for ARTHDX vs. TOTEXP12 on rows with CHOLDX == 1.

    Loads the 'h155.pkl' dataset, passes its dataframe through
    cleanallerrs(), keeps only the rows whose CHOLDX equals 1 and hands
    the result to vargraph() with categorical and log options enabled.
    """
    dataset = DataSet('h155.pkl')
    cleaned = cleanallerrs(dataset.df)
    subset = cleaned[cleaned['CHOLDX'] == 1]

    # NOTE(review): the category labels are taken from IPNGTD12 although
    # the variable passed to vargraph() is ARTHDX -- confirm this is
    # intended.
    labels = dataset.IPNGTD12.responses

    vargraph(
        subset,
        'ARTHDX',
        'TOTEXP12',
        categorical=True,
        catlabels=labels,
        log=True,
        condition=False,
    )
Beispiel #5
0
    df['LOGTOTEXP'].dropna()
    return df


def addcategcopy(df, varlist):
    """Append a 'C(<name>)' entry for each low-cardinality variable.

    For every name in ``varlist`` whose column in ``df`` has fewer than 10
    distinct entries according to ``value_counts()``, a string of the form
    'C(name)' is added to the end of ``varlist``.

    ``varlist`` is extended in place and the same list object is returned.
    """
    # Build the additions first so the list is not modified while it is
    # being iterated.
    categorical_terms = [
        'C(%s)' % name
        for name in varlist
        if len(df[name].value_counts()) < 10
    ]
    varlist.extend(categorical_terms)
    return varlist


# Load the h155 dataset, pass it through cleanallerrs() and
# addcustomvars(), then keep only the rows where CHOLDX equals 1.
H155 = DataSet('h155.pkl')
df = addcustomvars(cleanallerrs(H155.df))
df = df[df['CHOLDX'] == 1]

# Debugging aid (disabled): print getattr(H155, 'CHOLDX').responses

# Variable, coefficient, p-value removed from "good" list
dismissed = []

# Set Custom Variables #

# Model Parameters #
dependent = 'LOGTOTEXP'
independents = [
    'C(ACTLIM31)',
    'C(AIDHLP31)',
    'AGE12X',
    'C(ARTHDX)',
    'C(BADHLTH)',
    'BMINDX53',
    'C(CANCERDX)',
    'C(COGLIM31)',
    'C(DIABDX)',
    'C(INS12X)',
    'C(WRGLAS42)',
]
def main():
    """Print the result of guessrelationship() for BMINDX53 vs. TOTEXP12.

    Loads the 'h155.pkl' dataset, passes its dataframe through
    cleanallerrs(), and prints whatever guessrelationship() returns for
    the two columns.
    """
    dataset = DataSet('h155.pkl')
    df = cleanallerrs(dataset.df)

    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 statement or a Python 3 function call; the bare
    # print statement is a syntax error on Python 3.
    print(guessrelationship(df, 'BMINDX53', 'TOTEXP12'))
Beispiel #7
0
    'C(RESPECT)', 'C(ENUFTIME)', 'C(FEWDENTCHK)', 'C(NOPHONE)',
    'C(NOAFTERHRS)', 'C(NOFLUSHT)'
]

# Extend prov_mod with a further set of categorical terms.
all_prov_mod = prov_mod + [
    'C(FEWCHECK)', 'C(PSAYR)', 'C(PAPYR)',
    'C(BRSTYR)', 'C(MAMMOYR)', 'C(STOOLYR)',
    'C(COLONOSYR)', 'C(SIGMOIDYR)', 'C(SEATBELT)',
    'C(HARDTOGET)', 'C(PROBLEMDNT)', 'C(NOLISTEN)',
    'C(LITTLECARE)', 'C(FEWAPPT)', 'C(NOTEASY)',
    'C(NOEXPLAIN)', 'C(NODRRESPCT)', 'C(NOINSTRUC)',
    'C(NOUNDERST)', 'C(NODRDESC)', 'C(NOFORMHELP)',
    'C(NOEZREF)', 'C(NOASKTREAT)', 'C(NOEXPLOPT)',
]

# Load the h155 dataset and pass its dataframe through cleanallerrs().
h155 = DataSet('h155.pkl')
df = cleanallerrs(h155.df)

# Modified model variables: boolean flag derived from a raw column.
df['BADHLTH'] = (df['RTHLTH31'] >= 4)

# Core healthcare quality / preventative care variables: each flag is a
# boolean column obtained by comparing one raw column with a fixed code.
df['PROBLEM'] = (df['MDUNPR42'] == 1)
df['CHOLCKYR'] = (df['CHOLCK53'] == 1)
df['BPCHKYR'] = (df['BPCHEK53'] == 1)
df['GOODHC'] = (df['ADHECR42'] >= 8)
df['NODECIDE'] = (df['DECIDE42'] == 1)
df['RESPECT'] = (df['RESPCT42'] == 4)
df['ENUFTIME'] = (df['ADPRTM42'] >= 3)
df['FEWDENTCHK'] = (df['DENTCK53'] >= 3)
df['NOPHONE'] = (df['PHNREG42'] <= 2)
df['NOAFTERHRS'] = (df['AFTHOU42'] <= 2)
Beispiel #8
0
def main():
    """Print the value counts of the RXNAME column.

    Loads the 'h152a.pkl' dataset, passes its dataframe through
    cleanallerrs(), and prints the frequency table of RXNAME.
    """
    dataset = DataSet('h152a.pkl')
    df = cleanallerrs(dataset.df)

    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 statement or a Python 3 function call; the bare
    # print statement is a syntax error on Python 3.
    print(df.RXNAME.value_counts())