def findsimilarities(dataframe, checkvars, orig_dataframe):
    """Describe each checked variable of an outlier frame and rank the results.

    Both frames are first passed through ``cleanallerrs()``.  For every name
    in *checkvars*, the matching column of the cleaned outlier frame is
    summarised and its mean is compared with
    ``samplemean(clean_orig, var)``.

    Args:
        dataframe: Frame of outliers (from ``identifyoutliers()``).
        checkvars: Iterable of column names to be checked.
        orig_dataframe: Reference frame; its cleaned version is handed to
            ``samplemean()`` to obtain the mean each variable is compared to.

    Returns:
        A list with one dict per variable, holding the keys 'name', 'mean',
        'std', 'normstd' (std / mean), 'percrange' (80th minus 20th
        percentile, each divided by the mean), 'min', 'max', 'median',
        'medianrange' (number of values strictly within +/-10% of the
        median), 'meandif', 'absmeandif', 'normabsmeandif'
        (absmeandif / mean), 'NaNs', 'nonNaNs' and 'size'.  The list is
        ordered by 'normabsmeandif', largest first; entries whose
        'normabsmeandif' is NaN are placed at the end.
    """
    import math  # local import: only needed for the NaN-aware sort key

    cleanframe = cleanallerrs(dataframe)
    clean_orig = cleanallerrs(orig_dataframe)

    reslist = []
    for var in checkvars:
        series = cleanframe[var]
        mean = series.mean()  # computed once and reused for every ratio
        std = series.std()
        median = series.median()

        # Values strictly inside the (0.9 * median, 1.1 * median) window.
        near_median = (series < median * 1.1) & (series > median * 0.9)

        meandif = mean - samplemean(clean_orig, var)
        absmeandif = abs(meandif)

        # Number of NaNs:
        nan_count = series.isnull().sum()
        size = len(series)

        reslist.append({
            'name': var,
            'mean': mean,
            'std': std,
            'normstd': std / mean,
            'percrange': series.quantile(.8) / mean
                         - series.quantile(.2) / mean,
            'min': series.min(),
            'max': series.max(),
            'median': median,
            'medianrange': len(series[near_median]),
            'meandif': meandif,
            'absmeandif': absmeandif,
            'normabsmeandif': absmeandif / mean,
            'NaNs': nan_count,
            'nonNaNs': size - nan_count,
            'size': size
        })

    def sort_key(info):
        # NaN compares False against everything, which makes a plain sort on
        # the raw value order-dependent.  Ranking on (is-a-number, value)
        # keeps real values in descending order and pushes NaNs to the end.
        value = info['normabsmeandif']
        return (not math.isnan(value), value)

    return sorted(reslist, key=sort_key, reverse=True)
def main():
    """Print the value counts of the ERCCC1X, ERCCC2X and ERCCC3X columns.

    Loads the 'h152e.pkl' data set, passes its frame through
    ``cleanallerrs()`` and prints ``value_counts()`` for each of the three
    columns in turn.
    """
    H152a = DataSet('h152e.pkl')
    df = cleanallerrs(H152a.df)
    # A parenthesised single-argument print produces the same output as the
    # old print statement on Python 2 and is valid syntax on Python 3.
    for column in ('ERCCC1X', 'ERCCC2X', 'ERCCC3X'):
        print(df[column].value_counts())
def main():
    """Print ERCCC*X value counts, then plot ERTC12X where ERCCC1X is 14.

    Loads the 'h152e.pkl' data set and passes its frame through
    ``cleanallerrs()``.  After printing ``value_counts()`` for ERCCC1X,
    ERCCC2X and ERCCC3X, it selects the rows whose ERCCC1X equals 14, plots
    their ERTC12X column, prints how many rows were selected and shows the
    figure.
    """
    H152a = DataSet('h152e.pkl')
    df = cleanallerrs(H152a.df)
    # A parenthesised single-argument print produces the same output as the
    # old print statement on Python 2 and is valid syntax on Python 3.
    for column in ('ERCCC1X', 'ERCCC2X', 'ERCCC3X'):
        print(df[column].value_counts())

    colc = df[df['ERCCC1X'] == 14]
    colc.ERTC12X.plot()
    print(len(colc))
    plt.show()
def main():
    """Graph TOTEXP12 against ARTHDX for the rows where CHOLDX equals 1.

    Loads the 'h155.pkl' data set, cleans its frame with ``cleanallerrs()``,
    keeps only the rows with CHOLDX == 1 and hands them to ``vargraph()`` as
    a categorical, log-scaled plot.
    """
    dataset = DataSet('h155.pkl')
    cleaned = cleanallerrs(dataset.df)
    subset = cleaned[cleaned['CHOLDX'] == 1]

    # NOTE(review): the category labels come from IPNGTD12 although the
    # variable being plotted is ARTHDX -- confirm this is intentional.
    labels = dataset.IPNGTD12.responses

    vargraph(
        subset,
        'ARTHDX',
        'TOTEXP12',
        categorical=True,
        catlabels=labels,
        log=True,
        condition=False,
    )
# The next two statements are the tail of a function whose header lies above
# this excerpt; they are kept exactly as they were.
# NOTE(review): the result of dropna() is not assigned to anything, so this
# statement appears to leave df unchanged -- confirm whether that is intended.
    df['LOGTOTEXP'].dropna()
    return df


def addcategcopy(df, varlist):
    """Add a 'C(<name>)' entry for every low-cardinality variable in varlist.

    A variable qualifies when ``df[name].value_counts()`` has fewer than 10
    entries.  The new 'C(...)' strings are appended after the existing names;
    *varlist* is extended in place and the same list object is returned.
    """
    categlist = []
    for varb in varlist:
        if len(df[varb].value_counts()) < 10:
            # Make categorical copy
            categlist.append('C(%s)' % varb)
    # Extend only after the loop so the list is not modified while iterating.
    varlist.extend(categlist)
    return varlist


# Load the data set, clean it, add the custom variables and keep only the
# rows where CHOLDX equals 1.
H155 = DataSet('h155.pkl')
df = addcustomvars(cleanallerrs(H155.df))
df = df[df.CHOLDX == 1]
# print getattr(H155, 'CHOLDX').responses
dismissed = []  # Variable, coefficient, p-value removed from "good" list

# Set Custom Variables #

# Model Parameters #
# Name of the dependent variable and the list of independent terms; the
# 'C(...)' spelling is the same one addcategcopy() generates.
dependent = 'LOGTOTEXP'
independents = [
    'C(ACTLIM31)',
    'C(AIDHLP31)',
    'AGE12X',
    'C(ARTHDX)',
    'C(BADHLTH)',
    'BMINDX53',
    'C(CANCERDX)',
    'C(COGLIM31)',
    'C(DIABDX)',
    'C(INS12X)',
    'C(WRGLAS42)'
]
def main():
    """Print the result of guessrelationship() for BMINDX53 vs. TOTEXP12.

    Loads the 'h155.pkl' data set, passes its frame through
    ``cleanallerrs()`` and prints whatever
    ``guessrelationship(df, 'BMINDX53', 'TOTEXP12')`` returns.
    """
    H155 = DataSet('h155.pkl')
    df = cleanallerrs(H155.df)
    # A parenthesised single-argument print produces the same output as the
    # old print statement on Python 2 and is valid syntax on Python 3.
    print(guessrelationship(df, 'BMINDX53', 'TOTEXP12'))
# (continuation of a list literal that is opened above this excerpt)
    'C(RESPECT)', 'C(ENUFTIME)', 'C(FEWDENTCHK)', 'C(NOPHONE)',
    'C(NOAFTERHRS)', 'C(NOFLUSHT)'
]

# Extended term list: everything in prov_mod plus the additional categorical
# terms below.
all_prov_mod = prov_mod + [
    'C(FEWCHECK)', 'C(PSAYR)', 'C(PAPYR)', 'C(BRSTYR)', 'C(MAMMOYR)',
    'C(STOOLYR)', 'C(COLONOSYR)', 'C(SIGMOIDYR)', 'C(SEATBELT)',
    'C(HARDTOGET)', 'C(PROBLEMDNT)', 'C(NOLISTEN)', 'C(LITTLECARE)',
    'C(FEWAPPT)', 'C(NOTEASY)', 'C(NOEXPLAIN)', 'C(NODRRESPCT)',
    'C(NOINSTRUC)', 'C(NOUNDERST)', 'C(NODRDESC)', 'C(NOFORMHELP)',
    'C(NOEZREF)', 'C(NOASKTREAT)', 'C(NOEXPLOPT)'
]

# Load the 'h155.pkl' data set and clean its frame.
h155 = DataSet('h155.pkl')
df = h155.df
df = cleanallerrs(df)

# Modified model variables.
# Each assignment below stores the element-wise result of comparing a source
# column with a fixed code as a new column.
df['BADHLTH'] = df['RTHLTH31'] >= 4

# Core healthcare quality / preventative care variables.
df['PROBLEM'] = df['MDUNPR42'] == 1
df['CHOLCKYR'] = df['CHOLCK53'] == 1
df['BPCHKYR'] = df['BPCHEK53'] == 1
df['GOODHC'] = df['ADHECR42'] >= 8
df['NODECIDE'] = df['DECIDE42'] == 1
df['RESPECT'] = df['RESPCT42'] == 4
df['ENUFTIME'] = df['ADPRTM42'] >= 3
df['FEWDENTCHK'] = df['DENTCK53'] >= 3
df['NOPHONE'] = df['PHNREG42'] <= 2
df['NOAFTERHRS'] = df['AFTHOU42'] <= 2
def main():
    """Print the value counts of the RXNAME column of the cleaned data.

    Loads the 'h152a.pkl' data set, passes its frame through
    ``cleanallerrs()`` and prints ``value_counts()`` of RXNAME.
    """
    H152a = DataSet('h152a.pkl')
    df = cleanallerrs(H152a.df)
    # A parenthesised single-argument print produces the same output as the
    # old print statement on Python 2 and is valid syntax on Python 3.
    print(df['RXNAME'].value_counts())