def get_df(datadir, fname, key):
    """Load one XPT file and return it indexed by ``key`` with renamed columns.

    Args:
        datadir: Directory holding the file; must support ``datadir / fname``
            (e.g. a ``pathlib.Path``).
        fname: File identifier, resolved to a real file name by ``get_fname``.
        key: Column (or list of columns) passed to ``DataFrame.set_index``.

    Returns:
        The DataFrame restricted to the columns listed in the module-level
        ``all_cols`` and renamed through ``input_col_map`` / ``output_col_map``.
    """
    fname = get_fname(fname)
    # Context manager guarantees the handle is closed, even if parsing raises.
    with open(datadir / fname, 'rb') as xpt_file:
        df = xport.to_dataframe(xpt_file)
    df.set_index(key, inplace=True)
    # Drop every column that is not explicitly whitelisted in all_cols.
    df.drop(df.columns.difference(all_cols), axis=1, inplace=True)
    df.rename(columns={**input_col_map, **output_col_map}, inplace=True)
    return df
def merge_xpt(fname):
    """Read one XPT file, or read several and inner-join them on ``SEQN``.

    Args:
        fname: Either a single path, or a list of paths.

    Returns:
        * For a single path: the DataFrame read from that file.
        * For a list: a tuple ``(all_files, merged)`` where ``all_files`` is
          ``np.array`` applied to the list of individual frames and ``merged``
          is the first frame successively merged with each following frame
          on the ``SEQN`` column.

    Raises:
        IndexError: If ``fname`` is an empty list.
    """
    # isinstance also accepts list subclasses, unlike ``type(fname) == list``.
    if isinstance(fname, list):
        frames = []
        for path in fname:
            with open(path, 'rb') as file:
                frames.append(xport.to_dataframe(file))

        # store all datasets in a np.array
        # NOTE(review): np.array over frames of differing shapes may not give
        # the intended container -- confirm what callers expect here.
        all_files = np.array(frames)

        # Fold the remaining frames into the first one without mutating the list.
        merged = frames[0]
        for frame in frames[1:]:
            merged = merged.merge(frame, on=['SEQN'])
        return all_files, merged

    with open(fname, 'rb') as file:
        return xport.to_dataframe(file)
def get_survey_data(year):
    """Download every XPT data file of one survey cycle and store it as CSV.

    For each survey component, links to component pages are extracted from the
    module-level ``html`` text. Every ``XPT``/``xpt`` link on those pages is
    downloaded into ``DATA_PATH/<year>-<year+1>/<component>``, converted with
    ``xport.to_dataframe``, written next to it as ``.csv`` and the downloaded
    XPT file is then deleted.

    Args:
        year: Cycle begin year; used in the URL pattern and, as ``year + 1``,
            in the output directory name (so it must be numeric).

    Returns:
        None. The function works purely through file-system side effects.
    """
    components = [
        'Demographics', 'Dietary', 'Examination', 'Laboratory',
        'Questionnaire', 'LimitedAccess'
    ]
    for component in components:
        survey_urls = re.findall(
            rf'href=\"(.*Component={component}.*CycleBeginYear={year})', html)
        for survey_url in survey_urls:
            # hrefs scraped from raw HTML carry entity-escaped ampersands;
            # the previous replace('&', '&') was a no-op and left them escaped.
            survey_url = survey_url.replace('&amp;', '&')
            survey_url = f'https://wwwn.cdc.gov{survey_url}'
            survey_page = requests.get(survey_url).text
            data_urls = re.findall(rf'href=\"(.*XPT|.*xpt)\"', survey_page)
            for data_url in data_urls:
                data_url = f'https://wwwn.cdc.gov{data_url}'
                r = requests.get(data_url)

                xpt_filename = os.path.basename(data_url)
                filepath = os.path.join(DATA_PATH, f'{year}-{year+1}',
                                        component)
                pathlib.Path(filepath).mkdir(parents=True, exist_ok=True)
                xpt_filepath = os.path.join(filepath, xpt_filename)

                # Persist the download first: the XPT reader works on a file.
                with open(xpt_filepath, 'wb') as f:
                    f.write(r.content)
                with open(xpt_filepath, 'rb') as f:
                    data = xport.to_dataframe(f)

                # splitext swaps only the final extension and always yields a
                # name, so csv_filename can never be unbound or stale.
                csv_filename = os.path.splitext(xpt_filename)[0] + '.csv'
                csv_filepath = os.path.join(filepath, csv_filename)
                print(f'Downloading to {csv_filepath}')
                data.to_csv(csv_filepath, index=False)

                # Only the CSV is kept on disk.
                os.remove(xpt_filepath)
def process_DRIFF(day_num, year):
    """Build per-SEQN indicator columns for the food codes of one recall day.

    Reads ``DR{day_num}IFF_{letter}.XPT`` (letter taken from
    ``year2letter[year]``) out of ``data_dir.format(year)``, keeps ``SEQN``
    and the ``DR{day_num}IFDCD`` code column, and produces one row per
    distinct ``SEQN`` with one float column per distinct code, named
    ``DR{day_num}IFDCD-<code>``. A cell is 1 when that SEQN has at least one
    row with that code, otherwise 0.

    Args:
        day_num: Recall day number inserted into the file and column names.
        year: Key into ``year2letter``; also forwarded to ``fillMissingSeqn``.

    Returns:
        ``(core, core_seqn)``: the indicator frame after ``fillMissingSeqn``
        and the list of SEQN values present before that call.
    """
    # TODO: include time of day, quantize into ~3-5 cats
    code_col = 'DR{}IFDCD'.format(day_num)
    fn = 'DR{}IFF_{}'.format(day_num, year2letter[year])

    file_path = os.path.join(data_dir.format(year), "{}.XPT".format(fn))
    with open(file_path, 'rb') as f:
        dfm = xport.to_dataframe(f)

    # Convert the columns of interest to int so labels format without ".0".
    dfm['SEQN'] = dfm['SEQN'].astype(int)
    dfm[code_col] = dfm[code_col].astype(int)

    # drop redundant data
    df = dropAllBut(dfm, ['SEQN', code_col])

    # make core out of just seqn
    core = df['SEQN'].drop_duplicates(keep='first').to_frame()
    num_seqs = core.shape[0]

    # determine food encodings
    # (builtin int/float: the np.int / np.float aliases no longer exist)
    food_codes = np.sort(df[code_col].unique()).astype(int)
    fc_label_format = code_col + '-{}'
    fc_cols = [fc_label_format.format(fc) for fc in food_codes]
    num_fc = food_codes.size

    # expand core by food code labels
    dfadd = pd.DataFrame(np.zeros((num_seqs, num_fc), dtype=float),
                         columns=fc_cols)
    core = core.reset_index(drop=True)
    core = pd.concat([core, dfadd], axis=1)

    # add the individual food entries to core; a single pass over the rows
    # replaces re-filtering the whole frame once per SEQN
    for seqn, fc in zip(df['SEQN'], df[code_col]):
        core.loc[core['SEQN'] == seqn, fc_label_format.format(fc)] = 1

    # Capture the SEQNs actually present before missing ones are filled in.
    core_seqn = list(core.set_index("SEQN").index)
    core = fillMissingSeqn(core, year)
    return core, core_seqn
def xpt_to_csv(filename, filepath, save_dir):
    """Convert a single XPT file to CSV.

    Args:
        filename: Name of the XPT file inside ``filepath``.
        filepath: Directory that contains the XPT file.
        save_dir: Directory the CSV is written to. Previously this argument
            was ignored; when it is empty or ``None`` the CSV is still written
            next to the source file in ``filepath``.

    Returns:
        None. The CSV is written with pandas' default ``to_csv`` settings
        (index column included).
    """
    source_path = os.path.join(filepath, filename)
    with open(source_path, 'rb') as f:
        df = xport.to_dataframe(f)

    # splitext strips only the final extension, so dots elsewhere in the
    # name no longer truncate the output file name.
    csv_name = os.path.splitext(filename)[0] + '.csv'
    target_dir = save_dir or filepath
    df.to_csv(os.path.join(target_dir, csv_name))
def process_DSTOT_H(day_num):
    """Load ``DS{day_num}TOT_H.XPT`` from ``data_dir`` and post-process it.

    The raw frame is passed through ``dropBetween`` with the bounds
    ``WTDRD1`` and ``DS{day_num}TKCAL`` and then through
    ``fillMissingSeqnAndDropSeqn``; the result of the latter is returned.

    Args:
        day_num: Day number inserted into the file and column names.
    """
    first_col = 'WTDRD1'
    last_col = 'DS{}TKCAL'.format(day_num)

    xpt_name = "{}.XPT".format('DS{}TOT_H'.format(day_num))
    with open(os.path.join(data_dir, xpt_name), 'rb') as handle:
        raw = xport.to_dataframe(handle)

    trimmed = dropBetween(raw, first_col, last_col)
    return fillMissingSeqnAndDropSeqn(trimmed)
def process_DSIDS(day_num, year):
    """Build per-SEQN indicator columns for the supplement ids of one day.

    Reads ``DS{day_num}IDS_{letter}.XPT`` (letter taken from
    ``year2letter[year]``) out of ``data_dir.format(year)``, keeps ``SEQN``
    and ``DSDSUPID``, and produces one row per distinct ``SEQN`` with one
    float column per distinct id, named ``DSDSUPID_2D_{day_num}-<id>``.
    A cell is 1 when that SEQN has at least one row with that id, else 0.

    Args:
        day_num: Day number inserted into the file name and column labels.
        year: Key into ``year2letter``; also forwarded to ``fillMissingSeqn``.

    Returns:
        ``(core, core_seqn)``: the indicator frame after ``fillMissingSeqn``
        and the list of SEQN values present before that call.
    """
    fn = 'DS{}IDS_{}'.format(day_num, year2letter[year])

    file_path = os.path.join(data_dir.format(year), "{}.XPT".format(fn))
    with open(file_path, 'rb') as f:
        dfm = xport.to_dataframe(f)

    # convert columns of interest to int so labels format without ".0"
    dfm['SEQN'] = dfm['SEQN'].astype(int)
    dfm['DSDSUPID'] = dfm['DSDSUPID'].astype(int)

    # drop redundant data
    df = dropAllBut(dfm, ['SEQN', 'DSDSUPID'])

    # make core out of just seqn
    core = df['SEQN'].drop_duplicates(keep='first').to_frame()
    num_seqs = core.shape[0]

    # determine supplement encodings
    # (builtin int/float: the np.int / np.float aliases no longer exist)
    sppl_codes = np.sort(df['DSDSUPID'].unique()).astype(int)
    sc_label_format = 'DSDSUPID_2D_{}-{}'
    sc_cols = [sc_label_format.format(day_num, sc) for sc in sppl_codes]
    num_sc = sppl_codes.size

    # expand core by supplement code labels
    dfadd = pd.DataFrame(np.zeros((num_seqs, num_sc), dtype=float),
                         columns=sc_cols)
    core = core.reset_index(drop=True)
    core = pd.concat([core, dfadd], axis=1)

    # add the individual entries to core; a single pass over the rows
    # replaces re-filtering the whole frame once per SEQN
    for seqn, sc in zip(df['SEQN'], df['DSDSUPID']):
        core.loc[core['SEQN'] == seqn, sc_label_format.format(day_num, sc)] = 1

    # Capture the SEQNs actually present before missing ones are filled in.
    core_seqn = list(core.set_index("SEQN").index)
    core = fillMissingSeqn(core, year)
    return core, core_seqn
def process_DSTOT(day_num, year):
    """Load ``DS{day_num}TOT_{letter}.XPT`` for ``year`` and post-process it.

    The file is read from ``data_dir.format(year)``, passed through
    ``dropBetween`` with the bounds ``WTDRD1`` and ``DS{day_num}TKCAL``, and
    finally through ``fillMissingSeqn``.

    Args:
        day_num: Day number inserted into the file and column names.
        year: Key into ``year2letter``; also forwarded to ``fillMissingSeqn``.

    Returns:
        ``(df, df_seqn)``: the frame after ``fillMissingSeqn`` and the list of
        SEQN values present before that call.
    """
    xpt_name = "{}.XPT".format(
        'DS{}TOT_{}'.format(day_num, year2letter[year]))
    with open(os.path.join(data_dir.format(year), xpt_name), 'rb') as handle:
        raw = xport.to_dataframe(handle)

    trimmed = dropBetween(raw, 'WTDRD1', 'DS{}TKCAL'.format(day_num))

    # Record the SEQNs before fillMissingSeqn gets to touch the frame.
    seqn_values = list(trimmed.set_index("SEQN").index)
    filled = fillMissingSeqn(trimmed, year)
    return filled, seqn_values
def process_DSQTOT_H():
    """Load ``DSQTOT_H.XPT`` from ``data_dir`` and post-process it.

    ``SEQN`` is cast to int, then the frame goes through ``dropBetween`` with
    the bounds ``DSDCOUNT`` and ``DSQTKCAL`` and through
    ``fillMissingSeqnAndDropSeqn``, whose result is returned.
    """
    xpt_path = os.path.join(data_dir, "{}.XPT".format('DSQTOT_H'))
    with open(xpt_path, 'rb') as handle:
        frame = xport.to_dataframe(handle)

    # SEQN is converted in place on the freshly loaded frame.
    frame['SEQN'] = frame['SEQN'].astype(int)

    trimmed = dropBetween(frame, 'DSDCOUNT', 'DSQTKCAL')
    return fillMissingSeqnAndDropSeqn(trimmed)
def process_DSQIDS_H():
    """Build per-SEQN indicator columns for the supplement ids in DSQIDS_H.

    Reads ``DSQIDS_H.XPT`` from ``data_dir``, keeps ``SEQN`` and
    ``DSDSUPID``, and produces one row per distinct ``SEQN`` with one float
    column per distinct id, named ``DSDSUPID_30D-<id>``. A cell is 1 when
    that SEQN has at least one row with that id, otherwise 0.

    Returns:
        The indicator frame after ``fillMissingSeqnAndDropSeqn``.
    """
    fn = 'DSQIDS_H'

    file_path = os.path.join(data_dir, "{}.XPT".format(fn))
    with open(file_path, 'rb') as f:
        dfm = xport.to_dataframe(f)

    # convert columns of interest to int so labels format without ".0"
    dfm['SEQN'] = dfm['SEQN'].astype(int)
    dfm['DSDSUPID'] = dfm['DSDSUPID'].astype(int)

    # drop redundant data
    df = dropAllBut(dfm, ['SEQN', 'DSDSUPID'])

    # make core out of just seqn
    core = df['SEQN'].drop_duplicates(keep='first').to_frame()
    num_seqs = core.shape[0]

    # determine supplement encodings
    # (builtin int/float: the np.int / np.float aliases no longer exist)
    sppl_codes = np.sort(df['DSDSUPID'].unique()).astype(int)
    sc_label_format = 'DSDSUPID_30D-{}'
    sc_cols = [sc_label_format.format(sc) for sc in sppl_codes]
    num_sc = sppl_codes.size

    # expand core by supplement code labels
    dfadd = pd.DataFrame(np.zeros((num_seqs, num_sc), dtype=float),
                         columns=sc_cols)
    core = core.reset_index(drop=True)
    core = pd.concat([core, dfadd], axis=1)

    # add the individual entries to core; a single pass over the rows
    # replaces re-filtering the whole frame once per SEQN
    for seqn, sc in zip(df['SEQN'], df['DSDSUPID']):
        core.loc[core['SEQN'] == seqn, sc_label_format.format(sc)] = 1

    core = fillMissingSeqnAndDropSeqn(core)
    return core
def process_DSQTOT(year):
    """Load ``DSQTOT_{letter}.XPT`` for ``year`` and post-process it.

    The file is read from ``data_dir.format(year)``; ``SEQN`` is cast to int,
    the frame goes through ``dropBetween`` with the bounds ``DSDCOUNT`` and
    ``DSQTKCAL`` and finally through ``fillMissingSeqn``.

    Args:
        year: Key into ``year2letter``; also forwarded to ``fillMissingSeqn``.

    Returns:
        ``(df, df_seqn)``: the frame after ``fillMissingSeqn`` and the list of
        SEQN values present before that call.
    """
    xpt_name = "{}.XPT".format('DSQTOT_{}'.format(year2letter[year]))
    with open(os.path.join(data_dir.format(year), xpt_name), 'rb') as handle:
        frame = xport.to_dataframe(handle)

    # SEQN is converted in place on the freshly loaded frame.
    frame['SEQN'] = frame['SEQN'].astype(int)
    trimmed = dropBetween(frame, 'DSDCOUNT', 'DSQTKCAL')

    # Record the SEQNs before fillMissingSeqn gets to touch the frame.
    seqn_values = list(trimmed.set_index("SEQN").index)
    filled = fillMissingSeqn(trimmed, year)
    return filled, seqn_values
def xptodf(i):
    """Open the file at path ``i`` in binary mode and return ``xp.to_dataframe`` of it."""
    with open(i, 'rb') as xpt_file:
        frame = xp.to_dataframe(xpt_file)
    return frame
# -*- coding: utf-8 -*- """ Created on Tue Dec 11 17:03:48 2018 @author: qiang """ import numpy as np import xport import pandas as pd # import the dataset path = r"C:\Users\qiang\Desktop\2018 fall\5825\homework\project\LLCP2017.XPT" with open(path, 'rb') as f: df = xport.to_dataframe(f) # select BMI5 from dataset data_BMI = df['_BMI5'] #fill the NaN data with mean newdata_BMI = data_BMI.fillna(int(data_BMI.mean())) # select WTKG3 from dataset data_WTK = df['WTKG3'] #fill the NaN data with mean newdata_WTK = data_WTK.fillna(int(data_WTK.mean())) # create new dataset dataset = pd.DataFrame() dataset['WTKG'] = newdata_WTK # add WTK # create class label according BMI
#80: --------------------------------------------------------------------------- # Set up: ---------------------------------------------------------------------- import xport import pandas as pd import numpy as np import seaborn as sns import random import matplotlib.pyplot as plt from scipy.stats import t from statsmodels.formula.api import ols # Read in the data: ------------------------------------------------------------ ## 'rb' mode - opens the file in binary format for reading with open('HSQ_D.XPT', 'rb') as f: df_health = xport.to_dataframe(f) with open('ALQ_D.XPT', 'rb') as f: df_alcohol = xport.to_dataframe(f) with open('DEMO_D.XPT', 'rb') as f: df_demo = xport.to_dataframe(f) # Data preparation: ------------------------------------------------------------ # Extract key columns df_health = df_health.loc[df_health['HSD010'] <= 3, ['SEQN', 'HSD010']] df_alcohol = df_alcohol.loc[df_alcohol['ALQ120Q'] <= 365, ['SEQN', 'ALQ120Q']] df_demo = df_demo.loc[(df_demo.RIDAGEYR >= 21) & (df_demo.DMDEDUC2 <= 5), ['SEQN', 'RIAGENDR', 'RIDAGEYR', 'INDFMPIR', 'DMDEDUC2']] # Merge key columns into one data frame
def test_to_dataframe(self, library, library_bytestring):
    """``xport.to_dataframe`` warns as deprecated and yields the library's first dataset."""
    expected = next(iter(library.values()))
    stream = BytesIO(library_bytestring)
    # The call must emit a DeprecationWarning, otherwise pytest.warns fails.
    with pytest.warns(DeprecationWarning):
        actual = xport.to_dataframe(stream)
    # Element-wise comparison reduced over both axes to a single boolean.
    matches = actual == expected
    assert matches.all(axis=None)