Example 1
store_space = config['dirs']['store']['ksads']

# connect to Box
box = LifespanBox(cache=ksads_cache_path, config_file=config['box'])
redcap = Redcap(config['redcap']['config'])
assessments = config['Assessments']
sites = config['Sites']
# verbose = False

# snapshot folder (used to be the combined folder)
ksads_snapshotfolderid = config['ksads_snapshotfolderid']
snapshotQCfolder = config['snapshotQCfolder']

# download one of the identical key files which contain the labels for
# all the numbered questions in KSADS
cachekeyfile = box.downloadFile(config['cachekeyfile'])
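
# The key file maps the numbered KSADS questions to their text labels.
# A minimal sketch of one way it might be used, assuming the file is a
# CSV whose columns are named 'question' and 'label' (both column names
# are assumptions, not the file's confirmed headers):
keylabels = pd.read_csv(cachekeyfile, header=0, encoding='ISO-8859-1')
question2label = dict(zip(keylabels['question'], keylabels['label']))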


def main():
    for item in assessments:
        # Download the latest clean (Box) files for each site and make sure
        # they exist; raise a warning if they don't.
        # Before running this program, the monthly update process is to
        # download data (3 files per site: intro, screener, supplement) from
        # KSADS.net, append rows to each site/assessment file (by hand, until
        # this program is extended to automate it), and incorporate any errors
        # from the wiki. Look for any missing rows, but check the Redcap visit
        # summary to make sure they are not already known to be missing.
        # Then save the updated xls files with a new date, navigate to each
        # file in Box, and select 'upload new version.' This allows Box to
        # track versioning until a better system is in place.
        site_files = assessments[item]['cleanest_start']
        rows = get_all_rows(site_files)

        # create snapshot of the combined file (all four sites together);
        # store snapshot in 'store' and in 'snapshots' under the all-sites
        # directory in Box.
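
# get_all_rows, called in main() above, is defined elsewhere in this
# module. A hypothetical sketch of the behavior implied by the comments,
# assuming site_files maps site names to Box file ids (this is a
# reconstruction, not the repository's actual implementation):
def get_all_rows_sketch(site_files):
    frames = []
    for site, fileid in site_files.items():
        path = box.downloadFile(fileid)   # pull the site's cleanest file
        sitedata = pd.read_excel(path, header=0)
        sitedata['site'] = site           # tag each row with its site
        frames.append(sitedata)
    return pd.concat(frames, ignore_index=True)
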
Example 2
moredata = moredata.drop(columns='study')
moredata = moredata.rename(columns={'studyslim': 'study'})

# get the list of files on the endpoint machine so they can be separated
# out by type. Note that this dir has been whittled down to the files with
# 2019 in their file names to keep it smaller - rsync again to get everything.
files2cat = pd.DataFrame(os.listdir(cache_space), columns=['fname'])
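
# The comment above says this listing is used to separate files out by
# type. A minimal sketch of that split, assuming raw and scored exports
# can be told apart by substrings in their file names (the 'Raw' and
# 'Score' patterns are assumptions):
rawfiles = files2cat.loc[files2cat.fname.str.contains('Raw', case=False)]
scorefiles = files2cat.loc[files2cat.fname.str.contains('Score', case=False)]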

# NOW Assemble Complete Raw and Scores datasets ##########################
##########################################################################
# BEGIN with Scores Data
# STEP 1: get cleanest data so far and merge with Redcap to get site and
# study info.  Replace UMN ids, where applicable, and correct any other
# known (historical) issues.
cleanest_scores = 476857277893  # Box file id of the cleanest scores file
scores_path = box.downloadFile(cleanest_scores)
cleanestscores = pd.read_csv(scores_path,
                             header=0,
                             low_memory=False,
                             encoding='ISO-8859-1')
# Extend first occurrence of date finished for a given PIN (if needed)
# to records without date, then pull in redcap vars and merge with catscoresnew
cleanestscores = extend_date(cleanestscores.drop(columns='FirstDate4PIN'),
                             'DateFinished').copy()
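
# extend_date is defined elsewhere in this module. A hypothetical sketch
# of the behavior described above - broadcast the first non-missing date
# for each PIN to that PIN's records that lack one (a reconstruction, not
# the repository's actual implementation):
def extend_date_sketch(df, datevar):
    firstdate = df.groupby('PIN')[datevar].transform('first')
    df[datevar] = df[datevar].fillna(firstdate)
    return df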

# replace the bad UMN ids before merging with redcap - otherwise the
# merge won't find a match
replacements = pd.read_csv(os.path.join(store_space,
                                        'UMN_Subject_ID_replacement_list.csv'),
                           header=0)
IDmap = dict(zip(list(replacements.Old_ID), list(replacements.New_ID)))
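
# With the map built, the bad ids can be swapped in place before the
# Redcap merge. A minimal sketch, assuming the subject identifier column
# is named 'PIN' (the column name is an assumption):
cleanestscores['PIN'] = cleanestscores['PIN'].replace(IDmap)
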
Example 3
cleanestdata = 495490047901  # Box file id of the cleanest cumulative database

# The coordinator's monthly update process is to run eprime_getraw.py to 'download' all of the individual records from the Box
# UCLA and WU upload folders for individual subjects. The python program converts the text files in these folders into rows
# of data for a given subject; it is the coordinator's role to check for new rows. The eprime_getraw program appends new data
# to the ProcessedBoxFiles_AllRawData_Eprime.csv file under snapshots/ePrimeDD/raw_allfiles_in_box.
# Note that this Box file is also synced with /home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/eprime/ProcessedBoxFiles_AllRawData_Eprime.csv
# File ids in the store get rounded and converted when saved to Box, so if you need file ids, grab them from the store.
# After running eprime_getraw.py, open the current (and cumulatively cleaned) 'database' under BDAS/
# along with this ProcessedBoxFiles_AllRawData file, and append any new rows to the cleaned database by hand;
# at this time, incorporate any errors from the wiki and/or the HCA data checklist.
# Then save the updated file with a new date, navigate to the file in Box, and select 'upload new version.'
# This allows Box to track versioning until a better system is in place.
# Once you've updated the Allsites database, you're ready to begin QC.
# Remember to check the visit summary for information pertaining to missing rows.
basecachefile = box.downloadFile(cleanestdata)
baseclean = pd.read_csv(basecachefile, header=0, low_memory=False)

studyids = redcap.getredcapids()
studydata = redcap.getredcapdata()
# create a snapshot of the combined file; store the snapshot in 'store'
# and in 'snapshots' under the all-sites directory in Box.
snap = 'Eprime_Snapshot_' + snapshotdate + '.csv'
snapshotfile = os.path.join(store_space, snap)
QCfile = os.path.join(cache_space, 'QC_' + snap)
# write to csv in store
baseclean.to_csv(snapshotfile, index=False)
# upload the snapshot into box
box.upload_file(snapshotfile, e_snapshotfolderid)
# keep only rows not flagged for exclusion
baseclean = baseclean.loc[baseclean.exclude == 0]
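
# QCfile, defined above, is presumably where rows needing review are
# written. A minimal sketch of one such check, assuming both frames carry
# a 'subject' column and that rows absent from Redcap should be flagged
# (the column name and the rule are assumptions):
notinredcap = baseclean.loc[~baseclean.subject.isin(studyids.subject)]
notinredcap.to_csv(QCfile, index=False)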