# Split the curated table by study and by site, then store one CSV per split
# and upload it to its Box folder.  HCD parents who are also HCA subjects are
# listed under BOTH studies (under their own HCA ID), so the file names
# distinguish "HCAorBoth" from "HCDonly".
HCAorBoth = curatedwithcatreordered.loc[~(curatedwithcatreordered.study == 'HCD')]
HCDonly = curatedwithcatreordered.loc[curatedwithcatreordered.study == 'HCD']

# WashU (site '4'): HCA-or-both rows.
WashU_HCAorBoth = HCAorBoth.loc[HCAorBoth.site == '4'].copy()
wu_hca_name = 'WashU_HCAorBoth_Toolbox_Scored_Combined_' + snapshotdate + '.csv'
WashU_HCAorBoth_storefile = os.path.join(store_space, wu_hca_name)
WashU_HCAorBoth.to_csv(WashU_HCAorBoth_storefile, index=False)
box.upload_file(WashU_HCAorBoth_storefile, 82804729845)

# WashU (site '4'): HCD-only rows.
WashU_HCDonly = HCDonly.loc[HCDonly.site == '4'].copy()
wu_hcd_name = 'WashU_HCDonly_Toolbox_Scored_Combined_' + snapshotdate + '.csv'
WashU_HCDonly_storefile = os.path.join(store_space, wu_hcd_name)
WashU_HCDonly.to_csv(WashU_HCDonly_storefile, index=False)
box.upload_file(WashU_HCDonly_storefile, 82804015457)

# UCLA (site '2'): HCA-or-both rows.
UCLA_HCAorBoth = HCAorBoth.loc[HCAorBoth.site == '2'].copy()
ucla_hca_name = 'UCLA_HCAorBoth_Toolbox_Scored_Combined_' + snapshotdate + '.csv'
UCLA_HCAorBoth_storefile = os.path.join(store_space, ucla_hca_name)
UCLA_HCAorBoth.to_csv(UCLA_HCAorBoth_storefile, index=False)
box.upload_file(UCLA_HCAorBoth_storefile, 82807223120)
# Harvard site: de-duplicate the raw-data and score tables, keep only PINs that
# appear in BOTH tables (via findpairs), write the corrected CSVs, and upload
# both to the Harvard Box folder; then list the MGH Box folder, select the
# raw-data and score files by filename, download them, and concatenate the raw
# data with catcontents.
# NOTE(review): this chunk is mangled -- the `else:` below belongs to an `if`
# that precedes this view (the drop_duplicates calls are its success branch).
# NOTE(review): `subset={...}` passes a set (unordered) where a list is
# conventional -- pandas accepts it, but confirm column order is irrelevant.
# NOTE(review): these to_csv calls keep the default index column, unlike the
# index=False writers elsewhere in this file -- confirm that is intended.
hdatainit=hdatainit.drop_duplicates(subset={'PIN','Inst','ItemID','Position'},keep='first') hscoreinit=hscoreinit.drop_duplicates(subset={'PIN','Inst'}) else: print('Found Non-Identical Duplications') print(dlist) print(slist) l=findpairs(hdatainit,hscoreinit) #this is the list of ids in both scored and raw data #keep the ones that have no nan pins hdatainit=hdatainit[hdatainit.PIN.isin(l)] hscoreinit=hscoreinit[hscoreinit.PIN.isin(l)] #upload the concatenated files to site directory in box and move other files to incorporated hdatainit.to_csv(box_temp+'/harvard_corrected_data'+snapshotdate+'.csv') hscoreinit.to_csv(box_temp+'/harvard_corrected_scores'+snapshotdate+'.csv') box.upload_file(box_temp+'/harvard_corrected_data'+snapshotdate+'.csv',Harvard) box.upload_file(box_temp+'/harvard_corrected_scores'+snapshotdate+'.csv',Harvard) #all files associated with this snapshotdate moved to incorporated_snapshotdate folder under this #corrected folder ######################################### MGH=84799213727 mghfiles, mghfolders=foldercontents(MGH) data4process=mghfiles.loc[(mghfiles.filename.str.contains('Data')==True) | (mghfiles.filename.str.contains('Raw')==True)] scores4process=mghfiles.loc[mghfiles.filename.str.contains('Score')==True] box.download_files(data4process.file_id) box.download_files(scores4process.file_id) mdatainit=catcontents(data4process,box_temp)
# Close out the QC log: print the rows that failed validation, then restore the
# real stdout and close the log file handle (stdout was redirected to `f`
# earlier, outside this view).
print('Summary of Failures')
print(data4process.loc[data4process.Fail == 1])
print(scores4process.loc[scores4process.Fail == 1])
print('##################################################')
sys.stdout = orig_stdout
f.close()
# (Removed three leftover REPL expressions -- a bare `droplist` reference and
# two discarded groupby(...).count() results -- whose values were never used.)

# Write out the WashU-D rows whose PIN passed QC (not in droplist) and upload
# both CSVs to the WashuD Box folder.
# NOTE(review): these to_csv calls keep the default index column, unlike the
# index=False writers elsewhere in this file -- confirm that is intended.
wdatainit.loc[~(wdatainit.PIN.isin(droplist))].to_csv(
    box_temp + '/wudPASSED_corrected_data' + snapshotdate + '.csv')
wscoreinit.loc[~(wscoreinit.PIN.isin(droplist))].to_csv(
    box_temp + '/wudPASSED_corrected_scores' + snapshotdate + '.csv')
box.upload_file(box_temp + '/wudPASSED_corrected_data' + snapshotdate + '.csv', WashuD)
box.upload_file(
    box_temp + '/wudPASSED_corrected_scores' + snapshotdate + '.csv', WashuD)

##################################################################################################
# WashU-A: list the Box folder, select raw-data and score files by filename
# substring, and seed the per-file QC columns (PIN = first 13 chars of the
# filename; Fail = 1 until a later check clears it).
WashuA = 84799623206
curated = 82804729845
wuafiles, wuafolders = foldercontents(WashuA)
# BUG FIX: .copy() added to both selections.  The original assigned new
# columns directly on a .loc slice, which raises pandas'
# SettingWithCopyWarning and can silently fail to write the columns; the same
# pattern elsewhere in this file already uses .copy().
data4process = wuafiles.loc[(wuafiles.filename.str.contains('aw_') == True) |
                            (wuafiles.filename.str.contains('Raw') == True)].copy()
scores4process = wuafiles.loc[wuafiles.filename.str.contains('core') == True].copy()
data4process['PIN'] = data4process.filename.str[:13]
scores4process['PIN'] = scores4process.filename.str[:13]
data4process['Fail'] = 1
scores4process['Fail'] = 1
# Pull the REDCap ID table and full study data, then for each assessment build
# a combined snapshot from the two source stores (db, db2), write it to the
# store space, and upload it to the snapshots Box folder.  Afterwards, stack
# all snapshots, keep the subject/source/assessment columns, and left-merge
# against the REDCap IDs to flag subjects missing from REDCap; a right-merge
# against the study data begins the reverse check (subjects in REDCap but not
# in Box).
# NOTE(review): pd.concat(..., axis=1) joins db[item] and db2[item]
# side-by-side on index; if the intent is to stack rows from the two sources,
# axis=0 would be expected -- confirm against the original script.
# NOTE(review): this chunk is truncated mid-statement -- the final
# .drop_duplicates( call continues past the visible text.
studyids = redcap.getredcapids() studydata = redcap.getredcapdata() # %% for item in asslist.assessment: snap[item] = pd.concat([db[item], db2[item]], axis=1) # create snapshot of combined file store snapshot in 'store' and in # 'snapshots' under all sites directory in box. snapfilename = 'Q_' + item + '_Snapshot_' + snapshotdate + '.csv' snapshotfile = os.path.join(store_space, snapfilename) QCfile = os.path.join(cache_space, 'QC_' + snapfilename) snap[item].to_csv(snapshotfile, index=False) box.upload_file(snapshotfile, q_snapshotfolderid) # %% allrowsofinterest = pd.concat(snap.values(), sort=False) allrowsofinterest = allrowsofinterest[['subject', 'source', 'assessment']] combined = allrowsofinterest.merge(studyids, 'left', 'subject') notinredcap = combined.loc[combined.Subject_ID.isnull()].copy() notinredcap['reason'] = 'PatientID not in Redcap' # %% combined = allrowsofinterest.merge(studydata, 'right', 'subject') notinboxunique = combined.loc[combined.source.isnull() & combined.flagged.isnull()].drop_duplicates(