def updatecurated(curated_folderid):
    olddata, oldscores = box2dataframe(fileid=curated_folderid)  # 309 in each now
    print('old data has ' + str(len(olddata.PIN.unique())) + ' unique PINs')
    print('old scores has ' + str(len(oldscores.PIN.unique())) + ' unique PINs')
    print('###########################')

    # contents of folder containing curated data - then contents of corrected folder
    wufiles, wufolders = foldercontents(curated_folderid)
    corrfiles, dummy = folderlistcontents(wufolders.foldername, wufolders.folder_id)
    # .loc[wufolders.foldername=='corrected','folder_id'])
    corrfiles = corrfiles.loc[corrfiles.filename.str.contains('PASSED')]

    if not corrfiles.empty:
        cdata = corrfiles.loc[corrfiles.filename.str.contains('data')]
        cscores = corrfiles.loc[corrfiles.filename.str.contains('scores')]

        # download corrected data
        box.download_files(cdata.file_id)
        box.download_files(cscores.file_id)

        # create catable dataset for corrected data
        hdatainitcorr = catcontents(cdata, box_temp)
        hscoreinitcorr = catcontents(cscores, box_temp)
        print('corrected data has ' + str(len(hdatainitcorr.PIN.unique())) + ' unique PINs')
        print('corrected scores has ' + str(len(hscoreinitcorr.PIN.unique())) + ' unique PINs')
        print('###########################')

        # get list of ids in this corrected data (60 for Harvard)
        corrl = findpairs(hdatainitcorr, hscoreinitcorr)  # the list of ids in both scored and raw corrected data

        print('Adding any new data from corrected and dropping any data from the removelist')
        # remove the data with PINs from corrected as well as any from the droplist
        olddatasub = olddata[~(olddata.PIN.isin(corrl + droplist))]
        oldscoressub = oldscores[~(oldscores.PIN.isin(corrl + droplist))]
        print(str(len(olddatasub.PIN.unique())) + ' unique PINs remain in old data after drops')
        print(str(len(oldscoressub.PIN.unique())) + ' unique PINs remain in old scores after drops')

        # now cat the two datasets together (have 319 now from WU)
        hdatainit = pd.concat([hdatainitcorr, olddatasub], axis=0, sort=True)  # 60 more unique PINs than before...good
        hscoreinit = pd.concat([hscoreinitcorr, oldscoressub], axis=0, sort=True)  # 60 more than before...good
        print('new data will have ' + str(len(hdatainit.PIN.unique())) + ' unique PINs')
        print('new scores will have ' + str(len(hscoreinit.PIN.unique())) + ' unique PINs')
        print('###########################')
        return hdatainit, hscoreinit
    else:
        print('No corrected data passed QC (nothing to add at this time)')
        return corrfiles, corrfiles  # empty
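# Hedged sketch (an assumption, not the project's actual helper): findpairs() is
# used above and below as "the list of ids in both scored and raw data", so a
# minimal version consistent with that usage could look like the following.
# The real implementation elsewhere in this codebase may differ.
def findpairs_sketch(data_df, scores_df):
    """Return the PINs that have at least one row in both data_df and scores_df."""
    return sorted(set(data_df.PIN.unique()) & set(scores_df.PIN.unique()))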
def box2dataframe(fileid):
    harvardfiles, harvardfolders = foldercontents(fileid)
    data4process = harvardfiles.loc[~(harvardfiles.filename.str.upper().str.contains('SCORE') == True)]
    scores4process = harvardfiles.loc[harvardfiles.filename.str.upper().str.contains('SCORE') == True]
    data4process = data4process.reset_index()
    scores4process = scores4process.reset_index()
    box.download_files(data4process.file_id)
    box.download_files(scores4process.file_id)
    harvcleandata = pd.read_csv(box_temp + '/' + data4process.filename[0], header=0, low_memory=False)
    harvcleanscores = pd.read_csv(box_temp + '/' + scores4process.filename[0], header=0, low_memory=False)
    return harvcleandata, harvcleanscores
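# Hedged usage sketch (not part of the original module): shows how updatecurated()
# might be driven for a single site. Assumes the Box client (`box`), the download
# cache (`box_temp`), `snapshotdate`, and the PIN `droplist` are configured at
# module level as the functions above expect; the folder id is supplied by the caller.
def refresh_one_site(curated_folder_id):
    """Refresh one site's curated dataset and write the result to the cache for review."""
    hdatainit, hscoreinit = updatecurated(curated_folder_id)
    if hdatainit.empty:
        return None  # nothing in the corrected folder passed QC this round
    hdatainit.to_csv(box_temp + '/refreshed_curated_data_' + snapshotdate + '.csv', index=False)
    hscoreinit.to_csv(box_temp + '/refreshed_curated_scores_' + snapshotdate + '.csv', index=False)
    return hdatainit, hscoreinit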
def curatedandcorrected(curatedfolderid, needsattnfolder):
    harvardfiles, harvardfolders = foldercontents(curatedfolderid)
    # don't grab files that need attention
    harvardfolders = harvardfolders.loc[~(harvardfolders.foldername.str.contains('needs_attention'))]
    harvardfiles2, harvardfolders2 = folderlistcontents(harvardfolders.foldername, harvardfolders.folder_id)
    harvardfiles = pd.concat([harvardfiles, harvardfiles2], axis=0, sort=True)

    data4process = harvardfiles.loc[~(harvardfiles.filename.str.upper().str.contains('SCORE') == True)]
    scores4process = harvardfiles.loc[harvardfiles.filename.str.upper().str.contains('SCORE') == True]
    box.download_files(data4process.file_id)
    box.download_files(scores4process.file_id)

    # trick the catcontents macro into creating a catable dataset, but don't actually cat
    # until the PINs in the corrected file have been removed from the curated file
    # step 1 - separate data4process/scores4process into corrected and old curated data
    cdata = data4process.loc[data4process.filename.str.contains('corrected')]
    cscores = scores4process.loc[scores4process.filename.str.contains('corrected')]
    olddata = data4process.loc[~(data4process.filename.str.contains('corrected'))]
    oldscores = scores4process.loc[~(scores4process.filename.str.contains('corrected'))]

    # create catable dataset for corrected data
    hdatainitcorr = catcontents(cdata, box_temp)
    hscoreinitcorr = catcontents(cscores, box_temp)
    # get list of ids in this corrected data (60 for Harvard)
    corrl = findpairs(hdatainitcorr, hscoreinitcorr)  # the list of ids in both scored and raw corrected data

    # create catable dataset for old curated data
    hdatainitold = catcontents(olddata, box_temp)
    hscoreinitold = catcontents(oldscores, box_temp)
    # remove the data with PINs from corrected
    hdatainitoldsub = hdatainitold[~(hdatainitold.PIN.isin(corrl))]
    hscoreinitoldsub = hscoreinitold[~(hscoreinitold.PIN.isin(corrl))]

    # now cat the two datasets together
    hdatainit = pd.concat([hdatainitcorr, hdatainitoldsub], axis=0, sort=True)  # 60 more unique PINs than before...good
    hscoreinit = pd.concat([hscoreinitcorr, hscoreinitoldsub], axis=0, sort=True)  # 60 more than before...good

    l = findpairs(hdatainit, hscoreinit)  # the list of ids in both scored and raw data
    # set aside those who aren't in both, and those that are in dlist or slist
    notbothdatalist = hdatainit[~(hdatainit.PIN.isin(l))]
    notbothscorelist = hscoreinit[~(hscoreinit.PIN.isin(l))]
    nbs = list(notbothscorelist.PIN.unique())
    nbd = list(notbothdatalist.PIN.unique())

    hdatainit2 = hdatainit[hdatainit.PIN.isin(l)]
    hscoreinit2 = hscoreinit[hscoreinit.PIN.isin(l)]
    # check that this is same as above -- it is
    # hdatainit2qc = hdatainit[~(hdatainit.PIN.isin(nbs + nbd))]
    # hscoreinit2qc = hscoreinit[~(hscoreinit.PIN.isin(nbs + nbd))]

    # find instrument duplications that are not identical
    dlist, slist = findwierdos(hdatainit2, hscoreinit2)
    dslist = pd.concat([dlist, slist], axis=0)
    wierdlist = list(dslist.PIN.unique())
    # set aside those who are in the wierdlist
    nonidenticaldupdata = hdatainit2.loc[hdatainit2.PIN.isin(wierdlist)]
    nonidenticaldupscore = hscoreinit2.loc[hscoreinit2.PIN.isin(wierdlist)]
    wierdd = list(dlist.PIN.unique())
    wierds = list(slist.PIN.unique())

    # so we have the not-in-both lists and the wierdlists; the not-in-both lists are already set aside
    # excluding any wierdlist PINs from both should get rid of everything that isn't one-to-one
    hdatainit3 = hdatainit2.loc[~(hdatainit2.PIN.isin(wierdlist))]
    hscoreinit3 = hscoreinit2.loc[~(hscoreinit2.PIN.isin(wierdlist))]
    # both have 580 unique ids - make them into a list
    l3 = findpairs(hdatainit3, hscoreinit3)  # the list of ids in both scored and raw data

    # now delete any identical duplicates; check for issues finding wierdos
    dlist, slist = findwierdos(hdatainit3, hscoreinit3)
    if dlist.empty and slist.empty:
        hdatainit3 = hdatainit3.drop_duplicates(subset={'PIN', 'Inst', 'ItemID', 'Position'}, keep='first')
        hscoreinit3 = hscoreinit3.drop_duplicates(subset={'PIN', 'Inst'})
    else:
        print('Found Non-Identical Duplications')
        print(dlist)
        print(slist)

    # export scores and data for all PINs in dslist or nbs or nbd with flags
    notbothdatalist.to_csv(box_temp + '/Toolbox_notinboth_Data_' + snapshotdate + '.csv')
    notbothscorelist.to_csv(box_temp + '/Toolbox_notinboth_Scores_' + snapshotdate + '.csv')
    box.upload_file(box_temp + '/Toolbox_notinboth_Data_' + snapshotdate + '.csv', needsattnfolder)
    box.upload_file(box_temp + '/Toolbox_notinboth_Scores_' + snapshotdate + '.csv', needsattnfolder)

    nonidenticaldupdata.to_csv(box_temp + '/Toolbox_NonidentDups_Data_' + snapshotdate + '.csv')
    nonidenticaldupscore.to_csv(box_temp + '/Toolbox_NonidentDups_Scores_' + snapshotdate + '.csv')
    box.upload_file(box_temp + '/Toolbox_NonidentDups_Data_' + snapshotdate + '.csv', needsattnfolder)
    box.upload_file(box_temp + '/Toolbox_NonidentDups_Scores_' + snapshotdate + '.csv', needsattnfolder)

    # last but not least...set aside ids not in REDCap, and IDs that need visit numbers
    # get reds from hdatainit3 (should be same as list from hscoreinit3)
    # generate hdatainit4 and hscoreinit4, which are relieved of these ids
    hdatainit4 = subjectsvisits(hdatainit3)
    hscoreinit4 = subjectsvisits(hscoreinit3)
    mv = hscoreinit4.loc[~(hscoreinit4.visit.isin(['V1', 'V2', 'V3', 'X1', 'X2', 'X3']))].copy()
    mvs = list(mv.subject.unique())  # list of PINs without visit numbers
    check = subjectpairs(hdatainit4, hscoreinit4)  # fewer, because V1 and V2 PINs for the same subject are only counted once
    redids = box.getredcapids()
    dfcheck = pd.DataFrame(check, columns=['subject'])
    boxids = pd.merge(dfcheck, redids, how='left', on='subject', indicator=True)
    reds = list(boxids.loc[boxids._merge == 'left_only'].subject)  # subjects not in REDCap
    boxandredcap = boxids.loc[boxids._merge == 'both'].subject

    # export the otherwise cleanest data, ready for snapshotting as the new updated curated file -- then run this for all sites
    # keeps only ids with visit numbers, one-to-one scores/data correspondence, and no wierd duplications
    # but check one last time that hdatainit5 and hscoreinit5 are super clean
    hdatainit5 = hdatainit4.loc[~(hdatainit4.subject.isin(mvs + reds))]
    hscoreinit5 = hscoreinit4.loc[~(hscoreinit4.subject.isin(mvs + reds))]

    # export the lists of ids and the reasons they were excluded
    df = pd.DataFrame(columns=['reason', 'affectedIDs'])
    df = df.append({'reason': 'PIN In Scores but not Data', 'affectedIDs': nbs}, ignore_index=True)
    df = df.append({'reason': 'PIN In Data but not Scores', 'affectedIDs': nbd}, ignore_index=True)
    df = df.append({'reason': 'PIN/Instrument Non-identical Duplication in Data', 'affectedIDs': wierdd}, ignore_index=True)
    df = df.append({'reason': 'PIN/Instrument Non-identical Duplication in Scores', 'affectedIDs': wierds}, ignore_index=True)
    df = df.append({'reason': 'PIN/subject in Scores and Data but missing visit', 'affectedIDs': mvs}, ignore_index=True)
    df = df.append({'reason': 'subject in Scores and Data but not REDCap', 'affectedIDs': reds}, ignore_index=True)
    df.to_csv(box_temp + '/List_of_IDs_and_Reasons_they_in_these_files_' + snapshotdate + '.csv')
    box.upload_file(box_temp + '/List_of_IDs_and_Reasons_they_in_these_files_' + snapshotdate + '.csv', needsattnfolder)

    return hdatainit5, hscoreinit5
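# Hedged sketch (an assumption, not the project's actual helper): findwierdos() is
# treated above as returning two DataFrames of PIN/Inst duplications that are NOT
# byte-for-byte identical -- one for the raw data and one for the scores. A minimal
# version consistent with that usage might look like this; the real helper may differ.
def findwierdos_sketch(data_df, scores_df):
    """Flag PIN/Inst (and item-level) duplications whose rows differ."""
    def nonidentical_dups(df, keys):
        dups = df[df.duplicated(subset=keys, keep=False)]   # every repeated key combination
        identical = df[df.duplicated(keep=False)]            # rows repeated in full (identical dups)
        # keep only key combinations that repeat without being fully identical
        return dups.loc[~dups.index.isin(identical.index), keys].drop_duplicates()

    dlist = nonidentical_dups(data_df, ['PIN', 'Inst', 'ItemID', 'Position'])
    slist = nonidentical_dups(scores_df, ['PIN', 'Inst'])
    return dlist, slist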
processed = pd.concat([processed1, processed2, processed3], axis=0)
processed.sort_values("file_id", inplace=True)
processed.drop_duplicates(subset='file_id', keep='first', inplace=True)

files['file_id'] = files.file_id.astype('int')
files4process = pd.merge(files, processed, on='file_id', how='left')
files4process = files4process.loc[files4process.raw_cat_date.isnull()].copy()
files4process.drop(columns=['raw_cat_date'], inplace=True)
files4process['file_id'] = files4process.file_id.astype('str')
print('Found ' + str(len(files4process.file_id)) + ' new files in Box/Endpoints folder on ' + snapshotdate)

# download everything not already processed to cache
box.download_files(files4process.file_id)
# files4process = files.head(50)
# box.download_files(files4process.file_id)

scoresfiles = files4process.loc[
    files4process.filename.str.contains('Scores')
    & ~(files4process.filename.str.lower().str.contains('corrected'))]
rawdatafiles = files4process.loc[
    files4process.filename.str.contains('Assessment')
    & ~(files4process.filename.str.contains('Scores'))]
corrected = files4process.loc[files4process.filename.str.lower().str.contains('corrected')]

scoresinit = catcontents(scoresfiles)
rawdatainit = catcontents(rawdatafiles)
correctedinit = catcontents(corrected)
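# Hedged sketch (an assumption, not the project's actual helper): catcontents() is
# used throughout to turn a DataFrame of downloaded Box files into one stacked
# DataFrame. The calls earlier pass box_temp explicitly while the fragment above
# omits it, so this sketch takes the cache directory as an optional argument; the
# real implementation may differ.
import os

def catcontents_sketch(files_df, cache_space=None):
    """Read each downloaded CSV from the cache and concatenate, tagging the source filename."""
    cache_space = cache_space or box_temp
    frames = []
    for fname in files_df.filename:
        path = os.path.join(cache_space, fname)
        if os.path.exists(path):
            frame = pd.read_csv(path, header=0, low_memory=False)
            frame['filename'] = fname  # keep provenance for later QC
            frames.append(frame)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, sort=True)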