Example #1
def updatecurated(curated_folderid):
    olddata, oldscores = box2dataframe(
        fileid=curated_folderid)  # 309 in each now
    print('old data has ' + str(len(olddata.PIN.unique())) + ' unique PINs')
    print('old scores has ' + str(len(oldscores.PIN.unique())) +
          ' unique PINs')
    print('###########################')
    # contents of folder containing curated data - then contents of corrected folder
    wufiles, wufolders = foldercontents(curated_folderid)
    corrfiles, dummy = folderlistcontents(
        wufolders.foldername, wufolders.folder_id)
    corrfiles = corrfiles.loc[corrfiles.filename.str.contains('PASSED')]
    if not corrfiles.empty:
        cdata = corrfiles.loc[corrfiles.filename.str.contains('data')]
        cscores = corrfiles.loc[corrfiles.filename.str.contains('scores')]

        # download corrected data
        box.download_files(cdata.file_id)
        box.download_files(cscores.file_id)
        # create catable dataset for corrected data
        hdatainitcorr = catcontents(cdata, box_temp)
        hscoreinitcorr = catcontents(cscores, box_temp)
        print('corrected data has ' + str(len(hdatainitcorr.PIN.unique())) +
              ' unique PINs')
        print('corrected scores has ' + str(len(hscoreinitcorr.PIN.unique())) +
              ' unique PINs')
        print('###########################')

        # get list of ids in this corrected data  #60 for Harvard
        corrl = findpairs(
            hdatainitcorr, hscoreinitcorr
        )  # this is the list of ids in both scored and raw corrected data

        print(
            'Adding any new data from corrected and dropping any data from the removelist'
        )
        # remove rows whose PINs are in the corrected data, as well as any in droplist (defined outside this function)
        olddatasub = olddata[~(olddata.PIN.isin(corrl + droplist))]
        oldscoressub = oldscores[~(oldscores.PIN.isin(corrl + droplist))]
        # sanity check: unique PIN counts after the drop
        print('data now has ' + str(len(olddatasub.PIN.unique())) + ' unique PINs')
        print('scores now have ' + str(len(oldscoressub.PIN.unique())) + ' unique PINs')

        # now cat the two datasets together  #have 319 now from WU
        hdatainit = pd.concat(
            [hdatainitcorr, olddatasub], axis=0,
            sort=True)  # these have 60 more unique pins than before...good
        hscoreinit = pd.concat(
            [hscoreinitcorr, oldscoressub], axis=0,
            sort=True)  # these have 60 more than before...good
        print('new data will have ' + str(len(hdatainit.PIN.unique())) +
              ' unique PINs')
        print('new scores will have ' + str(len(hscoreinit.PIN.unique())) +
              ' unique PINs')
        print('###########################')
        return hdatainit, hscoreinit
    else:
        print('No corrected data passed QC (nothing to add at this time)')
        return corrfiles, corrfiles  # both empty
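
# findpairs is used above but not defined in this example. A minimal sketch,
# assuming it simply returns the PINs present in both the raw data and the
# scores (the real implementation may differ):
def findpairs(data, scores):
    # intersection of the unique PINs in the two frames
    return list(set(data.PIN.unique()) & set(scores.PIN.unique()))
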
def box2dataframe(fileid):
    harvardfiles, harvardfolders = foldercontents(fileid)
    # split the folder listing into raw data files and score files
    data4process = harvardfiles.loc[~harvardfiles.filename.str.upper().str.contains('SCORE', na=False)]
    scores4process = harvardfiles.loc[harvardfiles.filename.str.upper().str.contains('SCORE', na=False)]
    data4process = data4process.reset_index()
    scores4process = scores4process.reset_index()
    box.download_files(data4process.file_id)
    box.download_files(scores4process.file_id)
    # read the first downloaded file of each type from the local cache
    harvcleandata = pd.read_csv(box_temp + '/' + data4process.filename[0], header=0, low_memory=False)
    harvcleanscores = pd.read_csv(box_temp + '/' + scores4process.filename[0], header=0, low_memory=False)
    return harvcleandata, harvcleanscores
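
# catcontents is also not shown here. A minimal sketch, assuming it reads every
# downloaded file in a listing from the local Box cache and concatenates them;
# the cache_dir default is an assumption, to match the one-argument calls in
# Example #4:
def catcontents(files, cache_dir=box_temp):
    frames = []
    for fname in files.filename:
        frames.append(pd.read_csv(cache_dir + '/' + fname, header=0, low_memory=False))
    if not frames:
        return pd.DataFrame()  # nothing downloaded for this listing
    return pd.concat(frames, axis=0, sort=True)
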
def curatedandcorrected(curatedfolderid, needsattnfolder):
    harvardfiles, harvardfolders = foldercontents(curatedfolderid)
    # don't grab files that need attention
    harvardfolders = harvardfolders.loc[~(harvardfolders.foldername.str.contains('needs_attention'))]
    harvardfiles2, harvardfolders2 = folderlistcontents(harvardfolders.foldername, harvardfolders.folder_id)
    harvardfiles = pd.concat([harvardfiles, harvardfiles2], axis=0, sort=True)

    data4process = harvardfiles.loc[~harvardfiles.filename.str.upper().str.contains('SCORE', na=False)]
    scores4process = harvardfiles.loc[harvardfiles.filename.str.upper().str.contains('SCORE', na=False)]
    box.download_files(data4process.file_id)
    box.download_files(scores4process.file_id)

    # trick the catcontents macro into creating a catable dataset, but don't actually
    # cat until the PINs in the corrected files are removed from the curated files
    # step 1 - separate data4process/scores4process into corrected and old curated data
    cdata = data4process.loc[data4process.filename.str.contains('corrected')]
    cscores = scores4process.loc[scores4process.filename.str.contains('corrected')]
    olddata = data4process.loc[~(data4process.filename.str.contains('corrected'))]
    oldscores = scores4process.loc[~(scores4process.filename.str.contains('corrected'))]
    # create catable dataset for corrected data
    hdatainitcorr = catcontents(cdata, box_temp)
    hscoreinitcorr = catcontents(cscores, box_temp)
    # get list of ids in this corrected data (60 for Harvard)
    corrl = findpairs(hdatainitcorr, hscoreinitcorr)  # the list of ids in both scored and raw corrected data

    # create catable dataset for old curated data
    hdatainitold = catcontents(olddata, box_temp)
    hscoreinitold = catcontents(oldscores, box_temp)
    # remove the data with PINs from corrected
    hdatainitoldsub = hdatainitold[~(hdatainitold.PIN.isin(corrl))]
    hscoreinitoldsub = hscoreinitold[~(hscoreinitold.PIN.isin(corrl))]

    # now cat the two datasets together
    hdatainit = pd.concat([hdatainitcorr, hdatainitoldsub], axis=0, sort=True)  # these have 60 more unique pins than before...good
    hscoreinit = pd.concat([hscoreinitcorr, hscoreinitoldsub], axis=0, sort=True)  # these have 60 more than before...good

    l = findpairs(hdatainit, hscoreinit)  # the list of ids in both scored and raw data
    # set aside those that aren't in both, and those that are in dlist or slist
    notbothdatalist = hdatainit[~(hdatainit.PIN.isin(l))]
    notbothscorelist = hscoreinit[~(hscoreinit.PIN.isin(l))]
    nbs = list(notbothscorelist.PIN.unique())
    nbd = list(notbothdatalist.PIN.unique())

    hdatainit2 = hdatainit[hdatainit.PIN.isin(l)]
    hscoreinit2 = hscoreinit[hscoreinit.PIN.isin(l)]
    # check that this is the same as above -- it is
    # hdatainit2qc=hdatainit[~(hdatainit.PIN.isin(nbs+nbd))]
    # hscoreinit2qc=hscoreinit[~(hscoreinit.PIN.isin(nbs+nbd))]

    # find instrument duplications that are not identical
    dlist, slist = findwierdos(hdatainit2, hscoreinit2)
    dslist = pd.concat([dlist, slist], axis=0)
    wierdlist = list(dslist.PIN.unique())
    # set aside those that are in the wierdlist
    nonidenticaldupdata = hdatainit2.loc[hdatainit2.PIN.isin(wierdlist)]
    nonidenticaldupscore = hscoreinit2.loc[hscoreinit2.PIN.isin(wierdlist)]
    wierdd = list(dlist.PIN.unique())
    wierds = list(slist.PIN.unique())
    # so we have the notinboth lists and the wierdlist
    # already set aside the notinboth lists
    # excluding any wierdlist PINs from both should get rid of everything that isn't one-to-one
    hdatainit3 = hdatainit2.loc[~(hdatainit2.PIN.isin(wierdlist))]
    hscoreinit3 = hscoreinit2.loc[~(hscoreinit2.PIN.isin(wierdlist))]
    # both have 580 unique ids - make them into a list
    l3 = findpairs(hdatainit3, hscoreinit3)  # the list of ids in both scored and raw data

    # re-run findwierdos: if it finds nothing, any remaining duplicates are identical and can be dropped
    dlist, slist = findwierdos(hdatainit3, hscoreinit3)
    if dlist.empty and slist.empty:
        hdatainit3 = hdatainit3.drop_duplicates(subset=['PIN', 'Inst', 'ItemID', 'Position'], keep='first')
        hscoreinit3 = hscoreinit3.drop_duplicates(subset=['PIN', 'Inst'], keep='first')
    else:
        print('Found Non-Identical Duplications')
        print(dlist)
        print(slist)

    # export scores and data for all pins in dslist, nbs, or nbd, with flags
    notbothdatalist.to_csv(box_temp + '/Toolbox_notinboth_Data_' + snapshotdate + '.csv')
    notbothscorelist.to_csv(box_temp + '/Toolbox_notinboth_Scores_' + snapshotdate + '.csv')
    box.upload_file(box_temp + '/Toolbox_notinboth_Data_' + snapshotdate + '.csv', needsattnfolder)
    box.upload_file(box_temp + '/Toolbox_notinboth_Scores_' + snapshotdate + '.csv', needsattnfolder)

    nonidenticaldupdata.to_csv(box_temp + '/Toolbox_NonidentDups_Data_' + snapshotdate + '.csv')
    nonidenticaldupscore.to_csv(box_temp + '/Toolbox_NonidentDups_Scores_' + snapshotdate + '.csv')
    box.upload_file(box_temp + '/Toolbox_NonidentDups_Data_' + snapshotdate + '.csv', needsattnfolder)
    box.upload_file(box_temp + '/Toolbox_NonidentDups_Scores_' + snapshotdate + '.csv', needsattnfolder)

    # last but not least...set aside ids not in REDCap, and ids that need visit numbers
    # get reds from hdatainit3 (should be the same as the list from hscoreinit3)
    # generate hdatainit4 and hscoreinit4, which are relieved of these ids
    hdatainit4 = subjectsvisits(hdatainit3)
    hscoreinit4 = subjectsvisits(hscoreinit3)
    mv = hscoreinit4.loc[~(hscoreinit4.visit.isin(['V1', 'V2', 'V3', 'X1', 'X2', 'X3']))].copy()
    mvs = list(mv.subject.unique())  # list of subjects without visit numbers

    check = subjectpairs(hdatainit4, hscoreinit4)  # fewer than the PIN count, because V1 and V2 PINs for the same subject count once
    redids = box.getredcapids()
    dfcheck = pd.DataFrame(check, columns=['subject'])
    boxids = pd.merge(dfcheck, redids, how='left', on='subject', indicator=True)
    reds = list(boxids.loc[boxids._merge == 'left_only'].subject)  # subjects not in redcap
    boxandredcap = boxids.loc[boxids._merge == 'both'].subject

    # export the otherwise cleanest data, ready for snapshotting as the new updated curated file -- then run this for all sites
    # hdatainit5/hscoreinit5 keep only ids that have visit numbers, one-to-one scores/data
    # correspondence, and no weird duplications -- but check one last time that they are super clean
    hdatainit5 = hdatainit4.loc[~(hdatainit4.subject.isin(mvs + reds))]
    hscoreinit5 = hscoreinit4.loc[~(hscoreinit4.subject.isin(mvs + reds))]


    # export the lists of ids and the reasons they were excluded
    # (built in one shot; DataFrame.append was removed in pandas 2.0)
    df = pd.DataFrame([
        {'reason': 'PIN in Scores but not Data', 'affectedIDs': nbs},
        {'reason': 'PIN in Data but not Scores', 'affectedIDs': nbd},
        {'reason': 'PIN/Instrument non-identical duplication in Data', 'affectedIDs': wierdd},
        {'reason': 'PIN/Instrument non-identical duplication in Scores', 'affectedIDs': wierds},
        {'reason': 'PIN/subject in Scores and Data but missing visit', 'affectedIDs': mvs},
        {'reason': 'subject in Scores and Data but not REDCap', 'affectedIDs': reds},
    ], columns=['reason', 'affectedIDs'])


    df.to_csv(box_temp + '/List_of_IDs_and_Reasons_they_are_in_these_files_' + snapshotdate + '.csv')
    box.upload_file(box_temp + '/List_of_IDs_and_Reasons_they_are_in_these_files_' + snapshotdate + '.csv', needsattnfolder)

    return hdatainit5, hscoreinit5
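
# findwierdos, subjectsvisits, and subjectpairs are not defined in this example
# either. Minimal sketches consistent with how they are used above; the key
# columns and the PIN format (e.g. 'HCA1234_V1') are assumptions:
def findwierdos(data, scores):
    # rows that share a key with another row but are not full-row identical
    datakeys = ['PIN', 'Inst', 'ItemID', 'Position']
    scorekeys = ['PIN', 'Inst']
    dlist = data.loc[data.duplicated(subset=datakeys, keep=False) & ~data.duplicated(keep=False)]
    slist = scores.loc[scores.duplicated(subset=scorekeys, keep=False) & ~scores.duplicated(keep=False)]
    return dlist, slist

def subjectsvisits(df):
    # split PIN into subject and visit columns, e.g. 'HCA1234_V1' -> 'HCA1234', 'V1'
    df = df.copy()
    df['subject'] = df.PIN.str.split('_').str[0]
    df['visit'] = df.PIN.str.split('_').str[1].str.upper()
    return df

def subjectpairs(data, scores):
    # same as findpairs, but at the subject level rather than the PIN level
    return list(set(data.subject.unique()) & set(scores.subject.unique()))
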
Example #4
# combine the previously processed file logs, keeping one row per file_id
processed = pd.concat([processed1, processed2, processed3], axis=0)
processed.sort_values("file_id", inplace=True)
processed.drop_duplicates(subset='file_id', keep='first', inplace=True)

files['file_id'] = files.file_id.astype('int')

files4process = pd.merge(files, processed, on='file_id', how='left')
files4process = files4process.loc[files4process.raw_cat_date.isnull()].copy()

files4process.drop(columns=['raw_cat_date'], inplace=True)
files4process['file_id'] = files4process.file_id.astype('str')
print('Found ' + str(len(files4process.file_id)) +
      ' new files in Box/Endpoints folder on ' + snapshotdate)

# download everything not already processed to cache
box.download_files(files4process.file_id)

scoresfiles = files4process.loc[
    files4process.filename.str.contains('Scores')
    & ~(files4process.filename.str.lower().str.contains('corrected'))]
rawdatafiles = files4process.loc[
    files4process.filename.str.contains('Assessment')
    & ~(files4process.filename.str.contains('Scores'))]
corrected = files4process.loc[files4process.filename.str.lower().str.contains(
    'corrected')]

scoresinit = catcontents(scoresfiles)
rawdatainit = catcontents(rawdatafiles)
correctedinit = catcontents(corrected)
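
# foldercontents and folderlistcontents are the remaining undefined helpers. A
# minimal sketch assuming a boxsdk Client named client (an assumption; the real
# wrappers may differ), returning DataFrames with the column names used above:
def foldercontents(folder_id):
    files, folders = [], []
    for item in client.folder(folder_id).get_items():
        if item.type == 'file':
            files.append({'filename': item.name, 'file_id': item.id})
        else:
            folders.append({'foldername': item.name, 'folder_id': item.id})
    return pd.DataFrame(files), pd.DataFrame(folders)

def folderlistcontents(foldernames, folderids):
    # pool the contents of several folders into one files frame and one folders frame
    allfiles, allfolders = pd.DataFrame(), pd.DataFrame()
    for fid in folderids:
        f, d = foldercontents(fid)
        allfiles = pd.concat([allfiles, f], axis=0, sort=True)
        allfolders = pd.concat([allfolders, d], axis=0, sort=True)
    return allfiles, allfolders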