Example no. 1
0
# Split by study and site. Remember that HCD parents who are also HCA are
# listed under BOTH for study and their HCA ID; the filenames below
# reflect this.

# Everything that is NOT study 'HCD' (note the ~): HCA-only plus Both.
HCAorBoth = curatedwithcatreordered.loc[~(curatedwithcatreordered.study
                                          == 'HCD')]
# HCD-only data.
HCDonly = curatedwithcatreordered.loc[curatedwithcatreordered.study == 'HCD']


def _store_and_upload(df, label, box_folder_id):
    """Write *df* to store_space as
    '<label>_Toolbox_Scored_Combined_<snapshotdate>.csv', upload it to the
    given Box folder, and return the local file path.

    Replaces three verbatim-duplicated save+upload sequences.
    """
    storefile = os.path.join(
        store_space,
        label + '_Toolbox_Scored_Combined_' + snapshotdate + '.csv')
    df.to_csv(storefile, index=False)
    box.upload_file(storefile, box_folder_id)
    return storefile


# Site codes are strings here: WashU is '4', UCLA is '2'.
WashU_HCAorBoth = HCAorBoth.loc[HCAorBoth.site == '4'].copy()
WashU_HCAorBoth_storefile = _store_and_upload(
    WashU_HCAorBoth, 'WashU_HCAorBoth', 82804729845)

WashU_HCDonly = HCDonly.loc[HCDonly.site == '4'].copy()
WashU_HCDonly_storefile = _store_and_upload(
    WashU_HCDonly, 'WashU_HCDonly', 82804015457)

UCLA_HCAorBoth = HCAorBoth.loc[HCAorBoth.site == '2'].copy()
UCLA_HCAorBoth_storefile = _store_and_upload(
    UCLA_HCAorBoth, 'UCLA_HCAorBoth', 82807223120)
Example no. 2
0
    # De-duplicate exact repeats: keep the first occurrence of each raw-data
    # row keyed by (PIN, Inst, ItemID, Position) and each score row keyed by
    # (PIN, Inst).
    # NOTE(review): subset is a set literal, so its iteration order is
    # arbitrary — harmless, since drop_duplicates treats subset as an
    # unordered collection of column names.
    hdatainit=hdatainit.drop_duplicates(subset={'PIN','Inst','ItemID','Position'},keep='first')
    hscoreinit=hscoreinit.drop_duplicates(subset={'PIN','Inst'})
else:
    # The duplicated rows disagree on content; report them for manual review
    # instead of silently picking one.
    print('Found Non-Identical Duplications')
    print(dlist)
    print(slist)

# Keep only subjects present in BOTH the scored and the raw data
# (findpairs returns the list of PINs common to the two frames, with no
# NaN PINs).
l = findpairs(hdatainit, hscoreinit)
hdatainit = hdatainit[hdatainit.PIN.isin(l)]
hscoreinit = hscoreinit[hscoreinit.PIN.isin(l)]

# Upload the concatenated files to the Harvard site directory in Box.
# FIX: pass index=False, consistent with every other CSV export in this
# pipeline — otherwise these files gain a spurious unnamed index column.
data_path = box_temp + '/harvard_corrected_data' + snapshotdate + '.csv'
score_path = box_temp + '/harvard_corrected_scores' + snapshotdate + '.csv'
hdatainit.to_csv(data_path, index=False)
hscoreinit.to_csv(score_path, index=False)
box.upload_file(data_path, Harvard)
box.upload_file(score_path, Harvard)

# All files associated with this snapshotdate are moved to an
# incorporated_snapshotdate folder under the corrected folder.


#########################################
# MGH site: locate raw-data ('Data'/'Raw') and score ('Score') files in the
# Box folder, download them, concatenate the raw data, and report failures.
MGH = 84799213727
mghfiles, mghfolders = foldercontents(MGH)
# na=False treats missing filenames as non-matches — behaviorally identical
# to the original `.str.contains(...) == True` idiom, but idiomatic pandas.
data4process = mghfiles.loc[mghfiles.filename.str.contains('Data', na=False)
                            | mghfiles.filename.str.contains('Raw', na=False)]
scores4process = mghfiles.loc[mghfiles.filename.str.contains('Score', na=False)]
box.download_files(data4process.file_id)
box.download_files(scores4process.file_id)

mdatainit = catcontents(data4process, box_temp)
# NOTE(review): the Fail column appears to be populated by catcontents for
# data4process; where scores4process.Fail comes from is not visible in this
# excerpt — confirm upstream.
print('Summary of Failures')
print(data4process.loc[data4process.Fail == 1])
print(scores4process.loc[scores4process.Fail == 1])
print('##################################################')
sys.stdout = orig_stdout  # stop redirecting output to the log file
f.close()
# FIX: these were bare expressions (notebook residue) whose results were
# silently discarded when run as a script; print them so the drop list and
# per-Fail counts are actually visible.
print(droplist)
print(data4process.groupby('Fail').count())
print(scores4process.groupby('Fail').count())

# Persist only the rows whose PIN passed QC (i.e. is not in droplist), then
# push both CSVs to the WashU-D corrected folder on Box.
passed_data_file = box_temp + '/wudPASSED_corrected_data' + snapshotdate + '.csv'
passed_score_file = box_temp + '/wudPASSED_corrected_scores' + snapshotdate + '.csv'

keep_data = ~wdatainit.PIN.isin(droplist)
keep_scores = ~wscoreinit.PIN.isin(droplist)
wdatainit.loc[keep_data].to_csv(passed_data_file)
wscoreinit.loc[keep_scores].to_csv(passed_score_file)

box.upload_file(passed_data_file, WashuD)
box.upload_file(passed_score_file, WashuD)

##################################################################################################
# WashU-Adult site: locate raw-data ('aw_'/'Raw') and score ('core') files.
WashuA = 84799623206
curated = 82804729845
wuafiles, wuafolders = foldercontents(WashuA)
# FIX: take explicit .copy() of each slice. The original assigned new
# columns directly onto .loc[...] views of wuafiles, which raises pandas'
# SettingWithCopyWarning and can silently fail to write — and is
# inconsistent with the .copy() usage elsewhere in this file.
# (`== True` is kept: it maps NaN contains-results to False.)
data4process = wuafiles.loc[(wuafiles.filename.str.contains('aw_') == True) |
                            (wuafiles.filename.str.contains('Raw') == True)].copy()
scores4process = wuafiles.loc[wuafiles.filename.str.contains('core') == True].copy()
# The PIN is encoded in the first 13 characters of the filename.
data4process['PIN'] = data4process.filename.str[:13]
scores4process['PIN'] = scores4process.filename.str[:13]
# Pessimistic default: mark everything Failed, presumably cleared downstream
# as files pass checks — confirm against the rest of the pipeline.
data4process['Fail'] = 1
scores4process['Fail'] = 1
Example no. 4
0
# Pull the REDCap ID map and the full study data once, up front.
studyids = redcap.getredcapids()
studydata = redcap.getredcapdata()

# %%

for item in asslist.assessment:
    # Combine the two database pulls side by side (axis=1) into a single
    # snapshot frame for this assessment.
    snap[item] = pd.concat([db[item], db2[item]], axis=1)

    # Snapshot of the combined file goes to 'store' locally and to the
    # 'snapshots' folder under the all-sites directory in Box.
    snapfilename = f'Q_{item}_Snapshot_{snapshotdate}.csv'
    # QC copy path alongside the snapshot (not written here — presumably
    # used later in the pipeline; confirm).
    QCfile = os.path.join(cache_space, f'QC_{snapfilename}')
    snapshotfile = os.path.join(store_space, snapfilename)

    snap[item].to_csv(snapshotfile, index=False)
    box.upload_file(snapshotfile, q_snapshotfolderid)

# %%

# Stack every assessment snapshot and keep only the identifying columns.
allrowsofinterest = pd.concat(snap.values(),
                              sort=False)[['subject', 'source', 'assessment']]

# Left-join against the REDCap ID map; rows whose Subject_ID is null have
# no REDCap record at all.
combined = allrowsofinterest.merge(studyids, how='left', on='subject')
notinredcap = combined.loc[combined.Subject_ID.isnull()].copy()
notinredcap['reason'] = 'PatientID not in Redcap'

# %%

# Right-join against the full study data so every REDCap subject appears,
# whether or not it was seen in Box.
combined = allrowsofinterest.merge(studydata, how='right', on='subject')
# NOTE(review): this statement is truncated — the drop_duplicates(...) call
# continues past the end of this excerpt.
# Subjects present in the REDCap study data (right join above) but absent
# from Box (source is null) and not flagged — presumably "expected but not
# received"; confirm against the full file.
notinboxunique = combined.loc[combined.source.isnull()
                              & combined.flagged.isnull()].drop_duplicates(