# Example no. 1
0
# Select the columns to export, in the order they should appear in the CSV:
# the identifying fields first, then the SV* columns, then the processing
# date. Indexing with a list raises KeyError if any column is missing.
newprocessed_reorder = newprocessed[[
    'file_id', 'filename', 'source', 'visit', 'subject', 'SV1wk20', 'SV2wk20',
    'SV1mo20', 'SV6mo20', 'SV1yr20', 'SV3yr20', 'SV1wk100', 'SV2wk100',
    'SV1mo100', 'SV6mo100', 'SV1yr100', 'SV3yr100', 'raw_processed_date'
]]

# Check for duplicates: collect every row whose (subject, visit) pair occurs
# more than once. keep=False marks all members of a duplicate group, not
# just the second and later occurrences.
# NOTE(review): `dups` is not used again in this section -- presumably it is
# inspected by hand; confirm whether duplicates should block the upload.
dups = newprocessed.loc[newprocessed_reorder.duplicated(
    subset=['subject', 'visit'], keep=False)]

# Write the reordered table to processed_file, omitting the index column.
newprocessed_reorder.to_csv(processed_file, index=False)

# This file lives in the Box folder
# "behavioral data/snapshots/ePrimeDD/raw_allfiles_in_box/". It is neither
# the curated BDAS file nor an official snapshot -- it is only a record of
# the raw, unprocessed download.
# The first run had to create the file with:
#     box.upload_file(processed_file, 82670454492)
# Subsequent runs just update the existing Box file.
box.update_file(495494179106, processed_file)

# Delete the directory tree at box.cache (presumably the local files
# downloaded from Box during this run -- confirm).
shutil.rmtree(box.cache)


def folderlistcontents(folderslabels, folderslist):
    """Collect the file and sub-folder listings of several Box folders.

    Args:
        folderslabels: Sequence of string labels, indexed in parallel with
            ``folderslist``; used only for the progress message.
        folderslist: Sequence of folder identifiers, each of which is passed
            to ``foldercontents``.

    Returns:
        A ``(files, folders)`` tuple of DataFrames: the stacked file
        listings and the stacked sub-folder listings of every folder in
        ``folderslist``. Both are empty DataFrames when ``folderslist`` is
        empty.

    Raises:
        IndexError: If ``folderslabels`` is shorter than ``folderslist``.
    """
    filelists = []
    folderlists = []
    for i, folder in enumerate(folderslist):
        print('getting file and folder contents of box folder ' +
              folderslabels[i])
        # foldercontents generates two dfs: a df with names and ids of files
        # and a df with names and ids of folders
        subfiles, subfolders = foldercontents(folder)
        filelists.append(subfiles)
        folderlists.append(subfolders)

    # DataFrame.append was removed in pandas 2.0, so gather the pieces and
    # concatenate once. pd.concat raises on an empty list, hence the guard.
    bdasfilelist = pd.concat(filelists) if filelists else pd.DataFrame()
    bdasfolderlist = pd.concat(folderlists) if folderlists else pd.DataFrame()
    return bdasfilelist, bdasfolderlist
# The processed file was originally initialized with a one-off:
#     sub4.to_csv(processed_file, index=False)
# On later runs the newly processed rows are concatenated onto the existing
# record instead.

# Stack files4process underneath processed (axis=0 concatenates rows).
# sort=True sorts the column axis when the two frames' columns are not
# already aligned. ignore_index is not set, so each frame keeps its own
# index labels.
newprocessed = pd.concat(
    [processed, files4process],
    axis=0,
    sort=True,
)

# Write the combined record to processed_filename, omitting the index column.
newprocessed.to_csv(processed_filename, index=False)

# This file lives in the Box folder
# "behavioral data/snapshots/Q/raw_allfiles_in_box/". It is not the curated
# BDAS file -- it is only a record of the raw, unprocessed download.
# The first run had to create the file with:
#     box.upload_file(processed_file, 76432368853)
# Subsequent runs just update the existing Box file.
box.update_file(processfile_id, processed_filename)

# In[78]:

# Delete the directory tree at box.cache.
# NOTE(review): in file order this runs before the box.readFile call below;
# confirm the Box client recreates its cache directory when needed.
shutil.rmtree(box.cache)

# In[ ]:

# In[99]:

# Load the spreadsheet identified by cleanestdata. box.readFile supplies the
# input that pandas parses as Excel (presumably a path or file-like object
# for the downloaded file -- confirm).
cleaned = pd.read_excel(box.readFile(cleanestdata))

# In[212]:

# Append the newly processed rows to the cleaned data. ignore_index=True
# renumbers the rows of the result from 0; sort=False leaves the columns
# unsorted.
combined = pd.concat([cleaned, files4process], sort=False, ignore_index=True)