Ejemplo n.º 1
import pycurl
import sys
import shutil
from openpyxl import load_workbook
import pandas as pd
import download.box
from io import BytesIO
import numpy as np
import subprocess
from scipy import stats


from download.box import LifespanBox
# Local directory handed to LifespanBox as its cache.
box_temp='/home/petra/UbWinSharedSpace1/boxtemp' #location of local copy of curated data
box = LifespanBox(cache=box_temp)

verbose = True
# Today's date formatted as MM_DD_YYYY.
# NOTE(review): `datetime` is not imported in the visible part of this script -- confirm it is imported elsewhere.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')
#catfromdate=max of last run--'2019-06-17'

#ped file will have all the nda vars from intradb, plus randomization status and


Ejemplo n.º 2
import subprocess

import pandas as pd
from config import config

from download.box import LifespanBox
from download.pennCNP import PennCNP
from download.redcap import Redcap

# verbose = False
verbose = True
# Today's date formatted as MM_DD_YYYY.
# NOTE(review): `datetime` is not imported in the visible part of this script -- confirm it is imported elsewhere.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')
# Cache location read from the project config; passed to LifespanBox below.
ksads_cache_path = config['dirs']['cache']['ksads']

# connect to Box
box = LifespanBox(cache=ksads_cache_path, config_file=config['box'])
# presumably the PennCNP snapshot location -- verify against the config file
site_file = config['PennCNP']['snapshot']

def loadYaml(filename):
    """Parse the YAML document stored at ``filename``.

    Returns the object produced by ``yaml.load`` with ``SafeLoader``, or
    ``None`` when ``filename`` does not exist on disk.
    """
    if os.path.exists(filename):
        with open(filename, 'r') as stream:
            parsed = yaml.load(stream, Loader=yaml.SafeLoader)
        return parsed

    # Nothing to read: signal absence with None.
    return None

def main():
    parser = argparse.ArgumentParser(
        description="Downloads the data from PennCNP")
    user_group = parser.add_mutually_exclusive_group()
Ejemplo n.º 3
import os
import sys
import shutil
import pandas
from download.box import LifespanBox

verbose = True

# Parent of the directory that contains this file.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Local cache directory (<root_dir>/cache/toolbox), also handed to LifespanBox below.
cache_space = os.path.join(root_dir, 'cache', 'toolbox')
# Path of the combined CSV inside the cache directory.
combined_path = os.path.join(cache_space, 'Toolbox_Combined.csv')
# presumably the Box folder id of the Toolbox folder -- verify
toolbox_folder_id = 42902161768
box = LifespanBox(cache=cache_space)

# label_errors = []
# instrument_errors = []

# hca_path = os.path.join(root_dir, 'store', 'toolbox-hca-instruments.txt')
# with open(hca_path) as f:
#     hca_instruments = f.read().splitlines()

# hcd_path = os.path.join(root_dir, 'store', 'toolbox-hcd-instruments.txt')
# with open(hcd_path) as f:
#     hcd_instruments = f.read().splitlines()

# par_path = os.path.join(root_dir, 'store', 'toolbox-parent-instruments.txt')
# with open(par_path) as f:
#     par_instruments = f.read().splitlines()
import os, datetime
import pandas as pd

from download.box import LifespanBox
import sys

verbose = True
#verbose = False
# Today's date formatted as MM_DD_YYYY.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

#Two types of files to curate: the so-called raw data from which scores are generated, and the scores themselves.
#connect to Box (to get the latest curated data)
box_temp = '/home/petra/UbWinSharedSpace1/boxtemp'  #location of local copy of curated data
box = LifespanBox(cache=box_temp)
# Absolute path to a REDCap config CSV on the author's machine.
redcapconfigfile = "/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/.boxApp/redcapconfig.csv"

# Read from the local cache directory, so the CSV must already be present in box_temp.
# presumably the list of records to drop from the curated data -- verify
removelist = pd.read_csv(
    os.path.join(box_temp, 'RemoveFromCurated_perTrello27May2020.csv'))


#get list of filenames
# presumably Box folder ids -- verify
WashuD = 84801037257
curated = 82804015457

# NOTE(review): foldercontents is not defined in the visible part of this script; it is unpacked
# here into a (files, folders) pair.
wudfiles, wudfolders = foldercontents(WashuD)
#wudfiles2, wudfolders2=folderlistcontents(wudfolders.foldername,wudfolders.folder_id)
data4process = wudfiles.loc[(wudfiles.filename.str.contains('aw_') == True) |
Ejemplo n.º 5
####initiate data that is required for both scores and raw data types#####
import os

import pandas as pd

from download import redcap
from download.box import LifespanBox

verbose = True
# Today's date formatted as MM_DD_YYYY.
# NOTE(review): `datetime` is not imported in the visible part of this script -- confirm it is imported elsewhere.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')
root_cache = '/data/intradb/tmp/box2nda_cache/'
# don't delete the cache at the end of this program until the endpoint machine is
# back up and running
cache_space = os.path.join(root_cache, 'endpointmachine/lifespan')
# Box client using cache_space as its local cache.
box = LifespanBox(cache=cache_space)

root_store = '/home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/'
# this will be the place to save any snapshots on the nrg servers
store_space = os.path.join(root_store, 'toolbox')
# Create the store directory on first run; an already-existing directory is fine.
try:
    os.mkdir(store_space)
except FileExistsError:
    print("store already exists")
except OSError as err:
    # Stay best-effort (do not abort the script), but report the real cause
    # instead of claiming the directory exists.
    print("could not create store %s: %s" % (store_space, err))

# prep basic redcap data ####################
# need this so sites and studies can be assigned to curated data rows (if missing).
# Data can't be split by site unless this info is included somehow.
moredata = redcap.getfullredcapdata()
# set these vars to missing because they belong to the child of the parent and could conflict
# (different QC programs look for this type of inconsistency between REDCap databases)
Ejemplo n.º 6
 def __init__(self, box=None):
     """Keep the Box client to work with.

     When ``box`` is omitted (``None``), a ``LifespanBox`` is created with
     its default arguments and used instead.
     """
     self.box = LifespanBox() if box is None else box
Ejemplo n.º 7
root_store = '/home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/'
# this will be the place to save any snapshots on the nrg servers
store_space = os.path.join(root_store, 'eprime')
# Create the store directory on first run; an already-existing directory is fine.
try:
    os.mkdir(store_space)
except FileExistsError:
    print("store already exists")
except OSError as err:
    # Stay best-effort (do not abort the script), but report the real cause
    # instead of claiming the directory exists.
    print("could not create store %s: %s" % (store_space, err))

processed_file = os.path.join(store_space,
available_box_files = os.path.join(cache_space, 'AllBoxFiles_Eprime.csv')

# generate the box object which contains the necessary client config to
# talk to Box, and set up the cache space
box = LifespanBox(cache=cache_space)

# this section will
# necessary to search folders with
# NOTE(review): the two comment lines above appear to be truncated -- restore from the original script.
# generate list of all files in Q directories and identify those that don't follow the pattern
# each of the site folders contains individual folders
# presumably labels and ids are parallel lists (same order) -- verify
sitefolderslabels = ["WUHCD", "UCLAHCD"]  # ,"UMNHCASUB"]
sitefolderlist = [41361544018, 61956482658]

# get folder contents for all the sites including the known subfolder of individuals folders
# folderlistcontents generates two dfs: a df with names and ids of files
# and a df with names and ids of folders
# NOTE(review): folderlistcontents is not defined in the visible part of this script.
superfilelist, superfolderlist = folderlistcontents(
    sitefolderlist)  # 2378 files and 1 folders as of 5/22/2019
if (superfilelist.shape[0] == 0):
Ejemplo n.º 8
import pandas as pd
import numpy as np
from config import config
from download import redcap

from download.box import LifespanBox
from download.redcap import Redcap

verbose = True
# Today's date formatted as MM_DD_YYYY.
# NOTE(review): `datetime` is not imported in the visible part of this script -- confirm it is imported elsewhere.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')
# Cache location read from the project config; passed to LifespanBox below.
ksads_cache_path = config['dirs']['cache']['ksads']
# this will be the place to save any snapshots on the nrg servers
store_space = config['dirs']['store']['ksads']

# connect to Box
box = LifespanBox(cache=ksads_cache_path, config_file=config['box'])
# NOTE(review): this rebinds `redcap`, which the imports above bound to the
# `download.redcap` module; from here on the name refers to the Redcap instance.
redcap = Redcap(config['redcap']['config'])
assessments = config['Assessments']
sites = config['Sites']
# verbose = False

# snapshot folder (used to be the combined folder)
ksads_snapshotfolderid = config['ksads_snapshotfolderid']
snapshotQCfolder = config['snapshotQCfolder']

# download one of the identical key files which contain the labels for
# all the numbered questions in KSADS
cachekeyfile = box.downloadFile(config['cachekeyfile'])

def main():
# In[3]:

verbose = True
# Today's date in ISO form (YYYY-MM-DD); embedded in the combined filename below.
snapshotdate = datetime.datetime.today().strftime('%Y-%m-%d')
# Cache and store locations, both read from the project config.
cache_space = config['dirs']['cache']['qint']
store_space = config['dirs']['store']['qint']

# Output/input file paths: two under the store directory, one under the cache directory.
processed_filename = os.path.join(
    store_space, 'ProcessedBoxFiles_AllRawData_Qinteractive.csv')
combined_filename = os.path.join(
    store_space, 'HCA-HCD_Allsites_QandRAVLT_%s.xlsx' % snapshotdate)
available_box_files = os.path.join(cache_space, 'AllBoxFiles_Qinteractive.csv')

# In[4]:

# Box client using cache_space as its local cache.
box = LifespanBox(cache=cache_space)

# In[5]:

# Numeric id -> site/study label.
# presumably the keys are Box folder ids -- verify
sites = {
    18446355408: 'WUHCD',
    18446433567: 'WUHCA',
    18446318727: 'UMNHCD',
    18446298983: 'UMNHCA',
    18446352116: 'UCLAHCD',
    18446404271: 'UCLAHCA',
    18446321439: 'HARVHCD',
    18446404071: 'MGHHCA',
    #    47239506949: 'UMNHCASUB'
}
# Numeric id -> label for the two BDAS folders.
bdas_folders = {75755393630: 'BDAS_HCD', 75755777913: 'BDAS_HCA'}
Ejemplo n.º 10
import os, datetime
import pandas as pd

from download.box import LifespanBox

verbose = True
#verbose = False
# Today's date formatted as MM_DD_YYYY.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

#Two types of files to curate: the so-called raw data from which scores are generated, and the scores themselves.
#connect to Box (to get the latest curated data)
box_temp='/home/petra/UbWinSharedSpace1/boxtemp' #location of local copy of curated data
box = LifespanBox(cache=box_temp)

#start with data that is the result of extensive QC effort from sites.
#keep track of expected and observed IDs
#curate list of TBX issues.
#pull in data (by ID) that is not on the list of issues

#get list of filenames
# NOTE(review): Harvard, foldercontents and folderlistcontents are not defined in the
# visible part of this script -- confirm they are defined elsewhere.
harvardfiles, harvardfolders=foldercontents(Harvard)
# Second call is fed the foldername and folder_id columns of the folders result above.
harvardfiles2, harvardfolders2=folderlistcontents(harvardfolders.foldername,harvardfolders.folder_id)

Ejemplo n.º 11
import pandas as pd
from download.box import LifespanBox
from download.redcap import Redcap
from config import config

redcap = Redcap()

verbose = True
# Today's date formatted as MM_DD_YYYY.
# NOTE(review): `datetime` is not imported in the visible part of this script -- confirm it is imported elsewhere.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

# Column list plus cache/store locations, all read from the project config.
columnnames = config['QIntColumns']
cache_space = config['dirs']['cache']['qint']
store_space = config['dirs']['store']['qint']

# connect to Box
box = LifespanBox(cache=cache_space)

# snapshot folder (used to be the combined folder)
q_snapshotfolderid = 48203213208
snapshotQCfolder = 76434619813
# Id passed to box.readFile below to load the base spreadsheet.
cleanestdata = 465568117756

# %%

# Load the spreadsheet that box.readFile returns for the id above.
baseclean = pd.read_excel(box.readFile(cleanestdata))
# Rows whose source is 'perm-missing'; taken before the select_4clean filter on the next line.
basecleanexcluded = baseclean.loc[baseclean.source == 'perm-missing']
# Keep only rows with select_4clean == 1.
baseclean = baseclean.loc[baseclean.select_4clean == 1]
# Strip '-' characters from the `row` column.
baseclean.row = baseclean.row.str.replace('-', '')

# Per-assessment count of non-null values in every other column.
asslist = baseclean.groupby('assessment').count()
Ejemplo n.º 12
cache_space = os.path.join(root_cache, 'eprime')
except BaseException:
    print("cache already exists")

root_store = '/home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/'
# this will be the place to save any snapshots on the nrg servers
store_space = os.path.join(root_store, 'eprime')
# Create the store directory on first run; an already-existing directory is fine.
try:
    os.mkdir(store_space)
except FileExistsError:
    print("store already exists")
except OSError as err:
    # Stay best-effort (do not abort the script), but report the real cause
    # instead of claiming the directory exists.
    print("could not create store %s: %s" % (store_space, err))

# connect to Box
box = LifespanBox(cache=cache_space)
# Relative path: it is resolved against the current working directory of the process.
redcap = Redcap('../tmp/.boxApp/redcapconfig.csv')

# snapshot folder (used to be the combined folder)
# presumably Box folder / file ids -- verify
e_snapshotfolderid = 82670538107
snapshotQCfolder = 76434619813
slimfolder = 82670800769  # (for data dictionaries)
cleanestdata = 495490047901

# Coordinator monthly update process is to run eprime_getraw.py to 'download' all of the individual records from the Box
# UCLA and WU upload folders for individual subjects. The python program converts the text files in these folders into rows
# of data for a given subject. The coordinator's role is to check for new rows. The eprime_getraw program appends new data
# to the ProcessedBoxFiles_AllRawData_Eprime.csv file under snapshots/ePrimeDD/raw_allfiles_in_box.
# Note: this Box file is also synced with /home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/eprime/ProcessedBoxFiles_AllRawData_Eprime.csv
# File ids in the store are getting rounded and converted when saved to Box, so if you need file ids, grab them from the store.
# After running eprime_getraw.py, open the current (and cumulatively cleaned) 'database' under BDAS/