def upl_qap_output(cfg_file):
    '''
    Upload QAP output files from a local output directory to S3, using
    the bucket, prefix, and credentials specified in a YAML config file
    '''

    # Import packages
    from CPAC.AWS import aws_utils, fetch_creds
    import os
    import yaml

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    # Init variables
    bucket_name = cfg_dict["bucket_name"]
    bucket_out_prefix = cfg_dict["bucket_out_prefix"]
    creds_path = cfg_dict["creds_path"]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    output_dir = cfg_dict['output_directory']

    # And upload data
    upl_files = []
    for root, dirs, files in os.walk(output_dir):
        if files:
            upl_files.extend([os.path.join(root, fil) for fil in files])

    # Using CPAC AWS utils
    s3_upl_files = [ufile.replace(output_dir, bucket_out_prefix)
                    for ufile in upl_files]
    aws_utils.s3_upload(bucket, upl_files, s3_upl_files)
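# --- Hedged usage sketch (not part of the original scripts) ---
# upl_qap_output() above documents its expected config only implicitly, so this
# sketch writes out the four keys it reads (bucket_name, bucket_out_prefix,
# creds_path, output_directory). File names and values here are hypothetical
# placeholders, not paths from the original project.
import yaml

example_cfg = {
    'bucket_name': 'my-bucket',                 # hypothetical bucket
    'bucket_out_prefix': 'outputs/qap',         # S3 key prefix to upload under
    'creds_path': '/path/to/aws-keys.csv',      # CSV with AWS credentials
    'output_directory': '/path/to/qap/output',  # local QAP output tree
}

with open('upl_qap_config.yml', 'w') as f:
    yaml.dump(example_cfg, f, default_flow_style=False)

# upl_qap_output('upl_qap_config.yml') would then walk output_directory and
# mirror every file to s3://my-bucket/outputs/qap/... via aws_utils.s3_upload.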
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through Freesurfer's recon-all command, then upload the data back to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    from CPAC.AWS import aws_utils, fetch_creds

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    dl_dir = os.path.join(local_dir, 'inputs')
    subjects_dir = os.path.join(local_dir, 'subjects')

    # Setup logger
    fs_log_path = os.path.join(local_dir, 'download_run_fs_%d.log' % index)
    fs_log = setup_logger('fs_log', fs_log_path, logging.INFO,
                          to_screen=True)

    # Make input and subject dirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    if not os.path.exists(subjects_dir):
        os.makedirs(subjects_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Extract subject of interest
    subj_id = key_list[index]
    s3_path = anat_dict[subj_id]

    # Download data
    fs_log.info('Downloading %s...' % s3_path)
    s3_key = bucket.get_key(s3_path)
    s3_filename = os.path.basename(s3_path)
    dl_filename = os.path.join(dl_dir, subj_id, s3_filename)

    # Make folders if need be
    dl_dirs = os.path.dirname(dl_filename)
    if not os.path.exists(dl_dirs):
        os.makedirs(dl_dirs)
    s3_key.get_contents_to_filename(dl_filename)

    # Execute recon-all
    cmd_list = ['recon-all', '-openmp', '4', '-i', dl_filename,
                '-subjid', subj_id, '-qcache', '-all']
    cmd_str = ' '.join(cmd_list)
    fs_log.info('Executing %s...' % cmd_str)

    # Use subprocess to send command and communicate outputs
    proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    # Stream output
    while proc.poll() is None:
        stdout_line = proc.stdout.readline()
        fs_log.info(stdout_line)

    proc.wait()

    # Gather processed data
    fs_log.info('Gathering outputs for upload to S3...')
    upl_list = []
    subj_dir = os.path.join(subjects_dir, subj_id)
    for root, dirs, files in os.walk(subj_dir):
        if files:
            upl_list.extend([os.path.join(root, fl) for fl in files])

    # Update log with upload info
    fs_log.info('Gathered %d files for upload to S3' % len(upl_list))

    # Build upload list
    upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                              'freesurfer', subj_id)
    s3_upl_list = [upl.replace(subj_dir, upl_prefix) for upl in upl_list]

    # Upload to S3
    aws_utils.s3_upload(bucket, upl_list, s3_upl_list, overwrite=True,
                        make_public=True)
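# --- Hedged helper sketch (assumption, not the original implementation) ---
# main() above calls setup_logger(name, path, level, to_screen=True), which is
# defined elsewhere in the original script. This is a minimal sketch matching
# that call signature; the real helper may differ in format and handlers.
import logging

def setup_logger(logger_name, log_path, level, to_screen=False):
    '''Return a logger that writes to log_path and optionally to the screen'''
    logger = logging.getLogger(logger_name)
    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s : %(message)s')

    # Always log to file
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Optionally mirror messages to the console
    if to_screen:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

    return logger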
fixed = os.path.join(tmp, 'fixed', s3_prefix.split('/')[-2])
orig = os.path.join(tmp, 'orig', s3_prefix.split('/')[-2])
if not os.path.exists(fixed):
    os.makedirs(fixed)
if not os.path.exists(orig):
    os.makedirs(orig)

# Fetch the participants.tsv key from the BIDS dataset and download it to a
# temporary directory. Start by listing all keys under the S3 prefix.
bucket = fetch_creds.return_bucket(creds, 'fcp-indi')
key_list = []
for i, k in enumerate(bucket.list(prefix=s3_prefix)):
    if 'participants.tsv' in str(k.name):
        key_list.append(str(k.name))

# Download the files.
aws_utils.s3_download(bucket, key_list, orig, bucket_prefix=s3_prefix)

# Change NaNs to 'n/a'.
df = pd.read_csv(os.path.join(orig, 'participants.tsv'), sep='\t')
df.to_csv(os.path.join(fixed, 'participants.tsv'), sep='\t', na_rep='n/a',
          header=True, index=False)
aws_utils.s3_upload(bucket,
                    [os.path.join(fixed, 'participants.tsv')],
                    ['/'.join([s3_prefix, 'participants.tsv'])],
                    make_public=True, overwrite=True)
    try:
        aws_utils.s3_download(
            bucket, filestopull, './',
            bucket_prefix='data/Projects/ACPI/Outputs/')
    except:
        print "DL Failed, Trying Again"

    tarname = strat + '_' + tarlist[0] + '_' + tarlist[-1]
    print 'Tarring', tarlist, tarname
    fo.write(tarname + '\n')
    tar = tarfile.open(tarname + '.tar.gz', 'w:gz')
    tar.add(strat + '/')
    tar.close()
    shutil.rmtree(strat)
    aws_utils.s3_upload(
        bucket, [tarname + '.tar.gz'],
        ['data/Projects/ACPI/OutputTars/' + tarname + '.tar.gz'])
    os.remove(tarname + '.tar.gz')
    tarlist = []
elif subsize_gb > 3.2:
    nextlist = []
    print 'TOOBIG', tarlist, subsize_gb
    while subsize_gb > 3.2:
        nextlist.append(tarlist[-1])
        del tarlist[-1]
        subsize_gb = 0
        for subtar in tarlist:
            subsize_gb += sum([
                stratdict[strat][subtar][f]['size']
                for f in stratdict[strat][subtar].keys()
            ])
def upload_dir_contents(ipdir, s3path, bucketname, creds):
    '''
    Upload the top-level contents of a local directory to an S3 path
    '''
    srclist = [os.path.abspath(g) for g in glob.glob(ipdir + '/*')]
    destlist = [s3path + '/' + s.split('/')[-1] for s in srclist]
    bucket = fetch_creds.return_bucket(creds, bucketname)
    aws_utils.s3_upload(bucket, srclist, destlist)
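# --- Hedged usage example (paths are hypothetical placeholders) ---
# upload_dir_contents() mirrors the immediate contents of a local directory to
# an S3 prefix; note glob('<dir>/*') is not recursive, so only top-level
# entries are picked up.
# upload_dir_contents('/path/to/local/results',
#                     'data/Projects/MyStudy/Outputs',
#                     'fcp-indi',
#                     '/path/to/aws-keys.csv')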
for idx, perp in enumerate(perps):
    if 'group_%d' % group_idx not in groups.keys():
        groups['group_%d' % group_idx] = []
    groups['group_%d' % group_idx].append(perp)
    # print groups['group_%d' % group_idx]
    if idx % 3 == 0:
        group_idx += 1

for group in groups:
    groupfiles = []
    for perp in groups[group]:
        groupfiles.extend(
            [fname for fname in release_files[release] if perp in fname])

    # Make relative paths in tar.
    os.chdir(warehouse_dir)
    for name in groupfiles:
        print '%s will be added to %s' % (
            name.replace(warehouse_dir + '/', './'),
            os.path.join(outdir, group + '.tar.gz'))
    if not os.path.isfile(os.path.join(outdir, group + '.tar.gz')):
        with tarfile.open(os.path.join(outdir, group + '.tar.gz'),
                          "w:gz") as tgz:
            for name in groupfiles:
                tgz.add(name.replace(warehouse_dir + '/', './'))
    aws_utils.s3_upload(
        bucket, [os.path.join(outdir, group + '.tar.gz')],
        ['data/Projects/RocklandSample/RawDataTars/' +
         release_folders[release] + '/' + group + '.tar.gz'],
        make_public=True)
    os.remove(os.path.join(outdir, group + '.tar.gz'))
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through the ANTS antsCorticalThickness.sh script, then upload the
    data back to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    import time
    from CPAC.AWS import aws_utils, fetch_creds

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    # Oasis template paths
    oasis_path = '/home/ubuntu/OASIS-30_Atropos_template/'
    # Bucket and S3 dataset prefix
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    # Local dirs for working and download
    dl_dir = os.path.join(local_dir, 'inputs')

    # Setup logger
    act_log_path = '/home/ubuntu/run_act_%d.log' % index
    act_log = setup_logger('act_log', act_log_path, logging.INFO,
                           to_screen=True)

    # Make input and workdirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Extract subject of interest
    subj_id = key_list[index]
    s3_path = anat_dict[subj_id]

    # Init working dir
    working_dir = os.path.join(local_dir, '%s_act_workdir' % subj_id)
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)

    # Download data
    act_log.info('Downloading %s...' % s3_path)
    s3_key = bucket.get_key(s3_path)
    s3_filename = os.path.basename(s3_path)
    dl_filename = os.path.join(dl_dir, subj_id, s3_filename)

    # Make folders if need be
    dl_dirs = os.path.dirname(dl_filename)
    if not os.path.exists(dl_dirs):
        os.makedirs(dl_dirs)
    s3_key.get_contents_to_filename(dl_filename)

    # Create the nipype workflow
    act_wf = create_workflow(working_dir, dl_filename, oasis_path)

    # Run the workflow
    act_log.info('Running the workflow...')

    # Start timing
    start = time.time()
    act_wf.run()
    # Finish timing
    fin = time.time()
    act_log.info('Completed workflow!')

    # Log finish and total computation time
    elapsed = (fin - start)/60.0
    act_log.info('Total time running is: %f minutes' % elapsed)

    # Gather processed data
    act_log.info('Gathering outputs for upload to S3...')
    upl_list = []
    for root, dirs, files in os.walk(working_dir):
        if files:
            upl_list.extend([os.path.join(root, fl) for fl in files])

    # Update log with upload info
    act_log.info('Gathered %d files for upload to S3' % len(upl_list))

    # Build upload list
    upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                              'ants', subj_id)
    s3_upl_list = [upl.replace(working_dir, upl_prefix) for upl in upl_list]

    # Upload to S3
    aws_utils.s3_upload(bucket, upl_list, s3_upl_list)
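# --- Hedged helper sketch (assumption, not the original implementation) ---
# The recon-all and ANTs scripts above both call return_anat_dict(bucket,
# prefix), defined elsewhere, and use it as {subject_id: s3_key_name}. This
# sketch lists anatomical NIfTI keys under the prefix with boto and derives a
# subject id from the key path; the filter and id parsing are assumptions.
def return_anat_dict(bucket, prefix):
    '''Return a dictionary mapping subject id to its anatomical S3 key name'''
    anat_dict = {}
    for key in bucket.list(prefix=prefix):
        key_name = str(key.name)
        # Keep only anatomical NIfTI files (filter is an assumption)
        if key_name.endswith('.nii.gz') and 'anat' in key_name:
            # Assume the subject id is the path component right after prefix
            subj_id = key_name.replace(prefix, '').split('/')[0]
            anat_dict[subj_id] = key_name
    return anat_dict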
def main(sub_idx):

    # Init variables
    bucket_name = 'fcp-indi'
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    config_file = '/home/ubuntu/abide_run/settings/pipeline_config_abide_rerun.yml'
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    local_prefix = '/mnt/eigen_run'
    sublist_file = '/home/ubuntu/abide_run/eig-subs1.yml'

    # Pull in bucket, config, and subject
    sublist = yaml.load(open(sublist_file, 'r'))
    subject = sublist[sub_idx]
    sub_id = subject.split('_')[-1]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    c = Configuration(yaml.load(open(config_file, 'r')))

    # Test to see if they're already uploaded
    to_do = True
    if to_do:
        ## Collect functional_mni list from S3 bucket
        filt_global = 'pipeline_abide_rerun__freq-filter/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/_bandpass_freqs_0.01.0.1/bandpassed_demeaned_filtered_antswarp.nii.gz' % sub_id
        filt_noglobal = filt_global.replace('global1', 'global0')
        nofilt_global = 'pipeline_abide_rerun/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/residual_antswarp.nii.gz' % sub_id
        nofilt_noglobal = nofilt_global.replace('global1', 'global0')
        s3_functional_mni_list = [filt_global, filt_noglobal,
                                  nofilt_global, nofilt_noglobal]
        s3_functional_mni_list = [os.path.join(bucket_prefix, s)
                                  for s in s3_functional_mni_list]

        # Download contents to local inputs directory
        try:
            aws_utils.s3_download(bucket, s3_functional_mni_list,
                                  local_prefix=os.path.join(local_prefix,
                                                            'centrality_inputs'),
                                  bucket_prefix=bucket_prefix)
        except Exception as e:
            print 'Unable to find eigenvector centrality inputs for subject %s, skipping...' % sub_id
            print 'Error: %s' % e
            return

        # Build strat dict (dictionary of strategies and local input paths)
        strat_dict = {'filt_global': os.path.join(local_prefix, 'centrality_inputs', filt_global),
                      'filt_noglobal': os.path.join(local_prefix, 'centrality_inputs', filt_noglobal),
                      'nofilt_noglobal': os.path.join(local_prefix, 'centrality_inputs', nofilt_noglobal),
                      'nofilt_global': os.path.join(local_prefix, 'centrality_inputs', nofilt_global)}

        # Create list of processes
        proc_list = [Process(target=make_workflow,
                             args=(in_name, strat, sub_id, c, local_prefix))
                     for strat, in_name in strat_dict.items()]

        # Iterate through processes and fire off
        for p in proc_list:
            p.start()

        for p in proc_list:
            if p.is_alive():
                p.join()

        # Gather outputs
        wfs = glob.glob(os.path.join(local_prefix, 'eigen_wf_%s_*' % sub_id))
        local_list = []
        for wf in wfs:
            for root, dirs, files in os.walk(wf):
                if files:
                    local_list.extend([os.path.join(root, f) for f in files])

        s3_list = [loc.replace(local_prefix,
                               'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_eigen')
                   for loc in local_list]
        aws_utils.s3_upload(bucket, local_list, s3_list)

        # And delete working directories
        try:
            for input_file in strat_dict.values():
                print 'removing input file %s...' % input_file
                os.remove(input_file)
        except Exception as e:
            print 'Unable to remove input files'
            print 'Error: %s' % e

        work_dirs = glob.glob(os.path.join(local_prefix,
                                           'eigen_wf_%s_*' % sub_id))
        for work_dir in work_dirs:
            print 'removing %s...' % work_dir
            shutil.rmtree(work_dir)
    else:
        print 'subject %s already processed and uploaded, skipping...' % sub_id
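# --- Hedged driver sketch (assumption about how sub_idx is supplied) ---
# The eigenvector-centrality main() above takes a subject index; the original
# script presumably passes one in from a cluster array job or the command
# line. This entry point is illustrative only, not the project's actual one.
if __name__ == '__main__':
    import sys
    main(int(sys.argv[1]))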
localname = k.name.replace(bucketpath, localpath)
if os.path.isfile(localname):
    localname = os.path.abspath(localname)
    while os.path.islink(localname):
        localname = os.readlink(localname)
    x = os.popen('md5sum ' + localname).read()
    localmd5 = str(x.split(' ')[0])
    etag = str(k.etag).replace('"', '')
    if '-' in etag:
        numparts = int(etag.split('-')[-1])
        #print (os.stat(localname).st_size/(1024.0*1024.0))/numparts
        y = os.popen('bash s3etag.sh ' + localname + ' 8').read()
        localetag = y.strip().split(' ')[-1]
        if etag == localetag:
            pass
            #print 'all good', buckname
        elif etag != localetag:
            print 'no bueno', buckname, localetag, etag
            aws_utils.s3_upload(bucket, [localname], [buckname],
                                make_public=True, overwrite=True)
    elif '-' not in etag and localmd5 == etag:
        pass
        #print 'all good', buckname
    elif '-' not in etag and localmd5 != etag:
        print 'no bueno', buckname, localmd5, etag
        aws_utils.s3_upload(bucket, [localname], [buckname],
                            make_public=True, overwrite=True)
else:
    print 'not found locally', buckname
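# --- Hedged sketch of the multipart-ETag check delegated to s3etag.sh above ---
# When an object is uploaded to S3 in multiple parts, its ETag is not a plain
# MD5: it is md5(concatenation of each part's binary MD5) + '-' + part count.
# The snippet above shells out to a script with an 8 MB part size; this is a
# pure-Python sketch of the same convention (part size must match the uploader).
import hashlib

def multipart_etag(filename, part_size_mb=8):
    '''Compute an S3-style multipart ETag for a local file'''
    part_size = part_size_mb * 1024 * 1024
    md5s = []
    with open(filename, 'rb') as f:
        while True:
            chunk = f.read(part_size)
            if not chunk:
                break
            md5s.append(hashlib.md5(chunk))
    if len(md5s) == 1:
        # Single-part uploads carry a plain MD5 ETag with no '-' suffix
        return md5s[0].hexdigest()
    combined = hashlib.md5(b''.join(m.digest() for m in md5s))
    return '%s-%d' % (combined.hexdigest(), len(md5s))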