def upl_qap_output(cfg_file):
    '''
    Upload all files under the configured local output directory to an S3
    bucket, using the bucket name, output prefix, and credentials path read
    from a YAML config file.
    '''

    # Import packages
    from CPAC.AWS import aws_utils, fetch_creds
    import os
    import yaml

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    # Init variables
    bucket_name = cfg_dict["bucket_name"]
    bucket_out_prefix = cfg_dict["bucket_out_prefix"]
    creds_path = cfg_dict["creds_path"]

    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    output_dir = cfg_dict['output_directory']

    # And upload data
    upl_files = []
    for root, dirs, files in os.walk(output_dir):
        if files:
            upl_files.extend([os.path.join(root, fil) for fil in files])

    # Using CPAC AWS utils
    s3_upl_files = [ufile.replace(output_dir, bucket_out_prefix) \
                   for ufile in upl_files]
    aws_utils.s3_upload(bucket, upl_files, s3_upl_files)
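
# A minimal usage sketch for upl_qap_output: write a YAML config containing the
# four keys the function reads, then point the function at it. The bucket name,
# prefix, and paths below are hypothetical placeholders.
import yaml

example_cfg = {'bucket_name': 'my-qap-bucket',
               'bucket_out_prefix': 'outputs/qap',
               'creds_path': '/home/user/aws-keys.csv',
               'output_directory': '/home/user/qap_output'}

with open('/tmp/upl_config.yml', 'w') as cfg_f:
    yaml.dump(example_cfg, cfg_f, default_flow_style=False)

upl_qap_output('/tmp/upl_config.yml')
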
Example 3
def download_outputs(path_prefix, creds_path, bucket_name, qap_type,
                     download_to):
    '''
    Download the QAP output CSVs of the given type ('anat_spatial',
    'func_spatial', or 'func_temporal') found under an S3 path prefix to a
    local directory.
    '''

    import pickle
    from CPAC.AWS import fetch_creds
    from CPAC.AWS.aws_utils import s3_download

    src_list = []

    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    if qap_type == "anat_spatial":
        search_for = "anatomical_spatial"
    elif qap_type == "func_spatial":
        search_for = "functional_spatial"
    elif qap_type == "func_temporal":
        search_for = "functional_temporal"

    for k in bucket.list(prefix=path_prefix):

        k_name = str(k.name)

        if (search_for in k_name) and (".csv" in k_name):

            src_list.append(k_name)

    s3_download(bucket, src_list, download_to)
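
# A usage sketch for download_outputs: fetch the anatomical spatial QAP CSVs
# found under a bucket prefix into a local folder. All arguments shown here
# are hypothetical placeholders.
download_outputs('outputs/qap',                 # path_prefix within the bucket
                 '/home/user/aws-keys.csv',     # creds_path
                 'my-qap-bucket',               # bucket_name
                 'anat_spatial',                # qap_type
                 '/home/user/qap_csvs')         # download_to
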
Example 5
    def test_return_bucket(self):
        '''
        Method to test the fetch_creds.return_bucket() function

        Parameters
        ----------
        self : FetchCredsTestCase
            a unittest.TestCase-inherited class

        Returns
        -------
        None
            this function does not return any values, but tests to make
            sure the fetch_creds.return_bucket() function returns a
            bucket object
        '''

        # Import packages
        import boto.s3

        # Init variables
        err_msg = 'Unable to get the S3 bucket because of faulty AWS '\
                  'credentials or boto package not found'

        # Grab the AWS bucket
        bucket = fetch_creds.return_bucket(self.creds_path,
                                           self.bucket_name)

        # Assert that it is a boto bucket object
        self.assertIsInstance(bucket, boto.s3.bucket.Bucket,
                              msg=err_msg)
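
# The method above expects self.creds_path and self.bucket_name to be set up by
# its test case. A hypothetical minimal wrapper (the credentials path and bucket
# name are placeholders, not CPAC's actual test fixtures):
import unittest


class FetchCredsTestCase(unittest.TestCase):

    def setUp(self):
        # Placeholder credentials CSV and bucket name
        self.creds_path = '/home/user/aws-keys.csv'
        self.bucket_name = 'fcp-indi'

    # test_return_bucket (shown above) would be defined here
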
Example 8
def dl_subj_from_s3(subj_idx, cfg_file, s3_dict_yaml):
    '''
    Download one subject's input files from S3, selected by 1-based index
    into the filepath dictionary stored in s3_dict_yaml, and return that
    subject's dictionary with the S3 bucket prefix swapped for the local
    prefix.
    '''

    # Import packages
    from CPAC.AWS import fetch_creds, aws_utils
    import yaml

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    # Init variables
    bucket_prefix = cfg_dict["bucket_prefix"]
    local_prefix = cfg_dict["local_prefix"]
    bucket_name = cfg_dict["bucket_name"]
    creds_path = cfg_dict["creds_path"]

    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    s3_list = []
    s3_dict = {}

    # pull in S3 dict yaml
    with open(s3_dict_yaml, 'r') as f:
        s3_dict = yaml.load(f)

    if len(s3_dict) == 0:
        err = "\n[!] Filepaths have not been successfully gathered from " \
              "the filepath YAML dictionary!\n"
        raise Exception(err)

    # Get list of subject keys for indexing
    sd_keys = s3_dict.keys()
    sd_keys.sort()

    # Grab subject dictionary of interest
    subj_key = sd_keys[subj_idx - 1]
    sub_dict = s3_dict[subj_key]

    # Download subject data to local prefix
    s3_dl = []
    for s3_key, s3_path in sub_dict.items():
        s3_dl.append(s3_path)
        sub_dict[s3_key] = s3_path.replace(bucket_prefix, local_prefix)

    aws_utils.s3_download(bucket, s3_dl, local_prefix=local_prefix, \
                              bucket_prefix=bucket_prefix)

    sub_dict = {subj_key: sub_dict}

    # Return single subject dictionary
    return sub_dict
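
# dl_subj_from_s3 expects s3_dict_yaml to hold a dictionary keyed by
# (subject, session, scan) tuples, each mapping a resource label to an S3 path
# under bucket_prefix (the structure the pull_S3_sublist examples on this page
# write out). A hypothetical two-subject dictionary, shown as Python rather
# than YAML, with placeholder IDs and paths:
example_s3_dict = {
    ('0050002', 'session_1', 'anat_1'):
        {'anatomical_scan': 'data/project/0050002/session_1/anat_1/mprage.nii.gz'},
    ('0050003', 'session_1', 'anat_1'):
        {'anatomical_scan': 'data/project/0050003/session_1/anat_1/mprage.nii.gz'},
}
# With subj_idx=1, the function downloads the first sorted entry and returns
# {(sub, ses, scan): {'anatomical_scan': <path with bucket_prefix replaced by
# local_prefix>}}.
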
Example 9
def test_bucket_access(creds_path, output_directory, subject_id):
    '''
    Test write access to the S3 output directory by uploading, then deleting,
    a small test file; return True if the write succeeds, False otherwise.
    '''

    # Import packages
    import os
    import botocore.exceptions as bexc
    from CPAC.AWS import fetch_creds

    # Init variables
    s3_str = 's3://'
    test_file = '/tmp/test-output.txt'

    # Explicitly lower-case the "s3"
    if output_directory.lower().startswith(s3_str):
        out_dir_sp = output_directory.split('/')
        out_dir_sp[0] = out_dir_sp[0].lower()
        output_directory = '/'.join(out_dir_sp)

    # Get bucket name
    bucket_name = output_directory.replace(s3_str, '').split('/')[0]

    # Get bucket
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Create local file
    with open(test_file, 'w') as f:
        f.write('test123')

    # Formulate test output key in bucket path output directory
    rel_key_path = output_directory.replace(\
                   os.path.join(s3_str, bucket_name), '').lstrip('/')
    write_test_key = os.path.join(rel_key_path,
                                  'test-output_%s.txt' % subject_id)

    # Attempt a write to bucket
    try:
        bucket.upload_file(test_file, write_test_key)
        print 'Confirmed S3 write access for CPAC output!'
        test_key = bucket.Object(key=write_test_key)
        test_key.delete()
        s3_write_access = True
    # Otherwise we set the access flag to false
    except bexc.ClientError:
        s3_write_access = False

    # Return the access flag
    return s3_write_access
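
# A usage sketch for test_bucket_access, with a placeholder credentials path,
# S3 output directory, and subject ID:
if test_bucket_access('/home/user/aws-keys.csv',
                      's3://my-bucket/outputs/cpac',
                      '0050002'):
    print 'Write access confirmed'
else:
    print 'No write access to the output bucket'
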
Example 10
def s3_download(files_list, local_dir):
    '''
    Download a list of ABIDE output file keys from the fcp-indi bucket to a
    local directory, flattening each S3 key path into a dash-separated
    filename; return the list of local filepaths.
    '''

    # Import packages
    import boto
    import os
    from CPAC.AWS import fetch_creds

    # Init variables
    local_list = []
    bucket = fetch_creds.return_bucket(
        '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv', 'fcp-indi')

    # Pull file keys
    for img_file in files_list:
        # Get file key on S3
        s3_key = bucket.get_key(img_file)
        key_name = str(s3_key.name)

        # Get local name
        key_name_dash = key_name.replace('/', '-')
        local_name = key_name_dash.replace(
            'data-Projects-ABIDE_Initiative-Outputs-', local_dir)

        # Check dirs and make dirs
        dirs_name = os.path.dirname(local_name)
        if not os.path.exists(dirs_name):
            os.makedirs(dirs_name)

        # Download data
        print 'Saving %s to %s...' % (key_name, local_name)
        s3_key.get_contents_to_filename(local_name)

        # Append local list
        local_list.append(local_name)

    # Return local list
    return local_list
Example 13
import sys, os, glob
from CPAC.AWS import aws_utils, fetch_creds

# For checking file integrity between local and uploaded files. Currently only Unix-compatible.
# Local and uploaded directories must have same file structure
# Example: python s3md5sumcheck.py ~/keys-format.csv fcp-indi data/Projects/ABIDE2/RawData/ /home/data/Incoming/abide2/bids_conv/bidsorg/

awscreds = sys.argv[1]
bucketname = sys.argv[2]
bucketpath = sys.argv[3]
localpath = sys.argv[4]
if len(sys.argv) >= 6:
    replace = sys.argv[5]
else:
    replace = None
bucket = fetch_creds.return_bucket(awscreds, bucketname)

for k in bucket.list(prefix=bucketpath):
    buckname = k.name
    print k.name
    localname = k.name.replace(bucketpath, localpath)
    if os.path.isfile(localname):
        localname = os.path.abspath(localname)
        while os.path.islink(localname):
            localname = os.readlink(localname)
        x = os.popen('md5sum ' + localname).read()
        localmd5 = str(x.split(' ')[0])
        etag = str(k.etag).replace('"', '')
        if '-' in etag:
            numparts = int(etag.split('-')[-1])
            #print (os.stat(localname).st_size/(1024.0*1024.0))/numparts
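
# For multipart uploads the S3 ETag is not a plain MD5 of the file: it is the
# MD5 of the concatenated per-part MD5 digests followed by '-<part count>'.
# A sketch of recomputing it locally, assuming the part size used at upload is
# known (the commented line above estimates it from file size / part count):
import hashlib

def multipart_etag(filepath, part_size_bytes):
    '''Recompute an S3-style multipart ETag for a local file (sketch)'''
    part_md5s = []
    with open(filepath, 'rb') as fp:
        while True:
            chunk = fp.read(part_size_bytes)
            if not chunk:
                break
            part_md5s.append(hashlib.md5(chunk).digest())
    combined = hashlib.md5(b''.join(part_md5s))
    return '%s-%d' % (combined.hexdigest(), len(part_md5s))
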
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through ANTS antsCorticalThickness.sh script, then upload the data back
    to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    import time
    from CPAC.AWS import aws_utils, fetch_creds

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    # Oasis template paths
    oasis_path = '/home/ubuntu/OASIS-30_Atropos_template/'
    # Bucket and S3 dataset prefix
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    # Local dirs for working and download
    dl_dir = os.path.join(local_dir, 'inputs')

    # Setup logger
    act_log_path = '/home/ubuntu/run_act_%d.log' % index
    act_log = setup_logger('act_log', act_log_path, logging.INFO, to_screen=True)

    # Make input and workdirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Extract subject of interest
    subj_id = key_list[index]
    s3_path = anat_dict[subj_id]

    # Init working dir
    working_dir = os.path.join(local_dir, '%s_act_workdir' % subj_id)
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)

    # Download data
    act_log.info('Downloading %s...' % s3_path)
    s3_key = bucket.get_key(s3_path)
    s3_filename = os.path.basename(s3_path)
    dl_filename = os.path.join(dl_dir, subj_id, s3_filename)

    # Make folders if need be
    dl_dirs = os.path.dirname(dl_filename)
    if not os.path.exists(dl_dirs):
        os.makedirs(dl_dirs)
    s3_key.get_contents_to_filename(dl_filename)

    # Create the nipype workflow
    act_wf = create_workflow(working_dir, dl_filename, oasis_path)

    # Run the workflow
    act_log.info('Running the workflow...')
    # Start timing
    start = time.time()
    act_wf.run()
    # Finish timing
    fin = time.time()
    act_log.info('Completed workflow!')

    # Log finish and total computation time
    elapsed = (fin - start)/60.0
    act_log.info('Total time running is: %f minutes' % elapsed)

    # Gather processed data
    act_log.info('Gathering outputs for upload to S3...')
    upl_list = []
    for root, dirs, files in os.walk(working_dir):
        if files:
            upl_list.extend([os.path.join(root, fl) for fl in files])
    # Update log with upload info
    act_log.info('Gathered %d files for upload to S3' % len(upl_list))

    # Build upload list
    upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                              'ants', subj_id)
    s3_upl_list = [upl.replace(working_dir, upl_prefix) for upl in upl_list]

    # Upload to S3
    aws_utils.s3_upload(bucket, upl_list, s3_upl_list)
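
# return_anat_dict is not shown on this page; a hypothetical sketch of what it
# could look like, based only on how it is used above (keys are subject IDs,
# values are S3 keys of anatomical images under the prefix). The 'anat' and
# '.nii.gz' filters and the assumed path layout are guesses, not CPAC's actual
# implementation:
def return_anat_dict(bucket, prefix):
    anat_dict = {}
    for key in bucket.list(prefix=prefix):
        k_name = str(key.name)
        if 'anat' in k_name and k_name.endswith('.nii.gz'):
            # Assume a <prefix>/<subject_id>/.../<file>.nii.gz layout
            subj_id = k_name.replace(prefix, '').lstrip('/').split('/')[0]
            # Keep the first anatomical image found per subject
            if subj_id not in anat_dict:
                anat_dict[subj_id] = k_name
    return anat_dict
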
def main():
    '''
    This function runs the main routine
    '''
    # Import packages
    from CPAC.AWS import fetch_creds
    import os
    import yaml

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    sub_fp = '/home/ubuntu/abide/preprocessing/yamls/subs_list.yml'
    sub_list = yaml.load(open(sub_fp, 'r'))
    example_subid = '0050002_session_1'

    # Populate list of files to link to
    #src_list = []
    #src_list = gather_files_tosort(src_list, bucket, bucket_prefix)

    # Derivatives dictionary {name: (no_files_per_strategy, filt_str)}
    strat_dict = {'nofilt_noglobal' : ['pipeline_abide_rerun', 'global0'],
                  'nofilt_global' : ['pipeline_abide_rerun', 'global1'],
                  'filt_noglobal' : ['pipeline_abide_rerun__freq-filter', 'global0'],
                  'filt_global' : ['pipeline_abide_rerun__freq-filter', 'global1']}

    derivs_dict = {'alff' : (1, 'alff_to_standard_smooth', 'nii.gz'),
                   'degree_binarize' : (1, 'centrality_outputs_smoothed', 'degree_centrality_binarize'),
                   'degree_weighted' : (1, 'centrality_outputs_smoothed', 'degree_centrality_weighted'),
                   'dual_regression' : (1, 'dr_tempreg_maps_zstat_stack_to_standard_smooth', 'nii.gz'),
                   'eigenvector_binarize' : (1, 'centrality_outputs_smoothed', 'eigenvector_centrality_binarize'),
                   'eigenvector_weighted' : (1, 'centrality_outputs_smoothed', 'eigenvector_centrality_weighted'),
                   'falff' : (1, 'falff_to_standard_smooth', 'nii.gz'),
                   'func_mask' : (1, 'functional_brain_mask_to_standard', 'nii.gz'),
                   'func_mean' : (1, 'mean_functional_in_mni', 'nii.gz'),
                   'func_preproc' : (1, 'functional_mni', '.nii.gz'),
                   'lfcd' : (1, 'centrality_outputs_smoothed', 'lfcd_binarize'),
                   'reho' : (1, 'reho_to_standard_smooth', 'nii.gz'),
                   'rois_aal' : (4, 'roi_timeseries', 'aal'),
                   'rois_cc200' : (4, 'roi_timeseries', 'CC200'),
                   'rois_cc400' : (4, 'roi_timeseries', 'CC400'),
                   'rois_dosenbach160' : (4, 'roi_timeseries', 'rois_3mm'),
                   'rois_ez' : (4, 'roi_timeseries', 'ez'),
                   'rois_ho' : (4, 'roi_timeseries', 'ho_'),
                   'rois_tt' : (4, 'roi_timeseries', 'tt'),
                   'vmhc' : (1, 'vmhc_fisher_zstd_zstat_map', 'nii.gz')}

    # Create error and output dictionaries
    out_dict = {k : {kk : [] for kk in derivs_dict.keys()} for k in strat_dict.keys()}
    err_dict = {k : {kk : [] for kk in derivs_dict.keys()} for k in strat_dict.keys()}

    # Iterate through strategies
    for strat, filts in strat_dict.items():
        print 'building %s...' % strat
        filt = filts[0]
        g_sig = filts[1]
        strat_prefix = os.path.join(bucket_prefix, filt, example_subid)
        # Iterate through derivatives
        for deriv, v in derivs_dict.items():
            num_files = v[0]
            deriv_folder = v[1]
            name_filter = v[2]
            deriv_prefix = os.path.join(strat_prefix, deriv_folder)
            keys_list = []
            for key in bucket.list(prefix=deriv_prefix):
                k_name = str(key.name)
                # If global signal regression was used or didn't need to be
                if (g_sig in k_name or 'global' not in k_name) and \
                        name_filter in k_name:
                    keys_list.append(k_name)
            # Grab only wanted results from keys
            if len(keys_list) == num_files:
                out_dict[strat][deriv] = [k for k in keys_list if '.nii.gz' in k or '.1D' in k][0]
            else:
                err_dict[strat][deriv] = keys_list
                print 'error in number of files!'

    # Go through dictionary and build paths
    mapping_dict = {}
    s = 1
    # For each subject
    for sub in sub_list:
        subid = sub.split('_')[-1] + '_session_1'
        print 'populating %s...%d' % (subid, s)
        # For each strategy
        for strat, deriv_dict in out_dict.items():
            strat_prefix = os.path.join(bucket_prefix, strat)
            # For each derivative, generate src and dst filepaths
            d = 0
            for deriv, filepath in deriv_dict.items():
                deriv_prefix = os.path.join(strat_prefix, deriv, sub + '_' + deriv)
                # Check extensions
                if filepath.endswith('.nii.gz'):
                    dst_path = deriv_prefix + '.nii.gz'
                elif filepath.endswith('.1D'):
                    dst_path = deriv_prefix + '.1D'
                else:
                    raise Exception('Bad extension type')
                # Get sub id from filepath
                src_path = filepath.replace(example_subid, subid)
                mapping_dict[src_path] = dst_path
                d += 1
            if d != 20:
                print d
                raw_input('not enough derivs')
        s += 1

    # Return
    return out_dict, err_dict, mapping_dict
def pull_S3_sublist(yaml_outpath, img_type, bucket_name, bucket_prefix,
                    creds_path):

    import os
    from CPAC.AWS import fetch_creds
    import yaml

    s3_list = []
    s3_dict = {}

    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Filter for anat/rest
    if img_type == 'anat':
        subkey_type = 'anatomical_scan'
    elif img_type == 'rest':
        subkey_type = 'functional_scan'

    # Build S3-subjects to download
    for bk in bucket.list(prefix=bucket_prefix):
        s3_list.append(str(bk.name))

    # Build dictionary of filepaths
    for sfile in s3_list:

        ssplit = sfile.split('/')

        sub_id = ssplit[-4]

        session_id = ssplit[-3]

        scan_id = ssplit[-2]

        if img_type in scan_id:

            # this ONLY handles raw data inputs, not CPAC-generated outputs!
            if not s3_dict.has_key((sub_id, session_id, scan_id)):

                resource_dict = {}
                resource_dict[subkey_type] = sfile

                s3_dict[(sub_id, session_id, scan_id)] = {}
                s3_dict[(sub_id, session_id, scan_id)].update(resource_dict)

            else:

                s3_dict[(sub_id, session_id, scan_id)].update(resource_dict)

        else:

            continue

    if len(s3_dict) == 0:
        err = "\n[!] Filepaths have not been successfully gathered from " \
              "the S3 bucket!\n"
        raise Exception(err)

    dict_len = len(s3_dict)

    # write yaml file
    with open(yaml_outpath, "wt") as f:
        f.write(yaml.dump(s3_dict))

    if os.path.isfile(yaml_outpath):
        print "\nS3 dictionary file successfully created: %s\n" % yaml_outpath
        print "Total number of subject-session-scans: %d\n" % dict_len
    else:
        err = "\n[!] Filepaths from the S3 bucket have not been " \
              "successfully saved to the YAML file!\nOutput filepath: %s\n" \
              % yaml_outpath
        raise Exception(err)
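
# A usage sketch for pull_S3_sublist, with placeholder bucket details. The
# resulting YAML maps (subject, session, scan) tuples to
# {'anatomical_scan': <S3 key>} entries ('functional_scan' for img_type='rest'):
pull_S3_sublist('/home/user/s3_anat_sublist.yml',    # yaml_outpath
                'anat',                              # img_type
                'my-bucket',                         # bucket_name
                'data/project/RawData',              # bucket_prefix
                '/home/user/aws-keys.csv')           # creds_path
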
from CPAC.AWS import aws_utils, fetch_creds
import os
import re

bucket = fetch_creds.return_bucket(
    '/home/jpellman/jpellman-fcp-indi-keys_oldfmt.csv', 'fcp-indi')

srclist = []
for i, k in enumerate(bucket.list(prefix='data/Projects/ADHD200/RawData')):
    srclist.append(k.name)
    print k.name

srclist = sorted(srclist)
#niis = [os.path.basename(src) for src in srclist if '.nii.gz' in src]
#print set(niis)

matchdct={
    "anat" : [r"(.+)/([0-9]+)/session_([0-9]+)/anat_([0-9]{1,2})/mprage.nii.gz" , r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_run-\4_T1w.nii.gz"] , \
    "func" : [r"(.+)/([0-9]+)/session_([0-9]+)/rest_([0-9]{1,2})/rest.nii.gz" , r"\1/sub-\2/ses-\3/func/sub-\2_ses-\3_task-rest_run-\4_bold.nii.gz"] \
}

srclist_filt = []
destlist = []

for sl in sorted(srclist):
    if re.match(matchdct['anat'][0], sl):
        subbed = re.sub(matchdct['anat'][0], matchdct['anat'][1], sl)
    elif re.match(matchdct['func'][0], sl):
        subbed = re.sub(matchdct['func'][0], matchdct['func'][1], sl)
    else:
        continue
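    # srclist_filt and destlist are never populated in the snippet as shown;
    # the appends below are an assumed continuation, mirroring the
    # s3_match_and_move example further down this page:
    srclist_filt.append(sl)
    destlist.append(subbed)

# A rename/copy step such as aws_utils.s3_rename(bucket, srclist_filt, destlist,
# keep_old=True) would presumably follow, as in the s3_match_and_move example.
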
def cpac_sge_logstats(s3_prefix, str_filt, creds_path, bucket_name):
    '''
    Scan CPAC SGE log files under an S3 prefix, separate completed runs from
    failures, save per-subject run and upload times to YAML files, and return
    the run-time and upload-time dictionaries.
    '''

    # Import packages
    from CPAC.AWS import fetch_creds, aws_utils
    import os
    import numpy as np
    import yaml

    # Init variables
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    log_keys = []
    log_pass = {}
    log_fail = []

    # Get the log file keys
    print 'Finding log S3 keys...'
    for key in bucket.list(prefix=s3_prefix):
        if str_filt in str(key.name):
            log_keys.append(key)

    # Get only tasks that finished
    print 'Searching for complete CPAC runs and getting runtimes...'
    for idx, key in enumerate(log_keys):
        kname = str(key.name)
        # Get log contents as a string in memory
        log_str = key.get_contents_as_string()

        # If it passed cpac running without crashing
        if 'CPAC run complete' in log_str:
            cpac_pass = True
        else:
            cpac_pass = False

        # Split log strings into list
        log_str = log_str.split('\n')

        # If it has 'End' at the end, it ran without crashing
        if 'End' in log_str[-2] and cpac_pass:
            # Get runtimes
            cpac_time, upl_time, num_files, subj = get_cpac_runtimes(log_str)
            log_pass[subj] = (cpac_time, upl_time, num_files)
        else:
            log_fail.append(kname)

        # Update status
        print '%.3f%% complete' % (100*(float(idx)/len(log_keys)))

    # Get stats
    num_subs_pass = len(log_pass)
    num_subs_fail = len(log_fail)

    cpac_times = {sub : times[0] for sub, times in log_pass.items()}
    cpac_mean = np.mean(cpac_times.values())

    upl_times = {sub : times[1] for sub, times in log_pass.items()}
    upl_mean = np.mean(upl_times.values())

    # Save times as yamls
    with open(os.path.join(os.getcwd(), 'cpac_times.yml'), 'w') as f:
        f.write(yaml.dump(cpac_times))
    with open(os.path.join(os.getcwd(), 'upl_times.yml'), 'w') as f:
        f.write(yaml.dump(upl_times))
    with open(os.path.join(os.getcwd(), 'fail_logs.yml'), 'w') as f:
        f.write(yaml.dump(log_fail))

    # Print report
    print 'Number of subjects passed: %d' % len(log_pass)
    print 'Number of subjects failed: %d' % len(log_fail)
    print 'Average CPAC run time: %.3f minutes' % cpac_mean
    print 'Average upload time: %.3f minutes' % upl_mean

    # Return variables
    return cpac_times, upl_times
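
# A usage sketch for cpac_sge_logstats, with placeholder log prefix, filename
# filter, credentials path, and bucket name:
cpac_times, upl_times = cpac_sge_logstats('logs/cpac_runs/',
                                          'cpac_log',
                                          '/home/user/aws-keys.csv',
                                          'my-bucket')
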
Example 19
def upload_dir_contents(ipdir, s3path, bucketname, creds):
    # Import packages
    import glob
    import os
    from CPAC.AWS import aws_utils, fetch_creds

    srclist = [os.path.abspath(g) for g in glob.glob(ipdir + '/*')]
    destlist = [s3path + '/' + s.split('/')[-1] for s in srclist]
    bucket = fetch_creds.return_bucket(creds, bucketname)
    aws_utils.s3_upload(bucket, srclist, destlist)
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through Freesurfer's recon-all command, then upload the data back
    to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    from CPAC.AWS import aws_utils, fetch_creds
    import pycuda.autoinit
    import pycuda.driver as cuda
    from multiprocessing import Process

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    dl_dir = os.path.join(local_dir, 'inputs')
    subjects_dir = os.path.join(local_dir, 'subjects')

    # Setup logger
    fs_log_path = os.path.join(local_dir, 'download_run_fs_%d.log' % index)
    fs_log = setup_logger('fs_log', fs_log_path, logging.INFO, to_screen=True)

    # Make input and subject dirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)

    if not os.path.exists(subjects_dir):
        os.makedirs(subjects_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Determine number of GPUs
    num_gpus=cuda.Device.count()
    subj_id=[]
    proc=[]
    upload_proc=[]
    for inst in range(num_gpus):

        # Set environment variable.
        os.environ['FREESURFER_CUDA_DEVICE'] = str(inst) 
        # Get the index of the subject to be run.
        subj_index = num_gpus*index + inst
        # Extract subject of interest
        subj_id.append(key_list[subj_index])
        s3_path = anat_dict[subj_id[inst]]

        # Download data
        fs_log.info('Downloading %s...' % s3_path)
        s3_key = bucket.get_key(s3_path)
        s3_filename = os.path.basename(s3_path)
        dl_filename = os.path.join(dl_dir, subj_id[inst], s3_filename)
        # Make folders if need be
        dl_dirs = os.path.dirname(dl_filename)
        if not os.path.exists(dl_dirs):
            os.makedirs(dl_dirs)
        s3_key.get_contents_to_filename(dl_filename)

        # Execute recon-all
        cmd_list = ['recon-all', '-use_gpu','-openmp','8', '-time', '-qcache',
                    '-i', dl_filename, '-subjid', subj_id[inst], '-all']
        cmd_str = ' '.join(cmd_list)
        fs_log.info('Executing %s...' % cmd_str)
        # Use subprocess to send command and communicate outputs
        proc.append(subprocess.Popen(cmd_list))

    # Run uploads with multiprocessing's Process
    for inst in range(0,num_gpus):
        proc[inst].wait()

        # Gather processed data
        fs_log.info('Gathering outputs for upload to S3...')
        upl_list = []
        subj_dir = os.path.join(subjects_dir, subj_id[inst])
        for root, dirs, files in os.walk(subj_dir):
            if files:
                upl_list.extend([os.path.join(root, fl) for fl in files])
        # Update log with upload info
        fs_log.info('Gathered %d files for upload to S3' % len(upl_list))

        # Build upload list
        upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                                  'freesurfer_gpu', subj_id[inst])
        s3_upl_list = [upl.replace(subj_dir, upl_prefix) for upl in upl_list]

        # Upload to S3
        upload_proc.append(Process(target=aws_utils.s3_upload,
                                   args=(bucket, upl_list, s3_upl_list)))
        upload_proc[inst].start()

    # Check that uploading has finished. 
    for inst in range(0,num_gpus):
        upload_proc[inst].join()
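
# setup_logger is not shown on this page; a hypothetical sketch matching how it
# is called here (name, log file path, level, optional console echo), built on
# the standard logging module:
import logging

def setup_logger(name, log_path, level, to_screen=False):
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # Always log to the requested file
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(logging.Formatter('%(asctime)s : %(message)s'))
    logger.addHandler(file_handler)
    # Optionally echo log messages to the console as well
    if to_screen:
        logger.addHandler(logging.StreamHandler())
    return logger
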
import os
import sys

from CPAC.AWS import aws_utils, fetch_creds

# Require the AWS creds path, temporary directory, and S3 prefix arguments
if len(sys.argv) < 4:
    print 'Usage: %s <path to AWS creds> <temporary directory> <S3 prefix to BIDS base>' % sys.argv[0]
    sys.exit(1)
creds = sys.argv[1]
tmp = sys.argv[2]
s3_prefix = sys.argv[3]

# Make sure the S3 prefix ends with a slash.
if s3_prefix[-1] != '/':
    s3_prefix += '/'

tmp = os.path.join(tmp, s3_prefix.split('/')[-2])

# Fetch 4 participants from the BIDS dataset and download to a temporary directory.
# Start by fetching all keys.
bucket = fetch_creds.return_bucket(creds, 'fcp-indi')
key_list = []
for i, k in enumerate(bucket.list(prefix=s3_prefix)):
    key_list.append(str(k.name).replace(s3_prefix, ''))

# Fetch all unique participant codes.
participants = [k.split('/')[0] for k in key_list if 'sub-' in k]
participants = sorted(list(set(participants)))
participants = participants[0:4]

downloads_list = [
    os.path.join(s3_prefix, k) for k in key_list
    if ('sub-' in k and k.split('/')[0] in participants) or ('sub-' not in k)
]

# Download the files.
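
# The download itself is not shown in this snippet; using the CPAC.AWS helper
# seen in the other examples on this page, it would presumably be:
aws_utils.s3_download(bucket, downloads_list, local_prefix=tmp,
                      bucket_prefix=s3_prefix)
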
Example 22
from CPAC.AWS import aws_utils, fetch_creds
import tarfile
import os
import shutil
import re
import sys

keyspath=sys.argv[1]

bucket = fetch_creds.return_bucket(keyspath, 'fcp-indi')


# Be sure to include the trailing forward slash, otherwise the prefix may act as a wildcard
ipdir='data/Projects/CORR/RawData/'
opdir='data/Projects/CORR/RawDataBIDs/'

srclist=[]
for i,k in enumerate(bucket.list(prefix=ipdir)):
    srclist.append(k.name)
    print k.name

srclist=sorted(srclist)

matchdct={
'anat' : 
["(.+)/([0-9]+)/session_([0-9]{1,2})/anat_([0-9]{1,2})/anat.nii.gz" ,
r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_run-\4_T1w.nii.gz"],

#'mpi_anat_comp': 
#[r"(.+)/([0-9]+)/session_([0-9]{1,2})/anat_([0-9]{1,2})/anat_([a-z12\_]+).nii.gz" , 
#r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_acq-\5_run-\4_T1w.nii.gz"],
def main(sub_idx):

    # Init variables
    bucket_name = 'fcp-indi'
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    config_file = '/home/ubuntu/abide_run/settings/pipeline_config_abide_rerun.yml'
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    local_prefix = '/mnt/eigen_run'
    sublist_file = '/home/ubuntu/abide_run/eig-subs1.yml'

    # Pull in bucket, config, and subject
    sublist = yaml.load(open(sublist_file, 'r'))
    subject = sublist[sub_idx]
    sub_id = subject.split('_')[-1]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    c = Configuration(yaml.load(open(config_file, 'r')))

    # Test to see if they're already uploaded
    to_do = True

    if to_do:
        ## Collect functional_mni list from S3 bucket
        filt_global = 'pipeline_abide_rerun__freq-filter/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/_bandpass_freqs_0.01.0.1/bandpassed_demeaned_filtered_antswarp.nii.gz' % sub_id
        filt_noglobal = filt_global.replace('global1','global0')
        nofilt_global = 'pipeline_abide_rerun/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/residual_antswarp.nii.gz' % sub_id
        nofilt_noglobal = nofilt_global.replace('global1','global0')
        s3_functional_mni_list = [filt_global, filt_noglobal, nofilt_global, nofilt_noglobal]
        s3_functional_mni_list = [os.path.join(bucket_prefix, s) for s in s3_functional_mni_list]

        # Download contents to local inputs directory
        try:
            aws_utils.s3_download(bucket, s3_functional_mni_list, local_prefix=os.path.join(local_prefix, 'centrality_inputs'), bucket_prefix=bucket_prefix)
        except Exception as e:
            print 'Unable to find eigenvector centrality inputs for subject %s, skipping...' % sub_id
            print 'Error: %s' % e
            return

        # Build strat dict (dictionary of strategies and local input paths)
        strat_dict = {'filt_global' : os.path.join(local_prefix, 'centrality_inputs', filt_global),
                      'filt_noglobal' : os.path.join(local_prefix, 'centrality_inputs', filt_noglobal),
                      'nofilt_noglobal' : os.path.join(local_prefix, 'centrality_inputs', nofilt_noglobal),
                      'nofilt_global' : os.path.join(local_prefix, 'centrality_inputs', nofilt_global)}

        # Create list of processes
        proc_list = [Process(target=make_workflow, args=(in_name, strat, sub_id, c, local_prefix)) for strat, in_name in strat_dict.items()]

        # Iterate through processes and fire off
        for p in proc_list:
            p.start()

        for p in proc_list:
            if p.is_alive():
                p.join()

        # Gather outputs
        wfs = glob.glob(os.path.join(local_prefix, 'eigen_wf_%s_*' % sub_id))
        local_list = []
        for wf in wfs:
            for root, dirs, files in os.walk(wf):
                if files:
                    local_list.extend([os.path.join(root, f) for f in files])

        s3_list = [loc.replace(local_prefix, 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_eigen') for loc in local_list]

        aws_utils.s3_upload(bucket, local_list, s3_list)

        # And delete working directories
        try:
            for input_file in strat_dict.values():
                print 'removing input file %s...' % input_file
                os.remove(input_file % sub_id)
        except Exception as e:
            print 'Unable to remove input files'
            print 'Error: %s' %e

        work_dirs = glob.glob(os.path.join(local_prefix, 'eigen_wf_%s_*' % sub_id))

        for work_dir in work_dirs:
            print 'removing %s...' % work_dir
            shutil.rmtree(work_dir)
    else:
        print 'subject %s already processed and uploaded, skipping...' % sub_id
Example 25
def pull_S3_sublist(yaml_outpath, img_type, cfg_file):

    # function example use:
    #
    # yamlpath = os.path.join(os.getcwd(), "s3dict.yml")
    #
    # # Build entire filepath dictionary from S3
    # s3_dict_yml = pull_S3_sublist(yamlpath, 'anat', args.config)

    import os
    from CPAC.AWS import fetch_creds
    import yaml

    s3_list = []
    s3_dict = {}

    # Load config file
    with open(cfg_file, 'r') as f:
        cfg_dict = yaml.load(f)

    bucket_name = cfg_dict["bucket_name"]
    bucket_prefix = cfg_dict["bucket_prefix"]
    creds_path = cfg_dict["creds_path"]

    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Filter for anat/rest
    if img_type == 'anat':
        subkey_type = 'anatomical_scan'
    elif img_type == 'rest':
        subkey_type = 'functional_scan'

    # Build S3-subjects to download
    for bk in bucket.list(prefix=bucket_prefix):
        s3_list.append(str(bk.name))

    # Build dictionary of filepaths
    for sfile in s3_list:

        ssplit = sfile.split('/')

        sub_id = ssplit[-4]

        session_id = ssplit[-3]

        scan_id = ssplit[-2]

        if img_type in scan_id:

            # this ONLY handles raw data inputs, not CPAC-generated outputs!
            if not s3_dict.has_key((sub_id, session_id, scan_id)):

                resource_dict = {}
                resource_dict[subkey_type] = sfile

                s3_dict[(sub_id, session_id, scan_id)] = {}
                s3_dict[(sub_id, session_id, scan_id)].update(resource_dict)

            else:

                s3_dict[(sub_id, session_id, scan_id)].update(resource_dict)

        else:

            continue

    if len(s3_dict) == 0:
        err = "\n[!] Filepaths have not been successfully gathered from " \
              "the S3 bucket!\n"
        raise Exception(err)

    # write yaml file
    with open(yaml_outpath, "wt") as f:
        f.write(yaml.dump(s3_dict))

    if os.path.isfile(yaml_outpath):
        return yaml_outpath
    else:
        err = "\n[!] Filepaths from the S3 bucket have not been " \
              "successfully saved to the YAML file!\nOutput filepath: %s\n" \
              % yaml_outpath
        raise Exception(err)
Example 27
def s3_match_and_move(keyspath, matchdct, ipdir, opdir, dryrun):
    '''
    A function to match keys in an S3 bucket using regular expressions and
    rename or move them accordingly
    '''

    bucket = fetch_creds.return_bucket(keyspath, 'fcp-indi')

    fo = open('wrongetags.csv', 'a')
    fo.write('src,dest\n')
    fo.close()

    srclist = []

    files_converted = []
    destlist_tot = []

    for i, k in enumerate(bucket.list(prefix=ipdir)):
        srclist.append(k.name)
        #print k.name

    srclist = sorted(srclist)

    for mk in sorted(matchdct.keys()):
        print mk
        print matchdct[mk]['match']
        fo = open('wrongetags.csv', 'a')
        srclist_filt = []
        destlist = []

        for sl in srclist:

            if 'include' in matchdct[mk].keys():

                if re.search(matchdct[mk]['match'][0], sl) and any(
                        m in sl for m in matchdct[mk]['include']):
                    #print sl,re.sub(matchdct[mk][0],matchdct[mk][1],sl)
                    srclist_filt.append(sl)
                    destlist.append(
                        re.sub(matchdct[mk]['match'][0],
                               matchdct[mk]['match'][1],
                               sl).replace(ipdir, opdir))
            else:
                if re.search(matchdct[mk]['match'][0], sl):
                    #print sl,re.sub(matchdct[mk][0],matchdct[mk][1],sl)
                    srclist_filt.append(sl)
                    destlist.append(
                        re.sub(matchdct[mk]['match'][0],
                               matchdct[mk]['match'][1],
                               sl).replace(ipdir, opdir))

        if len(destlist) != len(set(destlist)):
            raise Exception('Duplicate Destination Filepaths exist')

        files_converted = files_converted + srclist_filt
        destlist_tot = destlist_tot + destlist

        if dryrun == 'yes':
            for j, slf in enumerate(srclist_filt):
                if bucket.get_key(destlist[j]):
                    dx = bucket.get_key(destlist[j])
                    sx = bucket.get_key(srclist_filt[j])
                    if dx.etag != sx.etag:
                        print '###### wrong etag ##### changing: ', srclist_filt[
                            j], destlist[j]

                        fo.write(srclist_filt[j] + ',' + destlist[j] + '\n')

                    else:
                        pass  #print 'Already Exists and same etag: ',srclist_filt[j],destlist[j]
                #else:
                #    print 'copying ',srclist_filt[j],destlist[j]
        else:
            # Note: this may error with make_public=True; removing that argument stops the error (cause unknown)
            aws_utils.s3_rename(bucket,
                                srclist_filt,
                                destlist,
                                keep_old=True,
                                make_public=True,
                                overwrite=True)
        fo.close()

    print 'num files pulled in:', len(
        files_converted), 'num files produced', len(destlist_tot)

    if len(files_converted) != len(destlist_tot):
        raise Exception(
            'There is a mismatch in the total files read in, and total files produced'
        )
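
# s3_match_and_move expects matchdct entries of the form
# {name: {'match': [regex, replacement], 'include': [substrings]}}, where the
# 'include' list is optional. A hypothetical example for BIDS-style renaming,
# followed by a dry run (bucket paths are placeholders; the regexes mirror the
# ADHD200 example earlier on this page):
example_matchdct = {
    'anat': {
        'match': [r"(.+)/([0-9]+)/session_([0-9]+)/anat_([0-9]{1,2})/mprage.nii.gz",
                  r"\1/sub-\2/ses-\3/anat/sub-\2_ses-\3_run-\4_T1w.nii.gz"],
    },
    'func': {
        'match': [r"(.+)/([0-9]+)/session_([0-9]+)/rest_([0-9]{1,2})/rest.nii.gz",
                  r"\1/sub-\2/ses-\3/func/sub-\2_ses-\3_task-rest_run-\4_bold.nii.gz"],
        'include': ['session_1'],    # only rename keys containing these substrings
    },
}

s3_match_and_move('/home/user/aws-keys.csv',             # keyspath
                  example_matchdct,
                  'data/Projects/MyStudy/RawData/',      # ipdir
                  'data/Projects/MyStudy/RawDataBIDS/',  # opdir
                  'yes')                                 # dryrun: report only
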
Example 28
from CPAC.AWS import aws_utils, fetch_creds
import tarfile
import os
import shutil

bucket = fetch_creds.return_bucket('/home/ubuntu/doconnor-fcp-indi-keys.csv',
                                   'fcp-indi')

src_list = []
for i, k in enumerate(bucket.list(prefix='data/Projects/ACPI/Outputs/')):
    src_list.append([str(k.name), k.size])

subids = sorted(set([sl[0].split('/')[5].split('-')[0] for sl in src_list]))
strats = sorted(set([sl[0].split('/')[4] for sl in src_list]))
strats = strats[3:]

stratdict = {}
for strat in strats:
    stratdict[strat] = {}
    subdict = {}
    for subid in subids:
        subdict[subid] = {}
        for i, src_file in enumerate(sorted(src_list)):
            if (subid in src_file[0]) and (strat in src_file[0]):
                nme = src_file[0]
                sze = src_file[1]
                propdict = {}
                bits = str(nme).split('/')
                filename = bits[-1]
                propdict['name'] = nme
                propdict['size'] = sze
def main(index, local_dir):
    '''
    Function to download an anatomical dataset from S3 and process it
    through Freesurfer's recon-all command, then upload the data back
    to S3

    Parameters
    ----------
    index : integer
        the index of the subject to process
    local_dir : string
        filepath to the local directory to store the input and
        processed outputs
    '''

    # Import packages
    import boto
    import logging
    import os
    import subprocess
    from CPAC.AWS import aws_utils, fetch_creds

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    bucket = fetch_creds.return_bucket(creds_path, 'fcp-indi')
    prefix = 'data/Projects/CORR/RawData/IBA_TRT/'
    dl_dir = os.path.join(local_dir, 'inputs')
    subjects_dir = os.path.join(local_dir, 'subjects')

    # Setup logger
    fs_log_path = os.path.join(local_dir, 'download_run_fs_%d.log' % index)
    fs_log = setup_logger('fs_log', fs_log_path, logging.INFO, to_screen=True)

    # Make input and subject dirs
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)

    if not os.path.exists(subjects_dir):
        os.makedirs(subjects_dir)

    # Get S3 anatomical paths dictionary
    anat_dict = return_anat_dict(bucket, prefix)

    # Get list of unique subject ids to download
    key_list = sorted(anat_dict.keys())

    # Extract subject of interest
    subj_id = key_list[index]
    s3_path = anat_dict[subj_id]

    # Download data
    fs_log.info('Downloading %s...' % s3_path)
    s3_key = bucket.get_key(s3_path)
    s3_filename = os.path.basename(s3_path)
    dl_filename = os.path.join(dl_dir, subj_id, s3_filename)

    # Make folders if need be
    dl_dirs = os.path.dirname(dl_filename)
    if not os.path.exists(dl_dirs):
        os.makedirs(dl_dirs)
    s3_key.get_contents_to_filename(dl_filename)

    # Execute recon-all
    cmd_list = ['recon-all', '-openmp', '4', '-i', dl_filename,
                '-subjid', subj_id, '-qcache', '-all']
    cmd_str = ' '.join(cmd_list)
    fs_log.info('Executing %s...' % cmd_str)
    # Use subprocess to send command and communicate outputs
    proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # Stream output
    while proc.poll() is None:
        stdout_line = proc.stdout.readline()
        fs_log.info(stdout_line)

    proc.wait()

    # Gather processed data
    fs_log.info('Gathering outputs for upload to S3...')
    upl_list = []
    subj_dir = os.path.join(subjects_dir, subj_id)
    for root, dirs, files in os.walk(subj_dir):
        if files:
            upl_list.extend([os.path.join(root, fl) for fl in files])
    # Update log with upload info
    fs_log.info('Gathered %d files for upload to S3' % len(upl_list))

    # Build upload list
    upl_prefix = os.path.join(prefix.replace('RawData', 'Outputs'),
                              'freesurfer', subj_id)
    s3_upl_list = [upl.replace(subj_dir, upl_prefix) for upl in upl_list]

    # Upload to S3
    aws_utils.s3_upload(bucket, upl_list, s3_upl_list, overwrite=True, make_public=True)