Example 1
def run(subject_list_file, config_file=None, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None, debug=False, test_config=False):

    # Import packages
    import commands
    import os
    import pickle
    import time
    import warnings

    import yaml
    from multiprocessing import Process
    from time import strftime

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    # Configuration, track_run, run_condor_jobs and run_cpac_on_cluster are
    # assumed to be provided by the enclosing C-PAC module.

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources",
                                             "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    config_file = os.path.realpath(config_file)
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths, config) = collect_bids_files_configs(subject_list_file,
                                                          None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths,
                                        config, None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (
            e.problem,
            e.problem_mark.line
        )
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            "    {1}"
            "\n\n".format(config_file, error_detail)
        )
    except Exception as e:
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            "    {1}"
            "\n\n".format(config_file, e)
        )

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the working directory full path "
                      "should have less then 70 characters. "
                      "Long paths might not work in your operational system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            with open(subject_list_file, 'r') as sf:
                sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']
            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_'+ str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_'+ str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant', participants=len(sublist))
        except:
            pass

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file, cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, True, pipeline_timing_info,
                              p_name, plugin, plugin_args, test_config)
            return
                
        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=prep_workflow,
                    args=(sub, c, True, pipeline_timing_info,
                          p_name, plugin, plugin_args, test_config))
            for sub in sublist
        ]

        # If we have enough slots for every participant, start them all at once
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >>pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc: idc+c.numParticipantsAtOnce]:
                        p.start()
                        print >>pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print 'found dead job ', job
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)
        # Close PID txt file to indicate finish
        pid.close()
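
This first example is written for Python 2 (note print statements such as print >>pid, p.pid). A rough sketch of how run() might be invoked follows; the paths are placeholders, not part of the original example.

# Hypothetical invocation of run() from Example 1 (placeholder paths):
if __name__ == '__main__':
    run('/data/bids_dataset',                        # BIDS dir or data-config YAML
        config_file='/configs/pipeline_config.yml',  # pipeline configuration YAML
        num_subs_at_once=2)                          # cap concurrent participants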
Example 2
def run(subject_list_file,
        config_file=None,
        p_name=None,
        plugin=None,
        plugin_args=None,
        tracking=True,
        num_subs_at_once=None,
        debug=False,
        test_config=False):

    # Import packages
    import subprocess
    import os
    import pickle
    import time
    import glob
    import warnings

    import yaml
    from multiprocessing import Process
    from time import strftime

    from CPAC.pipeline.cpac_pipeline import run_workflow

    # Configuration, track_run, run_condor_jobs, run_cpac_on_cluster and
    # anat_longitudinal_wf are assumed to be provided by the enclosing
    # C-PAC module.

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources",
                                             "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths,
         config) = collect_bids_files_configs(subject_list_file, None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths, config,
                                        None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    config_file = os.path.realpath(config_file)
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.safe_load(open(config_file, 'r')))
    except IOError:
        print("config file %s doesn't exist" % config_file)
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (e.problem, e.problem_mark.line)
        raise Exception("Error parsing config file: {0}\n\n"
                        "Error details:\n"
                        "    {1}"
                        "\n\n".format(config_file, error_detail))
    except Exception as e:
        raise Exception("Error parsing config file: {0}\n\n"
                        "Error details:\n"
                        "    {1}"
                        "\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the working directory full path "
                      "should have less then 70 characters. "
                      "Long paths might not work in your operational system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            sublist = yaml.safe_load(open(subject_list_file, 'r'))
    except:
        print("Subject list is not in proper YAML format. Please check " \
              "your file")
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']
            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print("\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n")
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant' if not test_config else 'test',
                      participants=len(sublist))
        except:
            print("Usage tracking failed for this run.")

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                cluster_files_dir)

    # Run on one computer
    else:
        # Create working dir
        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)
        '''
        if not os.path.exists(c.logDirectory):
            try:
                os.makedirs(c.logDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the log " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.logDirectory
                raise Exception(err)
        '''

        # BEGIN LONGITUDINAL TEMPLATE PIPELINE
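        # The block below groups the participant list into sessions per
        # subject, runs the anatomical longitudinal workflow for any subject
        # with more than one session, then walks c.outputDirectory to attach
        # the longitudinal outputs to each session's 'resource_pool' before
        # the main per-participant pipeline runs (or exits if neither the
        # anatomical nor the functional main pipeline is enabled).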
        if hasattr(c, 'run_longitudinal') and ('anat' in c.run_longitudinal or
                                               'func' in c.run_longitudinal):
            subject_id_dict = {}
            for sub in sublist:
                if sub['subject_id'] in subject_id_dict:
                    subject_id_dict[sub['subject_id']].append(sub)
                else:
                    subject_id_dict[sub['subject_id']] = [sub]

            # subject_id_dict maps each subject_id to the list of that
            # participant's sessions
            valid_longitudinal_data = False
            for subject_id, sub_list in subject_id_dict.items():
                if len(sub_list) > 1:
                    valid_longitudinal_data = True
                    if 'func' in c.run_longitudinal:
                        raise Exception(
                            "\n\n[!] Error: Functional longitudinal pipeline is still in development and will be available in next release. Please only run anatomical longitudinal pipeline for now.\n\n"
                        )
                    if 'anat' in c.run_longitudinal:
                        strat_list = anat_longitudinal_wf(
                            subject_id, sub_list, c)
                elif len(sub_list) == 1:
                    warnings.warn(
                        "\n\nThere is only one anatomical session for sub-%s. Longitudinal preprocessing will be skipped for this subject.\n\n"
                        % subject_id)
                # TODO
                # if 'func' in c.run_longitudinal:
                #     strat_list = func_preproc_longitudinal_wf(subject_id, sub_list, c)
                #     func_longitudinal_template_wf(subject_id, strat_list, c)

            if valid_longitudinal_data:
                rsc_file_list = []
                for dirpath, dirnames, filenames in os.walk(c.outputDirectory):
                    for f in filenames:
                        # TODO is there a better way to check output folder name?
                        if f != '.DS_Store' and 'pipeline_analysis_longitudinal' in dirpath:
                            rsc_file_list.append(os.path.join(dirpath, f))

                subject_specific_dict = {
                    subj: []
                    for subj in subject_id_dict.keys()
                }
                session_specific_dict = {
                    os.path.join(session['subject_id'], session['unique_id']):
                    []
                    for session in sublist
                }
                for rsc_path in rsc_file_list:
                    key = [
                        s for s in session_specific_dict.keys()
                        if s in rsc_path
                    ]
                    if key:
                        session_specific_dict[key[0]].append(rsc_path)
                    else:
                        subj = [
                            s for s in subject_specific_dict.keys()
                            if s in rsc_path
                        ]
                        if subj:
                            subject_specific_dict[subj[0]].append(rsc_path)

                # update individual-specific outputs:
                # anatomical_brain, anatomical_brain_mask and anatomical_reorient
                for key in session_specific_dict.keys():
                    for f in session_specific_dict[key]:
                        sub, ses = key.split('/')
                        ses_list = [
                            subj for subj in sublist
                            if sub in subj['subject_id']
                            and ses in subj['unique_id']
                        ]
                        if len(ses_list) > 1:
                            raise Exception(
                                "There are several files containing " + f)
                        if len(ses_list) == 1:
                            ses = ses_list[0]
                            subj_id = ses['subject_id']
                            tmp = f.split(c.outputDirectory)[-1]
                            keys = tmp.split(os.sep)
                            if keys[0] == '':
                                keys = keys[1:]
                            if len(keys) > 1:
                                if ses.get('resource_pool') is None:
                                    ses['resource_pool'] = {
                                        keys[0].split(c.pipelineName + '_')[-1]:
                                        {
                                            keys[-2]: f
                                        }
                                    }
                                else:
                                    strat_key = keys[0].split(c.pipelineName +
                                                              '_')[-1]
                                    if ses['resource_pool'].get(
                                            strat_key) is None:
                                        ses['resource_pool'].update(
                                            {strat_key: {
                                                keys[-2]: f
                                            }})
                                    else:
                                        ses['resource_pool'][strat_key].update(
                                            {keys[-2]: f})

                for key in subject_specific_dict:
                    for f in subject_specific_dict[key]:
                        ses_list = [
                            subj for subj in sublist if key in subj['anat']
                        ]
                        for ses in ses_list:
                            tmp = f.split(c.outputDirectory)[-1]
                            keys = tmp.split(os.sep)
                            if keys[0] == '':
                                keys = keys[1:]
                            if len(keys) > 1:
                                if ses.get('resource_pool') is None:
                                    ses['resource_pool'] = {
                                        keys[0].split(c.pipelineName + '_')[-1]:
                                        {
                                            keys[-2]: f
                                        }
                                    }
                                else:
                                    strat_key = keys[0].split(c.pipelineName +
                                                              '_')[-1]
                                    if ses['resource_pool'].get(
                                            strat_key) is None:
                                        ses['resource_pool'].update(
                                            {strat_key: {
                                                keys[-2]: f
                                            }})
                                    else:
                                        if keys[-2] in ('anatomical_brain',
                                                        'anatomical_brain_mask',
                                                        'anatomical_skull_leaf'):
                                            pass
                                        elif ('apply_warp_anat_longitudinal_to_standard'
                                              in keys[-2] or
                                              'fsl_apply_xfm_longitudinal' in keys[-2]):
                                            # TODO update!!!
                                            # it assumes session id == last key (ordered by session count instead of session id) + 1
                                            # might cause problem if session id is not continuous
                                            def replace_index(
                                                    target1, target2,
                                                    file_path):
                                                index1 = file_path.index(
                                                    target1) + len(target1)
                                                index2 = file_path.index(
                                                    target2) + len(target2)
                                                file_str_list = list(file_path)
                                                file_str_list[index1] = "*"
                                                file_str_list[index2] = "*"
                                                file_path_updated = "".join(
                                                    file_str_list)
                                                file_list = glob.glob(
                                                    file_path_updated)
                                                file_list.sort()
                                                return file_list

                                            if ses['unique_id'] == str(
                                                    int(keys[-2][-1]) + 1):
                                                if keys[-3] == 'seg_probability_maps':
                                                    f_list = replace_index(
                                                        'seg_probability_maps_',
                                                        'segment_prob_', f)
                                                    ses['resource_pool'][
                                                        strat_key].update(
                                                            {keys[-3]: f_list})
                                                elif keys[
                                                        -3] == 'seg_partial_volume_files':
                                                    f_list = replace_index(
                                                        'seg_partial_volume_files_',
                                                        'segment_pve_', f)
                                                    ses['resource_pool'][
                                                        strat_key].update(
                                                            {keys[-3]: f_list})
                                                else:
                                                    ses['resource_pool'][
                                                        strat_key].update({
                                                            keys[-3]:
                                                            f  # keys[-3]: 'anatomical_to_standard'
                                                        })
                                        elif keys[-2] != 'warp_list':
                                            ses['resource_pool'][
                                                strat_key].update(
                                                    {keys[-2]: f})
                                        elif keys[-2] == 'warp_list':
                                            if 'ses-' + ses['unique_id'] in tmp:
                                                ses['resource_pool'][
                                                    strat_key].update(
                                                        {keys[-2]: f})

                for key in subject_specific_dict:
                    ses_list = [
                        subj for subj in sublist if key in subj['anat']
                    ]
                    for ses in ses_list:
                        for reg_strat in strat_list:
                            try:
                                ss_strat_list = list(ses['resource_pool'])
                                for strat_key in ss_strat_list:
                                    try:
                                        ses['resource_pool'][strat_key].update(
                                            {
                                                'registration_method':
                                                reg_strat[
                                                    'registration_method']
                                            })
                                    except KeyError:
                                        pass
                            except KeyError:
                                pass

                yaml.dump(sublist,
                          open(
                              os.path.join(c.workingDirectory,
                                           'data_config_longitudinal.yml'),
                              'w'),
                          default_flow_style=False)

                print('\n\n' + 'Longitudinal pipeline completed.' + '\n\n')

                # skip main preprocessing
                if 1 not in c.runAnatomical and 1 not in c.runFunctional:
                    import sys
                    sys.exit()

        # END LONGITUDINAL TEMPLATE PIPELINE

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                run_workflow(sub, c, True, pipeline_timing_info, p_name,
                             plugin, plugin_args, test_config)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=run_workflow,
                    args=(sub, c, True, pipeline_timing_info, p_name, plugin,
                          plugin_args, test_config)) for sub in sublist
        ]

        # If we have enough slots for every participant, start them all at once
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print(p.pid, file=pid)

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc:idc + c.numParticipantsAtOnce]:
                        p.start()
                        print(p.pid, file=pid)
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print('found dead job ', job)
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)
        # Close PID txt file to indicate finish
        pid.close()
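
Both examples manage local parallelism by hand rather than with a pool: up to c.numParticipantsAtOnce multiprocessing.Process workers are started, and the while loop polls the job queue, starting the next participant whenever a running process finishes. The standalone sketch below illustrates the same bounded job-queue pattern; the worker function and participant labels are hypothetical stand-ins for prep_workflow / run_workflow.

# Standalone sketch of the bounded job-queue pattern used above.
# The worker and participant labels are hypothetical stand-ins.
import time
from multiprocessing import Process


def worker(participant):
    # Stand-in for prep_workflow / run_workflow.
    time.sleep(1)
    print('finished', participant)


def run_bounded(participants, max_at_once=2, poll_seconds=2):
    processes = [Process(target=worker, args=(p,)) for p in participants]

    # Few enough jobs: start everything at once.
    if len(processes) <= max_at_once:
        for p in processes:
            p.start()
        for p in processes:
            p.join()
        return

    # Start the first batch, then replace each finished job with the next one.
    job_queue = []
    idx = 0
    for p in processes[:max_at_once]:
        p.start()
        job_queue.append(p)
        idx += 1
    while idx < len(processes) or job_queue:
        for job in list(job_queue):
            if not job.is_alive():
                job_queue.remove(job)
                if idx < len(processes):
                    processes[idx].start()
                    job_queue.append(processes[idx])
                    idx += 1
        time.sleep(poll_seconds)  # avoid spinning at 100% CPU


if __name__ == '__main__':
    run_bounded(['sub-01', 'sub-02', 'sub-03', 'sub-04', 'sub-05'])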