Example 1
def run(config_file,
        subject_list_file,
        p_name=None,
        plugin=None,
        plugin_args=None,
        tracking=True,
        num_subs_at_once=None):
    '''
    Run the C-PAC pipeline for every participant in the subject list,
    either through a cluster job scheduler or locally via multiprocessing.
    '''

    # Import packages
    import commands
    import os
    import pickle
    import time

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    # Init variables
    config_file = os.path.realpath(config_file)
    subject_list_file = os.path.realpath(subject_list_file)

    # Take a date/time stamp to identify this run
    unique_pipeline_id = time.strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = time.strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except Exception as e:
        raise Exception("Error reading config file - {0}\n\nError details:"
                        "\n{1}\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    validate(c)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        with open(subject_list_file, 'r') as sf:
            sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # NOTE: strategies list is only needed in cpac_pipeline prep_workflow for
    # creating symlinks
    strategies = sorted(build_strategies(c))

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']
            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        track_run(level='participant', participants=len(sublist))

    # If we're running on a cluster, hand the jobs off to the scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Create strategies file
        strategies_file = os.path.join(cluster_files_dir, 'strategies.pkl')
        with open(strategies_file, 'w') as f:
            pickle.dump(strategies, f)

        # Check whether it's a Condor job, and run it that way
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, strategies_file, subject_list_file,
                            p_name)
        # Otherwise, submit to the configured cluster scheduler
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                strategies_file, cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If only one participant may run at a time, run serially
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, strategies, 1, pipeline_timing_info,
                              p_name, plugin, plugin_args)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=prep_workflow,
                    args=(sub, c, strategies, 1, pipeline_timing_info, p_name,
                          plugin, plugin_args)) for sub in sublist
        ]

        # If we can run at least as many processes at once as there are
        # subjects, start them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >> pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc:idc + c.numParticipantsAtOnce]:
                        p.start()
                        print >> pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check the status of every job in the queue
                    # (iterate over a snapshot so finished jobs can be
                    # removed safely)
                    for job in list(job_queue):
                        # If the job has finished
                        if not job.is_alive():
                            # Remove it from the queue...
                            print 'found dead job ', job
                            job_queue.remove(job)
                            # ...and start the next available process
                            # (subject), if any remain
                            if idx < len(processes):
                                processes[idx].start()
                                job_queue.append(processes[idx])
                                idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)
        # Close PID txt file to indicate finish
        pid.close()
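
A minimal usage sketch for the version above, assuming the function lives in a module importable as CPAC.pipeline.cpac_runner; the paths, pipeline name, and participant count are placeholders, not values from the source. Note that the snippet also relies on module-level names (yaml, Configuration, Process, validate, build_strategies, track_run) that are imported elsewhere in the original module.

from CPAC.pipeline.cpac_runner import run

# Hypothetical invocation; paths and pipeline name are placeholders
run('/data/configs/pipeline_config.yml',   # pipeline configuration YAML
    '/data/configs/participant_list.yml',  # participant list YAML
    p_name='my_pipeline',                  # takes precedence over c.pipelineName
    num_subs_at_once=2)                    # overrides c.numParticipantsAtOnce
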
Example 2
def run(config_file, subject_list_file, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None, debug=False):
    '''
    Run the C-PAC pipeline for every participant in the subject list,
    with optional debugging outputs and support for S3 output paths.
    '''

    # Import packages
    import commands
    import os
    import pickle
    import time

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    # Init variables
    config_file = os.path.realpath(config_file)
    subject_list_file = os.path.realpath(subject_list_file)

    # Take a date/time stamp to identify this run
    unique_pipeline_id = time.strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = time.strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except Exception as e:
        raise Exception("Error reading config file - {0}\n\nError details:"
                        "\n{1}\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    validate(c)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        with open(subject_list_file, 'r') as sf:
            sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']
            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        track_run(level='participant', participants=len(sublist))

    # If we're running on a cluster, hand the jobs off to the scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check whether it's a Condor job, and run it that way
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # Otherwise, submit to the configured cluster scheduler
        else:
            run_cpac_on_cluster(config_file, subject_list_file, cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If only one participant may run at a time, run serially
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, True, pipeline_timing_info,
                              p_name, plugin, plugin_args)
            return
                
        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=prep_workflow,
                    args=(sub, c, True, pipeline_timing_info,
                          p_name, plugin, plugin_args))
            for sub in sublist
        ]

        # If we can run at least as many processes at once as there are
        # subjects, start them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >>pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc: idc+c.numParticipantsAtOnce]:
                        p.start()
                        print >>pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check the status of every job in the queue
                    # (iterate over a snapshot so finished jobs can be
                    # removed safely)
                    for job in list(job_queue):
                        # If the job has finished
                        if not job.is_alive():
                            # Remove it from the queue...
                            print 'found dead job ', job
                            job_queue.remove(job)
                            # ...and start the next available process
                            # (subject), if any remain
                            if idx < len(processes):
                                processes[idx].start()
                                job_queue.append(processes[idx])
                                idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)
        # Close PID txt file to indicate finish
        pid.close()
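
The throttled multiprocessing loop used in these examples can be isolated into a small standalone helper. The following is only a sketch of the same pattern, not C-PAC code: the names run_throttled, max_concurrent, and poll_interval are invented for illustration, and the sketch additionally guards against starting a process past the end of the list and against mutating the queue while iterating over it.

import time
from multiprocessing import Process


def run_throttled(targets, max_concurrent, poll_interval=2):
    # Start one Process per callable in `targets`, keeping at most
    # `max_concurrent` of them alive at any time (illustrative helper,
    # not part of C-PAC).
    processes = [Process(target=t) for t in targets]
    running = []
    idx = 0
    while idx < len(processes) or running:
        # Reap finished processes (iterate over a snapshot so removal
        # is safe)
        for p in list(running):
            if not p.is_alive():
                p.join()
                running.remove(p)
        # Top the pool back up with the next unstarted processes
        while idx < len(processes) and len(running) < max_concurrent:
            processes[idx].start()
            running.append(processes[idx])
            idx += 1
        # Avoid spinning at 100% CPU between checks
        time.sleep(poll_interval)

Each element of targets is expected to be a zero-argument callable, for example functools.partial(prep_workflow, sub, c, ...).
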
Example 3
def run(subject_list_file, config_file=None, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None, debug=False,
        test_config=False):

    # Import packages
    import commands
    import os
    import pickle
    import time

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources",
                                             "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    config_file = os.path.realpath(config_file)
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths, config) = collect_bids_files_configs(subject_list_file,
                                                          None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths,
                                        config, None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # Take a date/time stamp to identify this run
    unique_pipeline_id = time.strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = time.strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (
            e.problem,
            e.problem_mark.line
        )
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            "    {1}"
            "\n\n".format(config_file, error_detail)
        )
    except Exception as e:
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            "    {1}"
            "\n\n".format(config_file, e)
        )

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the full path to the working "
                      "directory be shorter than 70 characters. "
                      "Long paths might not work on your operating system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            with open(subject_list_file, 'r') as sf:
                sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']
            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant', participants=len(sublist))
        except:
            pass

    # If we're running on a cluster, hand the jobs off to the scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check whether it's a Condor job, and run it that way
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # Otherwise, submit to the configured cluster scheduler
        else:
            run_cpac_on_cluster(config_file, subject_list_file, cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If only one participant may run at a time, run serially
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, True, pipeline_timing_info,
                              p_name, plugin, plugin_args, test_config)
            return
                
        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=prep_workflow,
                    args=(sub, c, True, pipeline_timing_info,
                          p_name, plugin, plugin_args, test_config))
            for sub in sublist
        ]

        # If we can run at least as many processes at once as there are
        # subjects, start them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >>pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc: idc+c.numParticipantsAtOnce]:
                        p.start()
                        print >>pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check the status of every job in the queue
                    # (iterate over a snapshot so finished jobs can be
                    # removed safely)
                    for job in list(job_queue):
                        # If the job has finished
                        if not job.is_alive():
                            # Remove it from the queue...
                            print 'found dead job ', job
                            job_queue.remove(job)
                            # ...and start the next available process
                            # (subject), if any remain
                            if idx < len(processes):
                                processes[idx].start()
                                job_queue.append(processes[idx])
                                idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)
        # Close PID txt file to indicate finish
        pid.close()
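
Example 3 additionally accepts a BIDS dataset directory in place of a YAML participant list: when the path does not look like a YAML file, the subject list is built with bids_gen_cpac_sublist, and a missing config_file falls back to the packaged pipeline_config_template.yml. A hypothetical invocation sketch follows; the paths are placeholders and the module path CPAC.pipeline.cpac_runner is assumed.

from CPAC.pipeline.cpac_runner import run

# YAML participant list with an explicit pipeline configuration
run('/data/configs/participant_list.yml',
    config_file='/data/configs/pipeline_config.yml')

# BIDS dataset directory; config_file defaults to the packaged
# pipeline_config_template.yml, and test_config is forwarded through
# to prep_workflow
run('/data/bids_dataset', test_config=True)
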