def SimpleJobExample(configuration_item_name, userid, userpw=None):
    """ Dummy workflow to test the install

    Parameters
    ----------
    configuration_item_name: str
        the name of the configuration item (ex. "Gabriel")
    userid: str
        user name on the server side
    userpw: str (optional)
        user password to login the server using ssh.
        If you want to use "id_rsa.pub", just leave userpw to None
        To copy the public key on the server use ssh-copy-id -i name@server.
    """
    job_1 = Job(command=["sleep", "5"], name="job 1")
    job_2 = Job(command=["sleep", "5"], name="job 2")
    job_3 = Job(command=["sleep", "5"], name="job 3")
    job_4 = Job(command=["sleep", "5"], name="job 4")

    jobs = [job_1, job_2, job_3, job_4]
    dependencies = [(job_1, job_2),
                    (job_1, job_3),
                    (job_2, job_4),
                    (job_3, job_4)]

    workflow = Workflow(jobs=jobs,
                        dependencies=dependencies)

    controller = WorkflowController(configuration_item_name, userid, userpw)

    controller.submit_workflow(workflow=workflow,
                               name="TestConnectionExample")
Example #3
    def iteration_run(self):
        # this method should be replaced by a call to
        # pipeline_workflow.workflow_from_pipeline()
        # (but first, the iteration has to be an actual pipeline)
        from soma_workflow.client import Job, Workflow, WorkflowController

        print('ITERATION RUN')
        jobs = {}
        for i, process in enumerate(self.list_process_iteration):
            jobs['job' + str(i)] = Job(command=process.command())

        wf = Workflow(jobs=list(jobs.values()), name='test')
        # Helper.serialize('/tmp/test_wf',wf)
        controller = WorkflowController()
        controller.submit_workflow(workflow=wf, name='test run')
Example #4
    def setup_connection(cls, resource_id, login, password):
        cls.login = login
        cls.password = password
        cls.resource_id = resource_id
        cls.wf_ctrl = WorkflowController(resource_id, login, password)
        cls.transfer_timeout = -24
        cls.jobs_timeout = 1
        cls.job_examples = JobExamples(cls.wf_ctrl, 'python',
                                       cls.transfer_timeout, cls.jobs_timeout)
Example #5
    def _setup_soma_workflow_controller(self, create_new=False):
        resource_id, login, password, rsa_key_pass \
            = self.get_soma_workflow_credentials()
        config_file_path = swconf.Configuration.search_config_path()
        try:
            sw_config = swconf.Configuration.load_from_file(
                resource_id, config_file_path)
        except swconf.ConfigurationError:
            sw_config = None
            resource_id = None
        if self._workflow_controller is None or create_new:
            self._workflow_controller = WorkflowController(
                resource_id, login, password=None, config=sw_config,
                rsa_key_pass=None)
            self._delete_old_workflows()
Example #6
def remote_map_marshal(func, largs=None, lkwargs=None, mode='local'):

    if largs is None:
        if lkwargs is not None:
            largs = [[]] * len(lkwargs)
        else:
            largs = []

    if lkwargs is None:
        lkwargs = [{}] * len(largs)

    lkwargs = [merge_default_kwargs(func, kw) for kw in lkwargs]

    assert len(lkwargs) == len(largs)

    all_args = zip(largs, lkwargs)

    if mode == 'local':
        return [func(*args, **kwargs) for args, kwargs in all_args]
    elif mode == 'local_with_dumps':

        func_fn = './func.marshal'
        dump_func(func, func_fn)
        results = []
        for i, params in enumerate(all_args):
            print 'params:', params
            params_fn = 'params_%d.pck' % i
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            output_fn = 'output_%d.pck' % i
            print 'call subprocess ...'
            subprocess.call(['python', '-c', cfunc_marshal, params_fn,
                             func_fn, output_fn])
            print 'Read outputs ...'
            fout = open(output_fn)
            results.append(cPickle.load(fout))
        return results
    elif mode == 'remote_cluster':
        # FileTransfer creation for input files
        #data_dir = './rmap_data'
        data_dir = mkdtemp(prefix="sw_rmap")
        func_fn = op.join(data_dir, 'func.marshal')
        dump_func(func, func_fn)
        func_file = FileTransfer(is_input=True,
                                 client_path=func_fn,
                                 name="func_file")

        all_jobs = []
        param_files = []
        for i, params in enumerate(all_args):
            params_fn = op.join(data_dir, 'params_%d.pck' % i)
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            param_file = FileTransfer(is_input=True,
                                      client_path=params_fn,
                                      name='params_file_%d' % i)
            param_files.append(param_file)
            output_fn = op.join(data_dir, 'output_%d.pck' % i)
            output_file = FileTransfer(is_input=False,
                                       client_path=output_fn,
                                       name='output_file_%d' % i)
            job = Job(command=['python', '-c', cfunc, param_file, func_file,
                               output_file],
                      name="rmap, item %d" % i,
                      referenced_input_files=[func_file, param_file],
                      referenced_output_files=[output_file])
            all_jobs.append(job)

        workflow = Workflow(jobs=all_jobs, dependencies=[])
        # submit the workflow
        cfg = pyhrf.cfg['parallel-cluster']
        controller = WorkflowController(cfg['server_id'], cfg['user'])

        # controller.transfer_files(fids_to_transfer)
        wf_id = controller.submit_workflow(
            workflow=workflow, name="remote_map")

        Helper.transfer_input_files(wf_id, controller)

        Helper.wait_workflow(wf_id, controller)

        Helper.transfer_output_files(wf_id, controller)

        results = []
        for i in xrange(len(all_args)):
            fout = open(op.join(data_dir, 'output_%d.pck' % i))
            results.append(cPickle.load(fout))
            fout.close()
        return results
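
A hedged usage sketch for the helper above, run in 'local' mode so that no cluster configuration is needed; the toy function and argument lists are made up, and merge_default_kwargs is assumed to be the module-level helper that only fills in missing keyword defaults:

def add(a, b=0):
    return a + b

# 'local' mode simply maps the function over the argument lists in-process;
# 'remote_cluster' would marshal the function and ship it through soma-workflow
results = remote_map_marshal(add, largs=[[1], [3]],
                             lkwargs=[{'b': 10}, {'b': 20}])
# expected: [11, 23]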
Example #7
from soma_workflow.client import Job, Group, Workflow, WorkflowController

jobs = []
dependencies = []
group_elements = []

first_job = Job(command=["sleep", "10"], name="first job")
last_job = Job(command=["sleep", "10"], name="last job")

jobs.append(first_job)
jobs.append(last_job)

for i in range(0, 30):
    job = Job(command=["sleep", "60"], name="job " + repr(i))

    jobs.append(job)

    dependencies.append((first_job, job))
    dependencies.append((job, last_job))

    group_elements.append(job)

thirty_jobs_group = Group(elements=group_elements, name="my 30 jobs")

workflow = Workflow(jobs=jobs,
                    dependencies=dependencies,
                    root_group=[first_job, thirty_jobs_group, last_job])

login = '******'
password = '******'
controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="Simple workflow with group")
Example #8
def morphologist_all(t1file, sid, outdir, study="morphologist", waittime=10,
                     somaworkflow=False,
                     spmexec="/i2bm/local/spm8-standalone/run_spm8.sh",
                     spmdir="/i2bm/local/spm8-standalone"):
    """ Performs all the Morphologist steps.

    Steps:

    1- Ensure image orientation and reorient it if needed (Prepare Subject for
       Anatomical Pipeline).
    2- Computation of a brain mask (Brain Mask Segmentation).
    3- Computation of a mask for each hemisphere (Split Brain Mask).
    4- A grey/white classification of each hemisphere to perform "Voxel Based
       Morphometry" (Grey White Classification) and spherical triangulation of
       cortical hemispheres (Grey White Surface).
    5- Spherical triangulation of the external interface of the cortex of one
       or two hemispheres (Get Spherical Hemi Surface).
    6- Computation of a graph representing the cortical fold topography
       (Cortical Fold Graph).
    7- Automatic identification of the cortical sulci (Automatic Sulci
       Recognition), located in the "sulci" toolbox.

    The execution is performed with soma_workflow, which has to be installed
    in the bv_env environment.

    To check the workflow submission, use the 'soma_workflow_gui' command.

    If the input 't1file' does not have the expected extension, an Exception
    will be raised.
    If the $outdir/$study/$sid folder has already been created, an Exception
    will be raised.

    Parameters
    ----------
    t1file: str (mandatory)
        the path to a ".nii.gz" anatomical T1 weighted file.
    sid: str (mandatory)
        a subject identifier.
    outdir: str (mandatory)
        the morphologist output files will be written in $outdir/$study/$sid.
    study: str (mandatory)
        the name of the study.
    waittime: float (optional, default 10)
        a delay (in seconds) used to check the workflow status.
    somaworkflow: bool (optional, default False)
        if True, use soma-workflow for the execution.
    spmexec: str (optional)
        the path to the standalone SPM execution file.
    spmdir: str (optional)
        the standalone SPM directory.

    Returns
    -------
    wffile: str
        a file containing the submitted workflow.
    wfid: int
        the submitted workflow identifier.
    wfstatus: str
        the submitted workflow status after 'waittime' seconds.
    """
    # Check roughly the input file extension
    if not t1file.endswith(".nii.gz"):
        raise Exception("'{0}' is not a COMPRESSED NIFTI file.".format(t1file))

    # Create a configuration for the morphologist study
    study_config = StudyConfig(
        modules=StudyConfig.default_modules + ["FomConfig", "BrainVISAConfig"])
    study_dict = {
        "name": "morphologist_fom",
        "input_directory": outdir,
        "output_directory": outdir,
        "input_fom": "morphologist-auto-nonoverlap-1.0",
        "output_fom": "morphologist-auto-nonoverlap-1.0",
        "shared_fom": "shared-brainvisa-1.0",
        "spm_directory": spmdir,
        "use_soma_workflow": True,
        "use_fom": True,
        "spm_standalone": True,
        "use_matlab": False,
        "volumes_format": "NIFTI gz",
        "meshes_format": "GIFTI",
        "use_spm": True,
        "spm_exec": spmexec,
        "study_config.somaworkflow_computing_resource": "localhost",
        "somaworkflow_computing_resources_config": {
            "localhost": {
            }
        }
    }
    study_config.set_study_configuration(study_dict)

    # Create the morphologist pipeline
    pipeline = get_process_instance(
        "morphologist.capsul.morphologist.Morphologist")
    morphologist_pipeline = process_with_fom.ProcessWithFom(
        pipeline, study_config)
    morphologist_pipeline.attributes = dict(
        (trait_name, getattr(morphologist_pipeline, trait_name))
        for trait_name in morphologist_pipeline.user_traits())
    morphologist_pipeline.attributes["center"] = "morphologist"
    morphologist_pipeline.attributes["subject"] = sid
    morphologist_pipeline.create_completion()

    # Create morphologist expected tree
    # ToDo: use ImportT1 from axon
    subjectdir = os.path.join(outdir, study, sid)
    if os.path.isdir(subjectdir):
        raise Exception("Folder '{0}' already created.".format(subjectdir))
    os.makedirs(os.path.join(
        subjectdir, "t1mri", "default_acquisition",
        "default_analysis", "folds", "3.1", "default_session_auto"))
    os.makedirs(os.path.join(
        subjectdir, "t1mri", "default_acquisition",
        "registration"))
    os.makedirs(os.path.join(
        subjectdir, "t1mri", "default_acquisition",
        "segmentation", "mesh"))
    os.makedirs(os.path.join(
        subjectdir, "t1mri", "default_acquisition",
        "tmp"))

    # Copy T1 file in the morphologist expected location
    destfile = os.path.join(subjectdir, "t1mri",
                            "default_acquisition", sid + ".nii.gz")
    shutil.copy(t1file, destfile)

    # Create source_referential morphologist expected file
    source_referential = {"uuid": str(soma.uuid.Uuid())}
    referential_file = os.path.join(
        subjectdir, "t1mri", "default_acquisition", "registration",
        "RawT1-{0}_default_acquisition.referential".format(sid))
    attributes = "attributes = {0}".format(json.dumps(source_referential))
    with open(referential_file, "w") as openfile:
        openfile.write(attributes)

    # Create a workflow from the morphologist pipeline
    workflow = Workflow(name="{0} {1}".format(study, sid),
                        jobs=[])
    workflow.root_group = []

    # Create the workflow
    wf = pipeline_workflow.workflow_from_pipeline(
        morphologist_pipeline.process, study_config=study_config)
    workflow.add_workflow(wf, as_group="{0}_{1}".format(study, sid))
    wffile = os.path.join(subjectdir, "{0}.wf".format(study))
    with open(wffile, "wb") as wfdesc:
        pickle.dump(workflow, wfdesc)

    # Execute the workflow with somaworkflow
    if somaworkflow:
        controller = WorkflowController()
        wfid = controller.submit_workflow(
            workflow=workflow, name="{0}_{1}".format(study, sid))

        # Return the workflow status after execution
        while True:
            time.sleep(waittime)
            wfstatus = controller.workflow_status(wfid)
            if wfstatus not in [
                    "worklflow_not_started", "workflow_in_progress"]:
                break

    # Execute the workflow with subprocess
    else:
        # -> construct the ordered list of commands to be executed
        workflow_repr = workflow.to_dict()
        graph = Graph()
        for job in workflow_repr["jobs"]:
            graph.add_node(GraphNode(job, None))
        for link in workflow_repr["dependencies"]:
            graph.add_link(link[0], link[1])
        ordered_nodes = [str(node[0]) for node in graph.topological_sort()]
        commands = []
        jobs = workflow_repr["serialized_jobs"]
        temporaries = workflow_repr["serialized_temporary_paths"]
        barriers = workflow_repr["serialized_barriers"]
        for index in ordered_nodes:
            if index in jobs:
                commands.append(jobs[index]["command"])
            elif index in barriers:
                continue
            else:
                raise Exception("Unexpected node in workflow.")

        # -> Go through all commands
        tmpmap = {}
        for cmd in commands:
            # -> deal with temporary files
            for index, item in enumerate(cmd):
                if not isinstance(item, basestring):
                    if str(item) not in tmpmap:
                        if str(item) in temporaries:
                            struct = temporaries[str(item)]
                            name = cmd[2].split(";")[1].split()[-1]
                            tmppath = os.path.join(
                                subjectdir, "t1mri", "default_acquisition",
                                "tmp", str(item) + name + struct["suffix"])
                            tmpmap[str(item)] = tmppath
                        else:
                            raise MorphologistError(
                                "Can't complete command '{0}'.".format(
                                    cmd))
                    cmd[index] = tmpmap[str(item)]

            # -> execute the command
            worker = MorphologistWrapper(cmd)
            worker()
            if worker.exitcode != 0:
                raise MorphologistRuntimeError(
                    " ".join(worker.cmd), worker.stderr)

        wfstatus = "Done"
        wfid = "subprocess"

    return wffile, wfid, wfstatus
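
A hedged example of how the function above might be called; the paths and the subject identifier are placeholders:

# placeholder inputs; somaworkflow=True submits the pipeline through
# soma-workflow instead of running the commands with subprocesses
wffile, wfid, wfstatus = morphologist_all(
    t1file="/data/study/sub01_T1.nii.gz",
    sid="sub01",
    outdir="/data/morphologist_out",
    somaworkflow=True)
print(wffile, wfid, wfstatus)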
Example #9
from soma_workflow.client import Job, Workflow, WorkflowController, SharedResourcePath, FileTransfer

# SharedResourcePath creation for the input file.
# The input file is read directly from the data directory located on
# the computing resource side.
myfile = SharedResourcePath(relative_path="myfile",
                            namespace="MyApp",
                            uuid="my_example_dir")

# FileTransfer creation for the output file.
# That way the output file will not be written in the data directory
# located on the computing resource file system.
copy_of_myfile = FileTransfer(
    is_input=False,
    client_path="/tmp/soma_workflow_examples/copy_of_myfile",
    name="copy of my file")

# Job and Workflow creation
copy_job = Job(command=["cp", myfile, copy_of_myfile],
               name="copy",
               referenced_input_files=[],
               referenced_output_files=[copy_of_myfile])

workflow = Workflow(jobs=[copy_job], dependencies=[])

# workflow submission
controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="shared resource path example")
Example #10
class SomaWorkflowRunner(Runner):
    WORKFLOW_NAME_SUFFIX = "Morphologist user friendly analysis"

    def __init__(self, study):
        super(SomaWorkflowRunner, self).__init__(study)

        self._workflow_controller = None
        self._init_internal_parameters()

    def get_soma_workflow_credentials(self):
        resource_id = self._study.somaworkflow_computing_resource

        config_file_path = swconf.Configuration.search_config_path()
        resource_list = swconf.Configuration.get_configured_resources(
            config_file_path)
        login_list = swconf.Configuration.get_logins(config_file_path)
        login = None
        if resource_id in login_list:
            login = login_list[resource_id]

        password = None
        rsa_key_pass = None

        return resource_id, login, password, rsa_key_pass

    def _init_internal_parameters(self):
        self._workflow_id = None
        self._jobid_to_step = {} # subjectid -> (job_id -> step)
        self._cached_jobs_status = None

    def resource_id(self):
        if self._workflow_controller is None:
            resource_id = None
        else:
            resource_id = self._workflow_controller._resource_id
        return resource_id

    def update_controller(self):
        resource_id = self._study.somaworkflow_computing_resource
        if resource_id != self.resource_id():
            self._setup_soma_workflow_controller(create_new=True)

    def set_study(self, study):
        super(SomaWorkflowRunner, self).set_study(study)
        self.update_controller()

    def _setup_soma_workflow_controller(self, create_new=False):
        resource_id, login, password, rsa_key_pass \
            = self.get_soma_workflow_credentials()
        config_file_path = swconf.Configuration.search_config_path()
        try:
            sw_config = swconf.Configuration.load_from_file(
                resource_id, config_file_path)
        except swconf.ConfigurationError:
            sw_config = None
            resource_id = None
        if self._workflow_controller is None or create_new:
            self._workflow_controller = WorkflowController(
                resource_id, login, password=None, config=sw_config,
                rsa_key_pass=None)
            self._delete_old_workflows()

    def _delete_old_workflows(self):
        for (workflow_id, (name, _)) \
                in six.iteritems(self._workflow_controller.workflows()):
            if name is not None and name.endswith(self.WORKFLOW_NAME_SUFFIX):
                self._workflow_controller.delete_workflow(workflow_id)

    def run(self, subject_ids=ALL_SUBJECTS):
        self._setup_soma_workflow_controller()
        self._init_internal_parameters()
        if self._workflow_controller.scheduler_config:
            # in local mode only
            cpus_number = self._cpus_number()
            self._workflow_controller.scheduler_config.set_proc_nb(cpus_number)
        if subject_ids == ALL_SUBJECTS:
            subject_ids = self._study.subjects
        # setup shared path in study_config
        study_config = self._study
        swf_resource = study_config.somaworkflow_computing_resource
        if not self._workflow_controller.scheduler_config:
            # remote config only
            # FIXME: must check if brainvisa shared dir is known in translation
            # config in soma-workflow
            if not study_config.somaworkflow_computing_resources_config.trait(
                    swf_resource):
                setattr(study_config.somaworkflow_computing_resources_config,
                        swf_resource, {})
            resource_conf = getattr(
                study_config.somaworkflow_computing_resources_config,
                swf_resource)
            path_translations = resource_conf.path_translations
            setattr(path_translations, study_config.shared_directory,
                    ['brainvisa', 'de25977f-abf5-9f1c-4384-2585338cd7af'])

        #self._check_input_files(subject_ids)
        workflow = self._create_workflow(subject_ids)
        jobs = [j for j in workflow.jobs if isinstance(j, Job)]
        if self._workflow_id is not None:
            self._workflow_controller.delete_workflow(self._workflow_id)
        if len(jobs) == 0:
            # empty workflow: nothing to do
            self._workflow_id = None
            return
        self._workflow_id = self._workflow_controller.submit_workflow(
            workflow, name=workflow.name)
        self._build_jobid_to_step()

        # run transfers, if any
        Helper.transfer_input_files(self._workflow_id,
                                    self._workflow_controller)
        # the status does not change immediately after run,
        # so we wait for the status WORKFLOW_IN_PROGRESS or timeout
        status = self._workflow_controller.workflow_status(self._workflow_id)
        try_count = 8
        while (status != sw.constants.WORKFLOW_IN_PROGRESS
               and try_count > 0):
            time.sleep(0.25)
            status = self._workflow_controller.workflow_status(
                self._workflow_id)
            try_count -= 1

    def _cpus_number(self):
        cpus_count = multiprocessing.cpu_count()
        cpus_settings = settings.runner.selected_processing_units_n
        if cpus_settings > cpus_count:
            print("Warning: bad setting value:\n" +
                  "  (selected_processing_units_n=%d) " % cpus_settings +
                  "> number of available processing units: %d" % cpus_count)
            cpus_number = min(cpus_settings, cpus_count)
        else:
            cpus_number = cpus_settings
        return cpus_number

    def _create_workflow(self, subject_ids):
        study_config = self._study
        workflow = Workflow(
            name='Morphologist UI - %s' % study_config.study_name,
            jobs=[])
        workflow.root_group = []
        initial_vol_format = study_config.volumes_format

        priority = (len(subject_ids) - 1) * 100
        for subject_id in subject_ids:
            analysis = self._study.analyses[subject_id]
            subject = self._study.subjects[subject_id]

            analysis.set_parameters(subject)
            #analysis.propagate_parameters()
            pipeline = analysis.pipeline
            pipeline.enable_all_pipeline_steps()
            # force highest priority normalization method
            # FIXME: specific knowledge of Morphologist should not be used here.
            pipeline.Normalization_select_Normalization_pipeline \
                  = 'NormalizeSPM'
            pipeline_tools.disable_runtime_steps_with_existing_outputs(
                pipeline)

            missing = pipeline_tools.nodes_with_missing_inputs(pipeline)
            if missing:
                self.check_missing_models(pipeline, missing)
                print('MISSING INPUTS IN NODES:', missing)
                raise MissingInputFileError("subject: %s" % subject_id)

            wf = pipeline_workflow.workflow_from_pipeline(
                pipeline, study_config=study_config,
                jobs_priority=priority)
            njobs = len([j for j in wf.jobs if isinstance(j, Job)])
            if njobs != 0:
                priority -= 100
                workflow.jobs += wf.jobs
                workflow.dependencies += wf.dependencies
                group = Group(wf.root_group,
                            name='Morphologist %s' % str(subject))
                group.user_storage = subject_id
                workflow.root_group.append(group) # += wf.root_group
                workflow.groups += [group] + wf.groups

        return workflow

    def check_missing_models(self, pipeline, missing):
        if 'SulciRecognition.SPAM_recognition09.global_recognition' in missing:
            node = missing[
                'SulciRecognition.SPAM_recognition09.global_recognition']
            model = [m[0] for m in node if m[0] == 'model']
            if model:
                raise MissingModelsError(
                    "SPAM recognition models are not installed.")

    def _build_jobid_to_step(self):
        self._jobid_to_step = {}
        workflow = self._workflow_controller.workflow(self._workflow_id)
        for group in workflow.groups:
            subjectid = group.user_storage
            if subjectid:
                self._jobid_to_step[subjectid] = BidiMap(
                    'job_id', 'step_id')
                job_list = list(group.elements)
                while job_list:
                    job = job_list.pop(0)
                    if isinstance(job, Group):
                        job_list += job.elements
                    else:
                        job_att = workflow.job_mapping.get(job)
                        if job_att:
                            job_id = job_att.job_id
                            step_id = job.user_storage or job.name
                            self._jobid_to_step[subjectid][job_id] = step_id
                        else:
                            print('job without mapping, subject: %s, job: %s'
                                  % (subjectid, job.name))

    def _define_workflow_name(self):
        return self._study.name + " " + self.WORKFLOW_NAME_SUFFIX

    def is_running(self, subject_id=None, step_id=None, update_status=True):
        status = self.get_status(subject_id, step_id, update_status)
        return status == Runner.RUNNING

    def get_running_step_ids(self, subject_id, update_status=True):
        if update_status:
            self._update_jobs_status()
        running_step_ids = self._get_subject_filtered_step_ids(
            subject_id, Runner.RUNNING)
        return running_step_ids

    def wait(self, subject_id=None, step_id=None):
        if subject_id is None and step_id is None:
            Helper.wait_workflow(
                self._workflow_id, self._workflow_controller)
        elif subject_id is not None:
            if step_id is None:
                raise NotImplementedError
            else:
                self._step_wait(subject_id, step_id)
        else:
            raise NotImplementedError
        # transfer back files, if any
        Helper.transfer_output_files(self._workflow_id,
                                     self._workflow_controller)

    def _step_wait(self, subject_id, step_id):
        job_id = self._jobid_to_step[subject_id][step_id, 'step_id']
        self._workflow_controller.wait_job([job_id])

    def has_failed(self, subject_id=None, step_id=None, update_status=True):
        status = self.get_status(subject_id, step_id, update_status)
        return (status & Runner.FAILED) or (status & Runner.ABORTED_NOTRUN)

    def get_failed_step_ids(self, subject_id, update_status=True):
        if update_status:
            self._update_jobs_status()
        failed_step_ids = self._get_subject_filtered_step_ids(
            subject_id, Runner.FAILED)
        return failed_step_ids

    def stop(self, subject_id=None, step_id=None):
        if not self.is_running():
            raise RuntimeError("Runner is not running.")
        if subject_id is None and step_id is None:
            self._workflow_stop()
        elif subject_id is not None:
            if step_id is None:
                raise NotImplementedError
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError

    def _workflow_stop(self):
        self._workflow_controller.stop_workflow(self._workflow_id)

        # transfer back files, if any
        Helper.transfer_output_files(self._workflow_id,
                                     self._workflow_controller)

        interrupted_step_ids = self._get_interrupted_step_ids()
        for subject_id, step_ids in six.iteritems(interrupted_step_ids):
            if step_ids:
                analysis = self._study.analyses[subject_id]
                analysis.clear_results(step_ids)

    def _get_interrupted_step_ids(self, update_status = True):
        """ Interrupted steps are either steps with an interrupted job (killed
        or failed), or steps with both run and not-run jobs (all jobs have not
        been performed, but some of them have)
        """
        if update_status:
            self._update_jobs_status()
        filtered_step_ids_by_subject_id = {}
        for subject_id in self._jobid_to_step:
            subject_jobs = self._get_subject_jobs(subject_id)
            jobs_status = self._get_jobs_status(update_status=False)
            interrupted_step_ids = set()
            started_step_ids = set()
            notrun_step_ids = set()
            for job_id, step in six.iteritems(subject_jobs):
                if step in interrupted_step_ids:
                    continue # this one is already in the list
                job_status = jobs_status[job_id]
                if job_status & Runner.INTERRUPTED:
                    interrupted_step_ids.add(step)
                else:
                    if job_status & Runner.SUCCESS:
                        started_step_ids.add(step)
                        if step in notrun_step_ids:
                            # both started and unfinished step
                            interrupted_step_ids.add(step)
                    elif job_status & Runner.ABORTED_NOTRUN:
                        notrun_step_ids.add(step)
                        if step in started_step_ids:
                            # both started and unfinished step
                            interrupted_step_ids.add(step)
            filtered_step_ids_by_subject_id[subject_id] = interrupted_step_ids
        return filtered_step_ids_by_subject_id

    def _get_filtered_step_ids(self, status, update_status = True):
        if update_status:
            self._update_jobs_status()
        filtered_step_ids_by_subject_id = {}
        for subject_id in self._jobid_to_step:
            filtered_step_ids = self._get_subject_filtered_step_ids(
                subject_id, status)
            filtered_step_ids_by_subject_id[subject_id] = filtered_step_ids
        return filtered_step_ids_by_subject_id

    def _get_subject_filtered_step_ids(self, subject_id, status):
        step_ids = set()
        subject_jobs = self._get_subject_jobs(subject_id)
        jobs_status = self._get_jobs_status(update_status=False)
        for job_id in subject_jobs:
            job_status = jobs_status[job_id]
            if job_status & status:
                step_ids.add(subject_jobs[job_id])
        return list(step_ids)

    def get_status(self, subject_id=None, step_id=None, update_status=True):
        if self._workflow_id is None:
            status = Runner.NOT_STARTED
        elif subject_id is None and step_id is None:
            if update_status:
                self._update_jobs_status()
            status = self._get_workflow_status()
        elif subject_id is not None and step_id is None:
            status = self._get_subject_status(subject_id, update_status)
        else:
            status = self._get_step_status(subject_id, step_id, update_status)
        return status

    def _get_workflow_status(self):
        sw_status \
            = self._workflow_controller.workflow_status(self._workflow_id)
        if (sw_status in [sw.constants.WORKFLOW_IN_PROGRESS,
                          sw.constants.WORKFLOW_NOT_STARTED]):
            status = Runner.RUNNING
        else:
            has_failed = (len(Helper.list_failed_jobs(
                self._workflow_id, self._workflow_controller,
                include_aborted_jobs=True,
                include_user_killed_jobs=True)) != 0)
            if has_failed:
                status = Runner.FAILED
            else:
                status = Runner.SUCCESS
        return status

    def _get_subject_status(self, subject_id, update_status=True):
        status = Runner.NOT_STARTED
        subject_jobs = self._get_subject_jobs(subject_id)
        if subject_jobs:
            jobs_status=self._get_jobs_status(update_status)
            status = Runner.SUCCESS
            for job_id in subject_jobs:
                job_status = jobs_status[job_id]
                # XXX hypothesis: the workflow is linear for a subject (no branch)
                if job_status & (Runner.RUNNING | Runner.INTERRUPTED):
                    status = job_status
                    break
                elif job_status & Runner.UNKNOWN:
                    status = job_status
        return status

    def _get_step_status(self, subject_id, step_id, update_status=True):
        status = Runner.NOT_STARTED
        subject_jobs = self._get_subject_jobs(subject_id)
        if subject_jobs:
            # WARNING: assumes only 1 job per step. FIXME.
            job_id = subject_jobs[step_id, "step_id"]
            jobs_status = self._get_jobs_status(update_status)
            status = jobs_status[job_id]
        return status

    def _get_subject_jobs(self, subject_id):
        return self._jobid_to_step.get(subject_id, [])

    def _get_jobs_status(self, update_status=True):
        if update_status or self._cached_jobs_status is None:
            self._update_jobs_status()
        return self._cached_jobs_status

    def _update_jobs_status(self):
        jobs_status = {} # job_id -> status
        job_info_seq = self._workflow_controller.workflow_elements_status(
            self._workflow_id)[0]
        for job_info in job_info_seq:
            job_id = job_info[0]
            sw_status = job_info[1]
            exit_info = job_info[3]
            exit_status, exit_value, _, _ = exit_info
            status = self._sw_status_to_runner_status(sw_status, exit_status,
                                                      exit_value)
            jobs_status[job_id] = status
        self._cached_jobs_status = jobs_status

    def _sw_status_to_runner_status(self, sw_status, exit_status, exit_value):
        if sw_status in [sw.constants.FAILED,
                         sw.constants.DELETE_PENDING,
                         sw.constants.KILL_PENDING] or \
            (exit_value is not None and exit_value != 0):
            if exit_status == sw.constants.USER_KILLED:
                status = Runner.STOPPED_BY_USER
            elif exit_status == sw.constants.EXIT_NOTRUN:
                status = Runner.ABORTED_NOTRUN
            else:
                status = Runner.FAILED
        elif sw_status == sw.constants.DONE:
            status = Runner.SUCCESS
        # XXX status UNDETERMINED is supposed to be a transitory status
        # after or before the running status
        elif sw_status in [sw.constants.RUNNING, sw.constants.QUEUED_ACTIVE,
                           sw.constants.SUBMISSION_PENDING,
                           sw.constants.UNDETERMINED]:
            status = Runner.RUNNING
        elif sw_status == sw.constants.NOT_SUBMITTED:
            status = Runner.NOT_STARTED
        else:
            # WARNING, SYSTEM_ON_HOLD, USER_ON_HOLD,
            # USER_SYSTEM_ON_HOLD, SYSTEM_SUSPENDED, USER_SUSPENDED,
            # USER_SYSTEM_SUSPENDED
            status = Runner.UNKNOWN
        return status
Example #11
s2 = LogisticRegression()
p = [("scaler", s1), ("logit", s2)]
est = Pipeline(p)

# get the iris dataset
X, y = iris.get_data()

# jsonify the method and a cross-validation scheme
method_conf = JSONify_estimator(est, out="./est.json")
cv_conf = JSONify_cv(StratifiedKFold, cv_kwargs={"n_folds": 5},
                     score_func=f1_score, stratified=True,
                     out="./cv.json")
# build the dataset file
dataset = build_dataset(X, y, method_conf, cv_conf, ".", compress=1)

# create the workflow in the internal representation
wfi = create_wf(dataset['folds'], cv_conf, method_conf, ".", verbose=True)
# save to soma-workflow format
wf = save_wf(wfi, "./workflow.json", mode="soma-workflow")

# create a controller and submit
controller = WorkflowController()
wf_id = controller.submit_workflow(workflow=wf, name="first example")

# wait for completion
while controller.workflow_status(wf_id) != 'workflow_done':
    time.sleep(2)
# read final result file
print(joblib.load('./final_res.pkl'))
Example #12
from soma_workflow.client import Job, Workflow, WorkflowController

job_1 = Job(command=["sleep", "60"], name="job 1")
job_2 = Job(command=["sleep", "60"], name="job 2")
job_3 = Job(command=["sleep", "60"], name="job 3")
job_4 = Job(command=["sleep", "60"], name="job 4")

jobs = [job_1, job_2, job_3, job_4]
dependencies = [(job_1, job_2),
                (job_1, job_3),
                (job_2, job_4),
                (job_3, job_4)]

workflow = Workflow(jobs=jobs,
                    dependencies=dependencies)


controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="simple example")
Example #13
from __future__ import print_function
import time
import os

from soma_workflow.client import Job, Workflow, WorkflowController, Helper, FileTransfer
from soma_workflow.configuration import Configuration
# from soma_workflow.connection import RemoteConnection

user = '******'
try:
    import pwd
    user = pwd.getpwuid(os.getuid()).pw_name
except Exception:
    pass

controller = WorkflowController("Gabriel", user)

# FileTransfer creation for input files
file1 = FileTransfer(is_input=True,
                     client_path="%s/create_file.py" %
                     Configuration.get_home_dir(),
                     name="script")

file2 = FileTransfer(is_input=True,
                     client_path="%s/output_file" %
                     Configuration.get_home_dir(),
                     name="file created on the server")

# Job and Workflow
run_script = Job(command=["python", file1, file2],
                 name="copy",
                 referenced_input_files=[file1],
                 referenced_output_files=[file2])
Example #14
echo %s
""" % test_bash_script
    fileout.write(filecontent)
    fileout.close()
    os.chdir(cur_work_dir)

    job1 = Job(command=[u"touch", test_filepath],
               name="epac_job_test",
               working_directory=tmp_work_dir_path)
    job2 = Job(command=["%s/readfile" % cur_file_dir, test_bash_script],
               name="epac_job_test",
               working_directory=tmp_work_dir_path)

    soma_workflow = Workflow(jobs=[job1, job2])

    resource_id = socket.gethostname()
    controller = WorkflowController(resource_id, "", "")
    ## run soma-workflow
    ## =================
    wf_id = controller.submit_workflow(workflow=soma_workflow,
                                       name="epac workflow")
    Helper.wait_workflow(wf_id, controller)
    nb_failed_jobs = len(Helper.list_failed_jobs(wf_id, controller))
    if nb_failed_jobs > 0:
        raise ValueError("Soma-workflow error, cannot use working directory")

    if not os.path.isfile(os.path.join(tmp_work_dir_path, test_filepath)):
        raise ValueError("Soma-workflow cannot define working directory")
    else:
        print("OK for creating new file in working directory")
#f = open("/tmp/soma_workflow_examples/myfile", "wb")
#f.write("Content of my file \n")
# f.close()


# FileTransfer creation for input files
myfile = FileTransfer(is_input=True,
                      client_path="/tmp/soma_workflow_examples/myfile",
                      name="myfile")

# FileTransfer creation for output files
copy_of_myfile = FileTransfer(is_input=False,
                              client_path="/tmp/soma_workflow_examples/copy_of_myfile",
                              name="copy of my file")

# Job and Workflow
copy_job = Job(command=["cp", myfile, copy_of_myfile],
               name="copy",
               referenced_input_files=[myfile],
               referenced_output_files=[copy_of_myfile])

workflow = Workflow(jobs=[copy_job],
                    dependencies=[])


controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="simple transfer")
Example #17
    def run_test(cls, debug=False, interactive=False, **kwargs):
        sys.stdout.write("********* soma-workflow tests: %s *********\n" %
                         cls.__name__)

        config_file_path = Configuration.search_config_path()
        resource_ids = Configuration.get_configured_resources(config_file_path)

        enabled_resources = getattr(WorkflowTest, 'enabled_resources', None)
        enable_resources = []
        if not hasattr(WorkflowTest, 'resource_pass'):
            WorkflowTest.resource_pass = {}

        for resource_id in resource_ids:
            sys.stdout.write("============ Resource : " + resource_id +
                             " =================== \n")
            config = Configuration.load_from_file(resource_id,
                                                  config_file_path)

            if not interactive \
                    and ((enabled_resources is None
                          and config.get_mode() != LIGHT_MODE)
                         or (enabled_resources is not None
                             and resource_id not in enabled_resources)):
                sys.stdout.write('Resource %s is not tested in '
                                 'non-interactive mode\n' % resource_id)
                continue  # skip login/password ask
            if interactive:
                if enabled_resources is None:
                    sys.stdout.write("Do you want to test the resource "
                                     "%s (Y/n) ? " % resource_id)
                    sys.stdout.flush()
                    test_resource = sys.stdin.readline()
                    if test_resource.strip() in ['no', 'n', 'N', 'No', 'NO']:
                        # Skip the resource
                        sys.stdout.write('Resource %s is not tested \n' %
                                         resource_id)
                        sys.stdout.flush()
                        continue
                    enable_resources.append(resource_id)
                    (login, password) = get_user_id(resource_id, config)
                    WorkflowTest.resource_pass[resource_id] = (login, password)
                else:
                    if resource_id not in enabled_resources:
                        continue
                    (login, password) = WorkflowTest.resource_pass[resource_id]
            else:
                (login, password) = get_user_id(resource_id,
                                                config,
                                                interactive=interactive)

            if config.get_mode() == LIGHT_MODE:
                # use a temporary sqlite database in soma-workflow to avoid
                # concurrent access problems
                tmpdb = tempfile.mkstemp('.db', prefix='swf_')
                os.close(tmpdb[0])
                os.unlink(tmpdb[1])
                # and so on for transfers / stdio files directory
                tmptrans = tempfile.mkdtemp(prefix='swf_')
                config._database_file = tmpdb[1]
                config._transfered_file_dir = tmptrans

            wf_controller = None
            try:

                with suppress_stdout(debug):
                    wf_controller = WorkflowController(resource_id,
                                                       login,
                                                       password,
                                                       config=config)
                    cls.setup_wf_controller(wf_controller)

                allowed_config = cls.allowed_config[:]
                for configuration in cls.allowed_config:
                    if config.get_mode() != configuration[0]:
                        allowed_config.remove(configuration)
                if len(allowed_config) == 0:
                    sys.stdout.write(
                        "No tests available for the resource %s \n" %
                        resource_id)

                for configuration in allowed_config:
                    (mode, file_system) = configuration
                    sys.stdout.write(
                        "\n---------------------------------------\n")
                    sys.stdout.write("Mode : " + mode + '\n')
                    sys.stdout.write("File system : " + file_system + '\n')
                    cls.setup_path_management(file_system)

                    if file_system in (cls.SHARED_RESOURCE_PATH,
                                       cls.SHARED_TRANSFER) \
                            and not config.get_path_translation():
                        sys.stdout.write(
                            "Paths translation unavailable - not testing "
                            "this case\n")
                        sys.stdout.flush()
                        continue

                    suite_list = []
                    list_tests = []
                    for test in dir(cls):
                        prefix = "test_"
                        if len(test) < len(prefix):
                            continue
                        if test[0:len(prefix)] == prefix:
                            list_tests.append(test)

                    suite_list.append(
                        unittest.TestSuite(list(map(cls, list_tests))))
                    alltests = unittest.TestSuite(suite_list)
                    with suppress_stdout(debug):
                        res = unittest.TextTestRunner(
                            verbosity=2).run(alltests)
                    sys.stdout.flush()
                    sys.stdout.write("after test\n")

                    if len(res.errors) != 0 or len(res.failures) != 0:
                        raise RuntimeError("tests failed.")

            finally:
                sys.stdout.write("del wf_controller")
                if wf_controller:
                    wf_controller.stop_engine()
                del wf_controller
                cls.setup_wf_controller(None)  # del WorkflowController
                sys.stdout.write("deleted.")
                if config.get_mode() == LIGHT_MODE:
                    if not kwargs.get('keep_temporary', False):
                        if os.path.exists(config._database_file):
                            os.unlink(config._database_file)
                        if os.path.exists(config._database_file + '-journal'):
                            os.unlink(config._database_file + '-journal')
                        shutil.rmtree(config._transfered_file_dir)
                    else:
                        print('temporary files kept:')
                        print('database file:', config._database_file)
                        print('transfers:', config._transfered_file_dir)

        if interactive and enabled_resources is None:
            print('set enabled_resources')
            WorkflowTest.enabled_resources = enable_resources
Example #18
        ],
        name="Merging all the r2 and distribution respectively together.",
        working_directory=scripts_path)

    jobs.append(job_merge)

    # Plotting the maps
    job_final = Job(command=[
        "python", "create_maps.py", "--input", derivatives_path,
        "--parameters", parameters_path, "--subject", args.subject,
        "--fmri_data", fmri_path
    ],
                    name="Creating the maps.",
                    working_directory=scripts_path)
    jobs.append(job_final)
    dependencies.append((job_merge, job_final))

    workflow = Workflow(jobs=jobs, dependencies=dependencies)

    Helper.serialize(os.path.join(inputs_path, 'delete.somawf'), workflow)

    ### Submit the workflow to computing resource (configured in the client-server mode)

    controller = WorkflowController(
        "DSV_cluster_{}".format(login), login,
        password)  #"DSV_cluster_ap259944", login, password

    workflow_id = controller.submit_workflow(workflow=workflow,
                                             name="Ridge - LPP")

    print("Finished !!!")
Example #19
p = [("scaler", s1), ("logit", s2)]
est = Pipeline(p)

# get the iris dataset
X, y = iris.get_data()

# jsonify the method and a cross-validation scheme
method_conf = JSONify_estimator(est, out="./est.json")
cv_conf = JSONify_cv(StratifiedKFold,
                     cv_kwargs={"n_folds": 5},
                     score_func=f1_score,
                     stratified=True,
                     out="./cv.json")
# build the dataset file
dataset = build_dataset(X, y, method_conf, cv_conf, ".", compress=1)

# create the workflow in the internal representation
wfi = create_wf(dataset['folds'], cv_conf, method_conf, ".", verbose=True)
# save to soma-workflow format
wf = save_wf(wfi, "./workflow.json", mode="soma-workflow")

# create a controller and submit
controller = WorkflowController()
wf_id = controller.submit_workflow(workflow=wf, name="first example")

# wait for completion
while controller.workflow_status(wf_id) != 'workflow_done':
    time.sleep(2)
# read final result file
print(joblib.load('./final_res.pkl'))
Example #20
def run_soma_workflow(treatments,
                      exec_cmd,
                      tmp_local_dirs,
                      server_id,
                      remote_host,
                      remote_user,
                      remote_pathes,
                      local_result_pathes,
                      label_for_cluster,
                      wait_ending=False):
    """Dispatch treatments using soma-workflow.

    Parameters
    ----------
    treatments
        dict mapping a treatment name to a treatment object
    exec_cmd
        the command to run on each ROI data
    tmp_local_dirs
        dict mapping a treatment name to a local tmp dir (used to store a
        temporary configuration file)
    server_id
        the server ID as expected by WorkflowController
    remote_host
        the remote machine where treatments are processed in parallel
    remote_user
        the user name used to log in to remote_host
    remote_pathes
        dict mapping a treatment name to an existing remote dir which will be
        used to store ROI data and result files
    local_result_pathes
        dict mapping a treatment name to a local path where final results will
        be stored (the host will send them there by scp)
    label_for_cluster
        the base name used to label workflows and sub jobs
    wait_ending
        if True, wait for the workflow to finish and return the results of the
        last treatment; otherwise return right after submission
    """

    import getpass
    from socket import gethostname

    local_user = getpass.getuser()
    local_host = gethostname()

    all_nodes = []
    all_deps = []
    all_groups = []
    split_jobs = []
    for t_id, treatment in treatments.iteritems():

        tmp_local_dir = tmp_local_dirs[t_id]
        remote_path = remote_pathes[t_id]
        local_result_path = local_result_pathes[t_id]

        sj, n, d, g = prepare_treatment_jobs(
            treatment, tmp_local_dir, local_result_path, local_user,
            local_host, remote_host, remote_user, remote_path,
            label_for_cluster + '-' + str(t_id))
        all_nodes.extend(n)
        all_deps.extend(d)
        all_groups.append(g)
        split_jobs.append(sj)

    # Jobs for data splitting should be done sequentially.
    # If they're done in parallel, they may flood the remote file system
    for isj in xrange(len(split_jobs)):
        if isj + 1 < len(split_jobs):
            all_deps.append((split_jobs[isj], split_jobs[isj + 1]))

    # # Be sure that all splitting jobs are done first:
    # # Is there a better way ?
    # for n in all_nodes:
    #     for sjob in split_jobs:
    #         all_deps.append((sjob,n))
    # Does not seem to work well -> maybe too many deps?

    workflow = Workflow(all_nodes + split_jobs,
                        all_deps,
                        root_group=all_groups)

    # f = open('/tmp/workflow.pck','w')
    # cPickle.dump(workflow, f)
    # f.close()

    logger.info('Open connection ...')
    connection = WorkflowController(server_id, remote_user)

    logger.info('Submit workflow ...')
    wf_id = connection.submit_workflow(
        workflow=workflow,
        # expiration_date="",
        # queue="run32",
        name=label_for_cluster + '-' + local_user)
    #wf = connection.workflow(wf_id)

    if wait_ending:  # wait for result
        logger.info('Wait for workflow to end and make outputs ...')
        Helper.wait_workflow(wf_id, connection)

        for t_id, local_result_path in local_result_pathes.iteritems():
            treatment = treatments[t_id]
            rfilename = treatment.result_dump_file
            if rfilename is None:
                rfilename = 'result.pck'
            local_result_file = op.join(local_result_path,
                                        op.basename(rfilename))

            if not op.exists(local_result_file):
                raise Exception('Local result does not exist "%s"' %
                                local_result_file)

        if treatment.analyser.outFile is not None:
            # return result only for last treatment ...
            print 'Load result from %s ...' % local_result_file
            if splitext(local_result_file)[1] == '.gz':
                import gzip
                fresult = gzip.open(local_result_file)
            else:
                fresult = open(local_result_file)
            results = cPickle.load(fresult)
            fresult.close()
            # print 'Make outputs ...'
            #treatment.output(results, dump=False)
            logger.info('Cleaning tmp dirs ...')
            for tmp_dir in tmp_local_dirs.itervalues():
                shutil.rmtree(tmp_dir)

            return results
    else:
        logger.info('Cleaning tmp dirs ...')
        for tmp_dir in tmp_local_dirs.itervalues():
            shutil.rmtree(tmp_dir)

        logger.info('Workflow sent, returning ...')
        return []
Beispiel #21
0
    def run(self, **Xy):
        '''Run soma-workflow without gui

        Example
        -------

        >>> from sklearn import datasets
        >>> from epac.map_reduce.engine import SomaWorkflowEngine
        >>> from epac.tests.wfexamples2test import WFExample2

        >>> ## Build dataset
        >>> ## =============
        >>> X, y = datasets.make_classification(n_samples=10,
        ...                                     n_features=20,
        ...                                     n_informative=5,
        ...                                     random_state=1)
        >>> Xy = {'X':X, 'y':y}

        >>> ## Build epac tree
        >>> ## ===============
        >>> tree_root_node = WFExample2().get_workflow()

        >>> ## Build SomaWorkflowEngine and run function for each node
        >>> ## =======================================================
        >>> sfw_engine = SomaWorkflowEngine(tree_root=tree_root_node,
        ...                                 function_name="transform",
        ...                                 num_processes=3,
        ...                                 remove_finished_wf=False)
        >>> tree_root_node = sfw_engine.run(**Xy)
        light mode
        >>> ## Run reduce process
        >>> ## ==================
        >>> tree_root_node.reduce()
        ResultSet(
        [{'key': SelectKBest/SVC(C=1), 'y/test/score_f1': [ 0.6  0.6], 'y/test/score_recall_mean/pval': [ 0.5], 'y/test/score_recall/pval': [ 0.   0.5], 'y/test/score_accuracy/pval': [ 0.], 'y/test/score_f1/pval': [ 0.   0.5], 'y/test/score_precision/pval': [ 0.5  0. ], 'y/test/score_precision': [ 0.6  0.6], 'y/test/score_recall': [ 0.6  0.6], 'y/test/score_accuracy': 0.6, 'y/test/score_recall_mean': 0.6},
         {'key': SelectKBest/SVC(C=3), 'y/test/score_f1': [ 0.6  0.6], 'y/test/score_recall_mean/pval': [ 0.5], 'y/test/score_recall/pval': [ 0.   0.5], 'y/test/score_accuracy/pval': [ 0.], 'y/test/score_f1/pval': [ 0.   0.5], 'y/test/score_precision/pval': [ 0.5  0. ], 'y/test/score_precision': [ 0.6  0.6], 'y/test/score_recall': [ 0.6  0.6], 'y/test/score_accuracy': 0.6, 'y/test/score_recall_mean': 0.6}])

        '''
        try:
            from soma_workflow.client import Job, Workflow
            from soma_workflow.client import Helper, FileTransfer
            from soma_workflow.client import WorkflowController
        except ImportError:
            errmsg = "No soma-workflow is found. "\
                "Please verify your soma-worklow"\
                "on your computer (e.g. PYTHONPATH) \n"
            sys.stderr.write(errmsg)
            sys.stdout.write(errmsg)
            raise NoSomaWFError
        tmp_work_dir_path = tempfile.mkdtemp()
        cur_work_dir = os.getcwd()
        os.chdir(tmp_work_dir_path)
        is_run_local = False
        if not self.resource_id or self.resource_id == "":
            self.resource_id = socket.gethostname()
            is_run_local = True
        # print "is_run_local=", is_run_local
        if not is_run_local:
            ft_working_directory = FileTransfer(is_input=True,
                                                client_path=tmp_work_dir_path,
                                                name="working directory")
        else:
            ft_working_directory = tmp_work_dir_path

        ## Save the database and tree to working directory
        ## ===============================================
        # np.savez(os.path.join(tmp_work_dir_path,
        # SomaWorkflowEngine.dataset_relative_path), **Xy)
        save_dataset(SomaWorkflowEngine.dataset_relative_path, **Xy)
        store = StoreFs(dirpath=os.path.join(
            tmp_work_dir_path, SomaWorkflowEngine.tree_root_relative_path))
        self.tree_root.save_tree(store=store)

        ## Subtree job allocation on disk
        ## ==============================
        node_input = NodesInput(self.tree_root.get_key())
        split_node_input = SplitNodesInput(self.tree_root,
                                           num_processes=self.num_processes)
        nodesinput_list = split_node_input.split(node_input)
        keysfile_list = save_job_list(tmp_work_dir_path, nodesinput_list)

        ## Build soma-workflow
        ## ===================
        jobs = self._create_jobs(keysfile_list, is_run_local,
                                 ft_working_directory)
        soma_workflow = Workflow(jobs=jobs)

        controller = WorkflowController(self.resource_id, self.login, self.pw)
        ## run soma-workflow
        ## =================
        wf_id = controller.submit_workflow(workflow=soma_workflow,
                                           name="epac workflow",
                                           queue=self.queue)
        Helper.transfer_input_files(wf_id, controller)
        Helper.wait_workflow(wf_id, controller)
        Helper.transfer_output_files(wf_id, controller)

        self.engine_info = self.get_engine_info(controller, wf_id)

        if self.remove_finished_wf:
            controller.delete_workflow(wf_id)
        ## read result tree
        ## ================
        self.tree_root = store.load()
        os.chdir(cur_work_dir)
        if os.path.isdir(tmp_work_dir_path) and self.remove_local_tree:
            shutil.rmtree(tmp_work_dir_path)
        return self.tree_root
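# A minimal follow-up sketch, assuming a locally configured soma-workflow
# resource: once Helper.wait_workflow() has returned, Helper.list_failed_jobs()
# (used in a later example of this collection) is a quick way to check that the
# run succeeded before loading the result tree. The "probe" job is illustrative.
from soma_workflow.client import Job, Workflow, WorkflowController, Helper

controller = WorkflowController()  # local resource, no login/password needed
wf_id = controller.submit_workflow(
    workflow=Workflow(jobs=[Job(command=["sleep", "1"], name="probe")]),
    name="probe workflow")
Helper.wait_workflow(wf_id, controller)
failed_jobs = Helper.list_failed_jobs(wf_id, controller)
if len(failed_jobs) > 0:
    print("%d job(s) failed" % len(failed_jobs))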
Beispiel #22
0
def remote_map_marshal(func, largs=None, lkwargs=None, mode='local'):

    if largs is None:
        if lkwargs is not None:
            largs = [[]] * len(lkwargs)
        else:
            largs = []

    if lkwargs is None:
        lkwargs = [{}] * len(largs)

    lkwargs = [merge_default_kwargs(func, kw) for kw in lkwargs]

    assert len(lkwargs) == len(largs)

    all_args = zip(largs, lkwargs)

    if mode == 'local':
        return [func(*args, **kwargs) for args, kwargs in all_args]
    elif mode == 'local_with_dumps':

        func_fn = './func.marshal'
        dump_func(func, func_fn)
        results = []
        for i, params in enumerate(all_args):
            print 'params:', params
            params_fn = 'params_%d.pck' % i
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            output_fn = 'output_%d.pck' % i
            print 'call subprocess ...'
            subprocess.call(
                ['python', '-c', cfunc_marshal, params_fn, func_fn, output_fn])
            print 'Read outputs ...'
            fout = open(output_fn)
            results.append(cPickle.load(fout))
        return results
    elif mode == 'remote_cluster':
        # FileTransfer creation for input files
        #data_dir = './rmap_data'
        data_dir = mkdtemp(prefix="sw_rmap")
        func_fn = op.join(data_dir, 'func.marshal')
        dump_func(func, func_fn)
        func_file = FileTransfer(is_input=True,
                                 client_path=func_fn,
                                 name="func_file")

        all_jobs = []
        param_files = []
        for i, params in enumerate(all_args):
            params_fn = op.join(data_dir, 'params_%d.pck' % i)
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            param_file = FileTransfer(is_input=True,
                                      client_path=params_fn,
                                      name='params_file_%d' % i)
            param_files.append(param_file)
            output_fn = op.join(data_dir, 'output_%d.pck' % i)
            output_file = FileTransfer(is_input=False,
                                       client_path=output_fn,
                                       name='output_file_%d' % i)
            job = Job(command=[
                'python', '-c', cfunc, param_file, func_file, output_file
            ],
                      name="rmap, item %d" % i,
                      referenced_input_files=[func_file, param_file],
                      referenced_output_files=[output_file])
            all_jobs.append(job)

        workflow = Workflow(jobs=all_jobs, dependencies=[])
        # submit the workflow
        cfg = pyhrf.cfg['parallel-cluster']
        controller = WorkflowController(cfg['server_id'], cfg['user'])

        # controller.transfer_files(fids_to_transfer)
        wf_id = controller.submit_workflow(workflow=workflow,
                                           name="remote_map")

        Helper.transfer_input_files(wf_id, controller)

        Helper.wait_workflow(wf_id, controller)

        Helper.transfer_output_files(wf_id, controller)

        results = []
        for i in xrange(len(all_args)):
            fout = open(op.join(data_dir, 'output_%d.pck' % i))
            results.append(cPickle.load(fout))
            fout.close()
        return results
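# A minimal usage sketch for remote_map_marshal in 'local' mode, assuming it is
# importable from pyhrf.parallel like remote_map in a later example. Because
# the function is shipped as marshaled bytecode (dump_func above), it does not
# need to be importable on the remote side; a plain, closure-free function is
# assumed so that its code object can be marshaled.
from pyhrf.parallel import remote_map_marshal

def double(x):
    return 2 * x

print(remote_map_marshal(double, largs=[(1,), (2,), (3,)], mode='local'))
# expected output: [2, 4, 6]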
Beispiel #23
0
# f.write("Content of my file \n")
# f.close()


# FileTransfer creation for input files
myfile = FileTransfer(is_input=True,
                      client_path="/tmp/soma_workflow_examples/myfile",
                      name="myfile")

# FileTransfer creation for output files
copy_of_myfile = FileTransfer(is_input=False,
                              client_path="/tmp/soma_workflow_examples/copy_of_myfile",
                              name="copy of my file")

# Job and Workflow
copy_job = Job(command=["cp", myfile, copy_of_myfile],
               name="copy",
               referenced_input_files=[myfile],
               referenced_output_files=[copy_of_myfile])

workflow = Workflow(jobs=[copy_job],
                    dependencies=[])


login = '******'
password = '******'
controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="simple transfer")
Beispiel #24
0
def remote_map(func, largs=None, lkwargs=None, mode='serial'):
    """
    Execute a function in parallel on a list of arguments.

    Args:
        *func* (function): function to apply on each item.
                           **this function must be importable on the remote side**
        *largs* (list of tuple): each item in the list is a tuple
                                 containing all positional argument values of the
                                 function
        *lkwargs* (list of dict): each item in the list is a dict
                                  containing all named arguments of the
                                  function mapped to their value.

        *mode* (str): indicates how execution is distributed. Choices are:

            - "serial": single-thread loop on the local machine
            - "local" : use joblib to run tasks in parallel.
                        The number of simultaneous jobs is defined in
                        the configuration section ['parallel-local']['nb_procs']
                        see ~/.pyhrf/config.cfg
            - "remote_cluster: use somaworkflow to run tasks in parallel.
                               The connection setup has to be defined
                               in the configuration section ['parallel-cluster']
                               of ~/.pyhrf/config.cfg.
            - "local_with_dumps": testing purpose only, run each task serially as
                                  a subprocess.

    Returns:
         a list of results

    Raises:
         RemoteException if any remote task has failed

    Example:
    >>> from pyhrf.parallel import remote_map
    >>> def foo(a, b=2):
    ...     return a + b
    >>> remote_map(foo, [(2,),(3,)], [{'b':5}, {'b':7}])
    [7, 10]
    """
    if largs is None:
        if lkwargs is not None:
            largs = [tuple()] * len(lkwargs)
        else:
            largs = [tuple()]

    if lkwargs is None:
        lkwargs = [{}] * len(largs)

    lkwargs = [merge_default_kwargs(func, kw) for kw in lkwargs]

    assert len(lkwargs) == len(largs)

    all_args = zip(largs, lkwargs)
    # print 'all_args:', all_args

    fmodule = func.__module__
    fname = '.'.join([fmodule, func.__name__])

    if mode == 'serial':
        return [func(*args, **kwargs) for args, kwargs in all_args]
    elif mode == 'local':
        try:
            from joblib import Parallel, delayed
        except ImportError:
            raise ImportError('Can not import joblib. It is '
                              'required to enable parallel '
                              'processing on a local machine.')

        if logger.getEffectiveLevel() == logging.DEBUG:
            parallel_verb = 10
        else:
            parallel_verb = 0
        if pyhrf.cfg['parallel-local']['nb_procs']:
            n_jobs = pyhrf.cfg['parallel-local']['nb_procs']
        else:
            n_jobs = available_cpu_count()
        p = Parallel(n_jobs=n_jobs, verbose=parallel_verb)
        return p(delayed(func)(*args, **kwargs) for args, kwargs in all_args)

    elif mode == 'local_with_dumps':
        results = []
        for i, params in enumerate(all_args):
            # print 'params:', params
            params_fn = 'params_%d.pck' % i
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            output_fn = 'output_%d.pck' % i
            # print 'call subprocess ...'
            subprocess.call([
                'python', '-c', cfunc % (fmodule, fname), params_fn, output_fn
            ])
            # print 'Read outputs ...'
            fout = open(output_fn)
            results.append(cPickle.load(fout))
        return results
    elif mode == 'remote_cluster':
        # FileTransfer creation for input files
        #data_dir = './rmap_data'
        data_dir = mkdtemp(prefix="sw_rmap")

        all_jobs = []
        param_files = []
        for i, params in enumerate(all_args):
            params_fn = op.join(data_dir, 'params_%d.pck' % i)
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            param_file = FileTransfer(is_input=True,
                                      client_path=params_fn,
                                      name='params_file_%d' % i)
            param_files.append(param_file)
            output_fn = op.join(data_dir, 'output_%d.pck' % i)
            output_file = FileTransfer(is_input=False,
                                       client_path=output_fn,
                                       name='output_file_%d' % i)
            job = Job(command=[
                'pyhrf_exec_pyfunc', fmodule, fname, param_file, output_file
            ],
                      name="rmap, item %d" % i,
                      referenced_input_files=[param_file],
                      referenced_output_files=[output_file])
            all_jobs.append(job)

        workflow = Workflow(jobs=all_jobs, dependencies=[])
        # submit the workflow
        cfg = pyhrf.cfg['parallel-cluster']
        controller = WorkflowController(cfg['server_id'], cfg['user'])
        # controller.transfer_files(fids_to_transfer)
        wf_id = controller.submit_workflow(workflow=workflow,
                                           name="remote_map")

        Helper.transfer_input_files(wf_id, controller)

        Helper.wait_workflow(wf_id, controller)

        Helper.transfer_output_files(wf_id, controller)

        results = []
        for i in xrange(len(all_args)):
            fnout = op.join(data_dir, 'output_%d.pck' % i)
            fout = open(fnout)
            o = cPickle.load(fout)
            print 'file cPickle loaded:', o
            fout.close()
            os.remove(fnout)
            if isinstance(o, Exception):
                raise RemoteException('Task %d failed' % i, o)
            results.append(o)
        return results
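# A minimal error-handling sketch, assuming RemoteException is importable from
# pyhrf.parallel (remote_map above raises it when a remote task fails) and that
# the [parallel-cluster] section of ~/.pyhrf/config.cfg is filled in.
# os.path.join is used as the mapped function because it is importable on any
# remote node, as remote_map requires.
import os.path
from pyhrf.parallel import remote_map, RemoteException

try:
    joined = remote_map(os.path.join, largs=[('/data', 'a'), ('/data', 'b')],
                        mode='remote_cluster')
    print(joined)
except RemoteException as e:
    print('A remote task failed: %s' % e)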
Beispiel #25
0
f.close()

# Creation of the FileTransfer object to transfer the working directory
my_working_directory = FileTransfer(is_input=True,
                                    client_path="/tmp/my_working_directory",
                                    name="working directory")

# Jobs and Workflow
job1 = Job(command=["cp", "myfile1", "copy_of_myfile1"],
           name="job1",
           referenced_input_files=[my_working_directory],
           referenced_output_files=[my_working_directory],
           working_directory=my_working_directory)

job2 = Job(command=["cp", "myfile2", "copy_of_myfile2"],
           name="job2",
           referenced_input_files=[my_working_directory],
           referenced_output_files=[my_working_directory],
           working_directory=my_working_directory)

workflow = Workflow(jobs=[job1, job2],
                    dependencies=[])

# Submit the workflow
print("password? ")
password = getpass.getpass()
controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="working directory transfer example")
Beispiel #26
0
    jobs += group_score + group_significativity + group_merge
    jobs.append(job_final)

    scores = Group(elements=group_score,
                   name="group where test scores are calculated")

    significativity = Group(
        elements=group_significativity,
        name="group where distributions are calculated for significance")

    merge = Group(elements=group_merge, name="group where we merge results")

    workflow = Workflow(jobs=jobs,
                        dependencies=dependencies,
                        root_group=[scores, significativity, merge, job_final])

    Helper.serialize(
        os.path.join(inputs_path, 'optimized_cluster_part_2.somawf'), workflow)

    ### Submit the workflow to computing resource (configured in the client-server mode)

    controller = WorkflowController(
        "DSV_cluster_{}".format(login), login,
        password)  #"DSV_cluster_ap259944", login, password

    workflow_id = controller.submit_workflow(workflow=workflow,
                                             name="Cluster optimized part 2")

    print("Finished !!!")
Beispiel #27
0
def run_soma_workflow(treatments, exec_cmd, tmp_local_dirs, server_id,
                      remote_host, remote_user, remote_pathes,
                      local_result_pathes, label_for_cluster,
                      wait_ending=False):
    """Dispatch treatments using soma-workflow.

    Parameters
    ----------
    treatments
        it is a dict mapping a treatment name to a treatment object
    exec_cmd
        it is the command to run on each ROI data.
    tmp_local_dirs
        it is a dict mapping a treatment name to a local tmp dir (used to store a temporary configuration file)
    server_id
        it is the server ID as expected by WorkflowController
    remote_host
        it is the remote machine where treatments are treated in parallel
    remote_user
        it is used to log in remote_host
    remote_pathes
        it is a dict mapping a treatment name to an existing remote dir which will be used to store ROI data and result
        files
    local_result_pathes
        it is a dict mapping a treatment name to a local path where final results will be stored
        (the remote host will send them there by scp)
    label_for_cluster
        it is the base name used to label workflows and sub jobs
    """

    import getpass
    from socket import gethostname

    local_user = getpass.getuser()
    local_host = gethostname()

    all_nodes = []
    all_deps = []
    all_groups = []
    split_jobs = []
    for t_id, treatment in treatments.iteritems():

        tmp_local_dir = tmp_local_dirs[t_id]
        remote_path = remote_pathes[t_id]
        local_result_path = local_result_pathes[t_id]

        sj, n, d, g = prepare_treatment_jobs(treatment, tmp_local_dir,
                                             local_result_path,
                                             local_user, local_host,
                                             remote_host,
                                             remote_user, remote_path,
                                             label_for_cluster + '-' + str(t_id))
        all_nodes.extend(n)
        all_deps.extend(d)
        all_groups.append(g)
        split_jobs.append(sj)

    # Jobs for data splitting should be done sequentially.
    # If they're done in parallel, they may flood the remote file system
    for isj in xrange(len(split_jobs)):
        if isj + 1 < len(split_jobs):
            all_deps.append((split_jobs[isj], split_jobs[isj + 1]))

    # # Be sure that all splitting jobs are done first:
    # # Is there a better way ?
    # for n in all_nodes:
    #     for sjob in split_jobs:
    #         all_deps.append((sjob,n))
    # Does not seem to work well -> maybe too many deps?

    workflow = Workflow(
        all_nodes + split_jobs, all_deps, root_group=all_groups)

    # f = open('/tmp/workflow.pck','w')
    # cPickle.dump(workflow, f)
    # f.close()

    logger.info('Open connection ...')
    connection = WorkflowController(server_id, remote_user)

    logger.info('Submit workflow ...')
    wf_id = connection.submit_workflow(workflow=workflow,
                                       # expiration_date="",
                                       # queue="run32",
                                       name=label_for_cluster + '-' +
                                       local_user)
    #wf = connection.workflow(wf_id)

    if wait_ending:  # wait for result
        logger.info('Wait for workflow to end and make outputs ...')
        Helper.wait_workflow(wf_id, connection)

        for t_id, local_result_path in local_result_pathes.iteritems():
            treatment = treatments[t_id]
            rfilename = treatment.result_dump_file
            if rfilename is None:
                rfilename = 'result.pck'
            local_result_file = op.join(local_result_path,
                                        op.basename(rfilename))

            if not op.exists(local_result_file):
                raise Exception('Local result does not exist "%s"'
                                % local_result_file)

        if treatment.analyser.outFile is not None:
            # return result only for last treatment ...
            print 'Load result from %s ...' % local_result_file
            if splitext(local_result_file)[1] == '.gz':
                import gzip
                fresult = gzip.open(local_result_file)
            else:
                fresult = open(local_result_file)
            results = cPickle.load(fresult)
            fresult.close()
            # print 'Make outputs ...'
            #treatment.output(results, dump=False)
            logger.info('Cleaning tmp dirs ...')
            for tmp_dir in tmp_local_dirs.itervalues():
                shutil.rmtree(tmp_dir)

            return results
    else:
        logger.info('Cleaning tmp dirs ...')
        for tmp_dir in tmp_local_dirs.itervalues():
            shutil.rmtree(tmp_dir)

        logger.info('Workflow sent, returning ...')
        return []
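# An illustrative call sketch for run_soma_workflow(). Every value below is a
# placeholder assumption (treatment objects, exec command, paths, resource and
# host names); the point is only to show that the per-treatment dicts must
# share the same keys.
treatments = {'subj1': treatment_subj1,   # hypothetical treatment objects
              'subj2': treatment_subj2}
tmp_local_dirs = {'subj1': '/tmp/swf_subj1', 'subj2': '/tmp/swf_subj2'}
remote_pathes = {'subj1': '/remote/data/subj1', 'subj2': '/remote/data/subj2'}
local_result_pathes = {'subj1': '/local/results/subj1',
                       'subj2': '/local/results/subj2'}

results = run_soma_workflow(treatments, 'some_roi_exec_cmd', tmp_local_dirs,
                            'cluster_resource_id', 'cluster.example.org',
                            'remote_user', remote_pathes, local_result_pathes,
                            label_for_cluster='jde', wait_ending=True)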
Beispiel #28
0
echo %s
""" % test_bash_script
    fileout.write(filecontent)
    fileout.close()
    os.chdir(cur_work_dir)

    job1 = Job(command=[u"touch", test_filepath],
               name="epac_job_test",
               working_directory=tmp_work_dir_path)
    job2 = Job(command=["%s/readfile" % cur_file_dir, test_bash_script],
               name="epac_job_test",
               working_directory=tmp_work_dir_path)

    soma_workflow = Workflow(jobs=[job1, job2])

    resource_id = socket.gethostname()
    controller = WorkflowController(resource_id, "", "")
    ## run soma-workflow
    ## =================
    wf_id = controller.submit_workflow(workflow=soma_workflow,
                                       name="epac workflow")
    Helper.wait_workflow(wf_id, controller)
    nb_failed_jobs = len(Helper.list_failed_jobs(wf_id, controller))
    if nb_failed_jobs > 0:
        raise ValueError("Soma-workflow error, cannot use working directory")

    if not os.path.isfile(os.path.join(tmp_work_dir_path, test_filepath)):
        raise ValueError("Soma-workflow cannot define working directory")
    else:
        print "OK for creating new file in working directory"
Beispiel #29
0
def remote_map(func, largs=None, lkwargs=None, mode='serial'):
    """
    Execute a function in parallel on a list of arguments.

    Args:
        *func* (function): function to apply on each item.
                           **this function must be importable on the remote side**
        *largs* (list of tuple): each item in the list is a tuple
                                 containing all positional argument values of the
                                 function
        *lkwargs* (list of dict): each item in the list is a dict
                                  containing all named arguments of the
                                  function mapped to their value.

        *mode* (str): indicates how execution is distributed. Choices are:

            - "serial": single-thread loop on the local machine
            - "local" : use joblib to run tasks in parallel.
                        The number of simultaneous jobs is defined in
                        the configuration section ['parallel-local']['nb_procs']
                        see ~/.pyhrf/config.cfg
            - "remote_cluster: use somaworkflow to run tasks in parallel.
                               The connection setup has to be defined
                               in the configuration section ['parallel-cluster']
                               of ~/.pyhrf/config.cfg.
            - "local_with_dumps": testing purpose only, run each task serially as
                                  a subprocess.

    Returns:
         a list of results

    Raises:
         RemoteException if any remote task has failed

    Example:
    >>> from pyhrf.parallel import remote_map
    >>> def foo(a, b=2):
    ...     return a + b
    >>> remote_map(foo, [(2,),(3,)], [{'b':5}, {'b':7}])
    [7, 10]
    """
    if largs is None:
        if lkwargs is not None:
            largs = [tuple()] * len(lkwargs)
        else:
            largs = [tuple()]

    if lkwargs is None:
        lkwargs = [{}] * len(largs)

    lkwargs = [merge_default_kwargs(func, kw) for kw in lkwargs]

    assert len(lkwargs) == len(largs)

    all_args = zip(largs, lkwargs)
    # print 'all_args:', all_args

    fmodule = func.__module__
    fname = '.'.join([fmodule, func.__name__])

    if mode == 'serial':
        return [func(*args, **kwargs) for args, kwargs in all_args]
    elif mode == 'local':
        try:
            from joblib import Parallel, delayed
        except ImportError:
            raise ImportError('Can not import joblib. It is '
                              'required to enable parallel '
                              'processing on a local machine.')

        if logger.getEffectiveLevel() == logging.DEBUG:
            parallel_verb = 10
        else:
            parallel_verb = 0
        if pyhrf.cfg['parallel-local']['nb_procs']:
            n_jobs = pyhrf.cfg['parallel-local']['nb_procs']
        else:
            n_jobs = available_cpu_count()
        p = Parallel(n_jobs=n_jobs, verbose=parallel_verb)
        return p(delayed(func)(*args, **kwargs)
                 for args, kwargs in all_args)

    elif mode == 'local_with_dumps':
        results = []
        for i, params in enumerate(all_args):
            # print 'params:', params
            params_fn = 'params_%d.pck' % i
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            output_fn = 'output_%d.pck' % i
            # print 'call subprocess ...'
            subprocess.call(['python', '-c', cfunc % (fmodule, fname),
                             params_fn, output_fn])
            # print 'Read outputs ...'
            fout = open(output_fn)
            results.append(cPickle.load(fout))
        return results
    elif mode == 'remote_cluster':
        # FileTransfer creation for input files
        #data_dir = './rmap_data'
        data_dir = mkdtemp(prefix="sw_rmap")

        all_jobs = []
        param_files = []
        for i, params in enumerate(all_args):
            params_fn = op.join(data_dir, 'params_%d.pck' % i)
            fparams = open(params_fn, 'wb')
            cPickle.dump(params, fparams)
            fparams.close()
            param_file = FileTransfer(is_input=True,
                                      client_path=params_fn,
                                      name='params_file_%d' % i)
            param_files.append(param_file)
            output_fn = op.join(data_dir, 'output_%d.pck' % i)
            output_file = FileTransfer(is_input=False,
                                       client_path=output_fn,
                                       name='output_file_%d' % i)
            job = Job(command=['pyhrf_exec_pyfunc', fmodule, fname,
                               param_file, output_file],
                      name="rmap, item %d" % i,
                      referenced_input_files=[param_file],
                      referenced_output_files=[output_file])
            all_jobs.append(job)

        workflow = Workflow(jobs=all_jobs, dependencies=[])
        # submit the workflow
        cfg = pyhrf.cfg['parallel-cluster']
        controller = WorkflowController(cfg['server_id'], cfg['user'])
        # controller.transfer_files(fids_to_transfer)
        wf_id = controller.submit_workflow(
            workflow=workflow, name="remote_map")

        Helper.transfer_input_files(wf_id, controller)

        Helper.wait_workflow(wf_id, controller)

        Helper.transfer_output_files(wf_id, controller)

        results = []
        for i in xrange(len(all_args)):
            fnout = op.join(data_dir, 'output_%d.pck' % i)
            fout = open(fnout)
            o = cPickle.load(fout)
            print 'file cPickle loaded:', o
            fout.close()
            os.remove(fnout)
            if isinstance(o, Exception):
                raise RemoteException('Task %d failed' % i, o)
            results.append(o)
        return results
Beispiel #30
0
from soma_workflow.client import Job, Workflow, WorkflowController, SharedResourcePath, FileTransfer

# SharedResourcePath creation for the input file.
# The input file is read direclty in the data directory located on the
# the computing resource side.
myfile = SharedResourcePath(relative_path="myfile",
                            namespace="MyApp",
                            uuid="my_example_dir")

# FileTransfer creation for the output file.
# That way the output file will not be written in the data directory
# located on the computing resource file system.
copy_of_myfile = FileTransfer(is_input=False,
                              client_path="/tmp/soma_workflow_examples/copy_of_myfile",
                              name="copy of my file")

# Job and Workflow creation
copy_job = Job(command=["cp", myfile, copy_of_myfile],
               name="copy",
               referenced_input_files=[],
               referenced_output_files=[copy_of_myfile])

workflow = Workflow(jobs=[copy_job],
                    dependencies=[])

# workflow submission
controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="shared resource path example")
Beispiel #31
0
    def run(self, **Xy):
        """Run soma-workflow without gui

        Example
        -------

        >>> from sklearn import datasets
        >>> from epac.map_reduce.engine import SomaWorkflowEngine
        >>> from epac.tests.wfexamples2test import WFExample2

        >>> ## Build dataset
        >>> ## =============
        >>> X, y = datasets.make_classification(n_samples=10,
        ...                                     n_features=20,
        ...                                     n_informative=5,
        ...                                     random_state=1)
        >>> Xy = {'X':X, 'y':y}

        >>> ## Build epac tree
        >>> ## ===============
        >>> tree_root_node = WFExample2().get_workflow()

        >>> ## Build SomaWorkflowEngine and run function for each node
        >>> ## =======================================================
        >>> sfw_engine = SomaWorkflowEngine(tree_root=tree_root_node,
        ...                                 function_name="transform",
        ...                                 num_processes=3,
        ...                                 remove_finished_wf=False)
        >>> tree_root_node = sfw_engine.run(**Xy)
        light mode
        >>> ## Run reduce process
        >>> ## ==================
        >>> tree_root_node.reduce()
        ResultSet(
        [{'key': SelectKBest/SVC(C=1), 'y/test/score_f1': [ 0.6  0.6], 'y/test/score_recall_mean/pval': [ 0.5], 'y/test/score_recall/pval': [ 0.   0.5], 'y/test/score_accuracy/pval': [ 0.], 'y/test/score_f1/pval': [ 0.   0.5], 'y/test/score_precision/pval': [ 0.5  0. ], 'y/test/score_precision': [ 0.6  0.6], 'y/test/score_recall': [ 0.6  0.6], 'y/test/score_accuracy': 0.6, 'y/test/score_recall_mean': 0.6},
         {'key': SelectKBest/SVC(C=3), 'y/test/score_f1': [ 0.6  0.6], 'y/test/score_recall_mean/pval': [ 0.5], 'y/test/score_recall/pval': [ 0.   0.5], 'y/test/score_accuracy/pval': [ 0.], 'y/test/score_f1/pval': [ 0.   0.5], 'y/test/score_precision/pval': [ 0.5  0. ], 'y/test/score_precision': [ 0.6  0.6], 'y/test/score_recall': [ 0.6  0.6], 'y/test/score_accuracy': 0.6, 'y/test/score_recall_mean': 0.6}])

        """
        try:
            from soma_workflow.client import Job, Workflow
            from soma_workflow.client import Helper, FileTransfer
            from soma_workflow.client import WorkflowController
        except ImportError:
            errmsg = (
                "soma-workflow was not found. "
                "Please verify your soma-workflow installation "
                "on your computer (e.g. PYTHONPATH).\n"
            )
            sys.stderr.write(errmsg)
            sys.stdout.write(errmsg)
            raise NoSomaWFError
        tmp_work_dir_path = tempfile.mkdtemp()
        cur_work_dir = os.getcwd()
        os.chdir(tmp_work_dir_path)
        is_run_local = False
        if not self.resource_id or self.resource_id == "":
            self.resource_id = socket.gethostname()
            is_run_local = True
        # print "is_run_local=", is_run_local
        if not is_run_local:
            ft_working_directory = FileTransfer(is_input=True, client_path=tmp_work_dir_path, name="working directory")
        else:
            ft_working_directory = tmp_work_dir_path

        ## Save the database and tree to working directory
        ## ===============================================
        # np.savez(os.path.join(tmp_work_dir_path,
        # SomaWorkflowEngine.dataset_relative_path), **Xy)
        save_dataset(SomaWorkflowEngine.dataset_relative_path, **Xy)
        store = StoreFs(dirpath=os.path.join(tmp_work_dir_path, SomaWorkflowEngine.tree_root_relative_path))
        self.tree_root.save_tree(store=store)

        ## Subtree job allocation on disk
        ## ==============================
        node_input = NodesInput(self.tree_root.get_key())
        split_node_input = SplitNodesInput(self.tree_root, num_processes=self.num_processes)
        nodesinput_list = split_node_input.split(node_input)
        keysfile_list = save_job_list(tmp_work_dir_path, nodesinput_list)

        ## Build soma-workflow
        ## ===================
        jobs = self._create_jobs(keysfile_list, is_run_local, ft_working_directory)
        soma_workflow = Workflow(jobs=jobs)

        controller = WorkflowController(self.resource_id, self.login, self.pw)
        ## run soma-workflow
        ## =================
        wf_id = controller.submit_workflow(workflow=soma_workflow, name="epac workflow", queue=self.queue)
        Helper.transfer_input_files(wf_id, controller)
        Helper.wait_workflow(wf_id, controller)
        Helper.transfer_output_files(wf_id, controller)

        self.engine_info = self.get_engine_info(controller, wf_id)

        if self.remove_finished_wf:
            controller.delete_workflow(wf_id)
        ## read result tree
        ## ================
        self.tree_root = store.load()
        os.chdir(cur_work_dir)
        if os.path.isdir(tmp_work_dir_path) and self.remove_local_tree:
            shutil.rmtree(tmp_work_dir_path)
        return self.tree_root
Beispiel #32
0
            "--output_r2",
            os.path.join(args.output, 'r2'), "--output_distribution",
            os.path.join(args.output, 'distribution'), "--x", args.x, "--y",
            args.y, "--shuffling", shuffling, "--n_permutations",
            args.nb_permutations, "--alpha_percentile", args.alpha_percentile
        ],
                  name="job {} - alpha {}".format(run, alpha),
                  working_directory=scripts_path)
        group_significativity.append(job)
        jobs.append(job)

    distribution_voxels = Group(elements=group_significativity,
                                name="Voxel wise fitting of the models")

    workflow2 = Workflow(jobs=jobs, root_group=[distribution_voxels])

    ### Submit the workflow to computing resource (configured in the client-server mode)

    controller2 = WorkflowController(
        "DSV_cluster_ap259944", args.login,
        args.password)  #"DSV_cluster_ap259944", args.login, args.password

    workflow_id2 = controller2.submit_workflow(workflow=workflow2,
                                               name="Voxel-wise computations")

    # You may use the gui or manually transfer the files:
    manual = True
    if manual:
        Helper.transfer_input_files(workflow_id2, controller2)
        Helper.wait_workflow(workflow_id2, controller2)
        Helper.transfer_output_files(workflow_id2, controller2)