Example #1
    def __init__(self, params):
        self.params = params
        task_list = []

        print("GenerateVolumeImagesParallel: contacting tissumaps server")
        tmaps_api = TmClient(host=self.params.host,
                             port=80,
                             experiment_name=self.params.experiment,
                             username=self.params.username,
                             password=self.params.password)

        # find the site dimensions
        sites = tmaps_api.get_sites()
        print("found %d sites on TissueMAPS" % len(sites))
        for site in sites:
            if site['plate_name'] == self.params.plate_name:
                well_name = site['well_name']
                x = site['x']
                y = site['y']

                task_list.append(
                    GenerateVolumeImagesApp(
                        self.params.host, self.params.username,
                        self.params.password, self.params.experiment,
                        self.params.plate_name, well_name, x, y,
                        self.params.threshold, self.params.mean_size,
                        self.params.min_size, self.params.filter_type,
                        self.params.minimum_bead_intensity, self.params.z_step,
                        self.params.pixel_size, self.params.alpha,
                        self.params.smooth, self.params.input_path,
                        self.params.output_path, self.params.channel_string))
        ParallelTaskCollection.__init__(self, task_list, output_dir='')
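All of the snippets in this listing follow the same basic pattern: build a
list of Task/Application objects, then hand it to
ParallelTaskCollection.__init__ so that gc3libs runs the tasks concurrently.
Below is a minimal, self-contained sketch of that pattern; EchoApp and
EchoCollection are made-up names, and the import paths and Application
keyword arguments follow gc3pie 2.x. (Some examples instead pass a job name
as the first positional argument, i.e. ParallelTaskCollection(jobname,
tasks), which is the calling convention of older gc3pie releases.)

from gc3libs import Application
from gc3libs.workflow import ParallelTaskCollection

class EchoApp(Application):
    """Run `/bin/echo <word>` on the execution host (hypothetical example)."""
    def __init__(self, word, **extra_args):
        # download results into a per-word directory unless the caller
        # already chose an output directory
        extra_args.setdefault('output_dir', 'echo.%s.d' % word)
        Application.__init__(
            self,
            arguments=['/bin/echo', word],  # command line to execute
            inputs=[],                      # no files to stage in
            outputs=[],                     # nothing to stage out besides stdout
            stdout='echo.out',              # capture standard output
            **extra_args)

class EchoCollection(ParallelTaskCollection):
    """Run one EchoApp per word; all tasks execute concurrently."""
    def __init__(self, words, **extra_args):
        tasks = [EchoApp(word) for word in words]
        ParallelTaskCollection.__init__(self, tasks, **extra_args)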
Example #2
    def __init__(self, param_value,
                 input_file,
                 output_folder,
                 **extra):


        gc3libs.log.info("\t\t\tCalling InnerParallelIteration.__init__(%d, %s)" % (param_value, input_file))

        tasks = []

        self.jobname = "Gdemo_paral_" + str(param_value)
        extra_args = extra.copy()
        # XXX: do I need this ?
        extra_args['parent'] = self.jobname
        tasks.append(
            InnerSequentialIterationA(
                param_value,
                input_file,
                output_folder,
                iteration=0,
                **extra_args
                )
            )
        tasks.append(
            InnerSequentialIterationB(
                param_value,
                input_file,
                output_folder,
                iteration=0,
                **extra_args
                )
            )

        # actually init jobs
        ParallelTaskCollection.__init__(self, tasks, **extra)
Example #3
 def __init__(self, name, experiment_id, verbosity, submission_id, user_name,
              parent_id, description=None):
     '''
     Parameters
     ----------
     name: str
         name of the stage
     experiment_id: int
         ID of the processed experiment
     verbosity: int
         logging verbosity index
     submission_id: int
         ID of the corresponding submission
     user_name: str
         name of the submitting user
     parent_id: int
         ID of the parent
         :class:`Workflow <tmlib.workflow.workflow.Workflow>`
     description: tmlib.tmaps.description.WorkflowStageDescription, optional
         description of the stage (default: ``None``)
     '''
     ParallelTaskCollection.__init__(
         self, tasks=None, jobname='%s' % name
     )
     WorkflowStage.__init__(
         self, name=name, experiment_id=experiment_id, verbosity=verbosity,
         submission_id=submission_id, user_name=user_name,
         parent_id=parent_id, description=description
     )
Example #4
    def __init__(self, **kwargs):

        config = kwargs["config"]
        self.c = config["sequencewise_parallel_flow"]

        # TODO: find all files in the input dir and build self.lSeq.
        # Warning: this should only be done once the preceding tasks
        # have finished.
        #self.lSeq = [re.findall(self.c['retag'], i)[0] for i in os.listdir(self.c['input'])]
        self.lSeq = [
            i for i in os.listdir(
                self.c['input']) if not i.endswith("fai")]
        self.kwargs = kwargs

        gc3libs.log.info(
            "\t\tCalling SequencewiseParallelFlow.__init__({})".format(
                self.kwargs))

        self.tasks = [
            AnnotateTandemRepeats(
                name="annotate_tandem_repeats",
                param={
                    "$N": iSeq},
                **kwargs) for iSeq in self.lSeq]

        ParallelTaskCollection.__init__(self, self.tasks, **kwargs)
Example #5
    def __init__(self, grayscaled_image, copies, ncolors, output_dir,
                 **extra_args):
        gc3libs.log.info("TricolorizeMultipleImages for %d copies run" %
                         copies)
        self.jobname = "Warholizer_Parallel"
        self.ncolors = ncolors
        ### XXX Why do I have to use basename???
        self.output_dir = os.path.join(output_dir, 'tricolorize')
        self.warhol_dir = output_dir

        # Compute a unique sequence of random combinations of colors.
        # Note that there are at most C(N, ncolors) distinct
        # combinations, where N == len(self.colors).
        assert copies <= (math.factorial(len(self.colors)) //
                          (math.factorial(ncolors) *
                           math.factorial(len(self.colors) - ncolors)))

        combinations = list(itertools.combinations(self.colors, ncolors))
        combinations = random.sample(combinations, copies)

        # Create all the single tasks
        self.tasks = []
        for i, colors in enumerate(combinations):
            self.tasks.append(
                TricolorizeImage(os.path.relpath(grayscaled_image),
                                 "%s.%d" % (self.output_dir, i),
                                 "%s.%d" % (grayscaled_image, i), colors,
                                 self.warhol_dir, **extra_args))

        ParallelTaskCollection.__init__(self, self.tasks)
Example #6
    def __init__(self, params):
        self.params = params
        task_list = []

        tmaps_api = TmClient(
            host=self.params.host,
            port=80,
            experiment_name=self.params.experiment,
            username=self.params.username,
            password=self.params.password
        )

        # find the site dimensions
        sites = tmaps_api.get_sites()
        for site in sites:
            if site['plate_name'] == self.params.plate:
                well_name = site['well_name']
                x = site['x']
                y = site['y']

                task_list.append(
                    GenerateVolumeImagesApp(
                        self.params.host, self.params.username,
                        self.params.password, self.params.experiment,
                        self.params.plate, well_name, x, y,
                        self.params.input_path,
                        self.params.output_path,
                        self.params.fname_stem
                    )
                )
        ParallelTaskCollection.__init__(self, task_list, output_dir='')
Example #7
    def __init__(self, param_value, input_file, output_folder, **extra):

        gc3libs.log.info("\t\t\tCalling InnerParallelIteration.__init__(%d, %s)" %
                         (param_value, input_file))

        tasks = []

        self.jobname = "Gdemo_paral_" + str(param_value)
        extra_args = extra.copy()
        # XXX: do I need this ?
        extra_args['parent'] = self.jobname
        tasks.append(
            InnerSequentialIterationA(param_value,
                                      input_file,
                                      output_folder,
                                      iteration=0,
                                      **extra_args))
        tasks.append(
            InnerSequentialIterationB(param_value,
                                      input_file,
                                      output_folder,
                                      iteration=0,
                                      **extra_args))

        # actually init jobs
        ParallelTaskCollection.__init__(self, tasks, **extra)
Example #9
    def __init__(self, grayscaled_image, copies, ncolors, output_dir, **extra_args):
        gc3libs.log.info(
            "TricolorizeMultipleImages for %d copies run" % copies)
        self.jobname = "Warholizer_Parallel"
        self.ncolors = ncolors
        ### XXX Why do I have to use basename???
        self.output_dir = os.path.join(
            output_dir, 'tricolorize')
        self.warhol_dir = output_dir

        # Compute a unique sequence of random combinations of colors.
        # Note that there are at most C(N, ncolors) distinct
        # combinations, where N == len(self.colors).
        assert copies <= (math.factorial(len(self.colors)) //
                          (math.factorial(ncolors) *
                           math.factorial(len(self.colors) - ncolors)))

        combinations = list(itertools.combinations(self.colors, ncolors))
        combinations = random.sample(combinations, copies)

        # Create all the single tasks
        self.tasks = []
        for i, colors in enumerate(combinations):
            self.tasks.append(TricolorizeImage(
                os.path.relpath(grayscaled_image),
                "%s.%d" % (self.output_dir, i),
                "%s.%d" % (grayscaled_image, i),
                colors,
                self.warhol_dir, **extra_args))

        ParallelTaskCollection.__init__(self, self.tasks)
Example #10
    def __init__(self, jokes, **kwargs):

        self.jokes = jokes
        gc3libs.log.info("\t\tCalling MainParallelFlow.__init__({})".format(self.jokes))

        self.tasks = [InnerSequentialFlow(joke) for joke in self.jokes]

        ParallelTaskCollection.__init__(self, self.tasks, **kwargs)
Example #11
 def __init__(self, img, N):
     apps = []
     for n in range(N):
         col1 = random_color()
         col2 = random_color()
         col3 = random_color()
         output_dir = ("colorized-{name}-{nr}.d".format(name=basename(img), nr=n))
         apps.append(ColorizeApp(img, col1, col2, col3, output_dir))
     ParallelTaskCollection.__init__(self, apps)
Example #12
        def __init__(self, executable, abc_executable, inputfilelist_abc, output_folder, **extra_args):

                parallel_task = []
                for input_file in inputfilelist_abc:
                        name = "ABC_execution_" + os.path.basename(input_file)

                        parallel_task.append(ABC_Application(executable, abc_executable, input_file, output_folder, **extra_args))

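                # NOTE: `name` keeps the value from the last loop iteration,
                # so the collection below is named after the last input file
                # (and `name` is undefined if `inputfilelist_abc` is empty).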
                ParallelTaskCollection.__init__(self, name, parallel_task)
Example #13
 def __init__(self, img, N):
     apps = []
     for n in range(N):
         col1 = random_color()
         col2 = random_color()
         col3 = random_color()
         output_dir = ("colorized-{name}-{nr}.d".format(name=basename(img),
                                                        nr=n))
         apps.append(ColorizeApp(img, col1, col2, col3, output_dir))
     ParallelTaskCollection.__init__(self, apps)
Example #14
    def __init__(self, executable, abc_executable, inputfilelist_abc,
                 output_folder, **extra_args):

        parallel_task = []
        for input_file in inputfilelist_abc:
            name = "ABC_execution_" + os.path.basename(input_file)

            parallel_task.append(
                ABC_Application(executable, abc_executable, input_file,
                                output_folder, **extra_args))

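        # NOTE: `name` keeps the value from the last loop iteration, so the
        # collection below is named after the last input file (and `name` is
        # undefined if `inputfilelist_abc` is empty).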
        ParallelTaskCollection.__init__(self, name, parallel_task)
Example #15
    def __init__(self, param_value, input_file_folder, output_folder, **extra):

        self.jobname = "Gdemo_MainParal_" + str(param_value)

        gc3libs.log.info("\t\tCalling MainParallelIteration.__init__(%d, %s)" %
                         (param_value, input_file_folder))

        self.tasks = []
        for input_file in os.listdir(input_file_folder):
            self.tasks.append(
                InnerParallelIteration(param_value,
                                       os.path.abspath(input_file),
                                       output_folder))
        ParallelTaskCollection.__init__(self, self.tasks, **extra)
Example #16
    def __init__(self, params):
        self.params = params
        task_list = []

        for experiment, input_path, output_path, fname_stem in itertools.izip(
                params.experiment, params.input_path,
                params.output_path, params.fname_stem):
            print(experiment, input_path, output_path, fname_stem)
            # Give each task its own copy of the parameters (needs
            # `import copy`): mutating `self.params` in place would
            # leave every task pointing at the values of the last
            # loop iteration.
            task_params = copy.copy(params)
            task_params.experiment = experiment
            task_params.input_path = input_path
            task_params.output_path = output_path
            task_params.fname_stem = fname_stem
            task_list.append(
                GenerateVolumeImagesParallel(task_params)
            )

        ParallelTaskCollection.__init__(self, task_list, output_dir='')
Example #17
    def __init__(self, directory, pattern, task_ctor, **extra_args):
        tasks = [ ]
        for filename in os.listdir(directory):
            if not fnmatch.fnmatch(filename, pattern):
                continue
            pathname = os.path.join(directory, filename)
            tasks.append(task_ctor(pathname, **extra_args))

        ParallelTaskCollection.__init__(
            self,
            # job name
            make_identifier("Process %s files in directory %s" % (pattern, directory)),
            # list of tasks to execute
            tasks,
            # boilerplate
            **extra_args)
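    # Hypothetical usage of the class above ("GunzipApp" stands in for any
    # Application subclass that takes the path of the file to process):
    #
    #     tasks = ProcessFilesInParallel('/data/incoming', '*.gz', GunzipApp,
    #                                    output_dir='results')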
Example #18
    def __init__(self, xVars, paraCombos, substs, optimFolder, solverParas, **sessionParas):

        logger.debug('entering idRiskParaSearchParallel.__init__')
        # create jobname
        self.jobname = 'evalSolverGuess' + '_' + sessionParas['jobname']
        for paraCombo in paraCombos:
            self.jobname += str(paraCombo)

        self.substs       = substs
        self.optimFolder  = optimFolder
        self.solverParas  = solverParas
        self.sessionParas = sessionParas
        tasks = self.generateTaskList(xVars, paraCombos, substs, sessionParas)
        ParallelTaskCollection.__init__(self, self.jobname, tasks)

        logger.debug('done idRiskParaSearchParallel.__init__')
Example #19
    def new_tasks(self, extra):
        appextra = extra.copy()
        del appextra['output_dir']

        if self.params.parallel:
            task = ParallelTaskCollection([
                GRunApplication(self.params.args,
                                jobname='GRunApplication.%d' % i,
                                output_dir='GRunApplication.%d.d' % i,
                                **appextra)
                for i in range(self.params.parallel)
            ], **extra)

        elif self.params.sequential:
            task = SequentialTaskCollection([
                GRunApplication(self.params.args,
                                jobname='GRunApplication.%d' % i,
                                output_dir='GRunApplication.%d.d' % i,
                                **appextra)
                for i in range(self.params.sequential)
            ], **extra)

        else:
            task = GRunApplication(self.params.args, **extra)

        return [task]
Example #20
    def __init__(self, xVars, paraCombos, substs, optimFolder, solverParas, **sessionParas):

        logger.debug("entering idRiskParaSearchParallel.__init__")
        # create jobname
        self.jobname = "evalSolverGuess" + "_" + sessionParas["jobname"]
        for paraCombo in paraCombos:
            self.jobname += str(paraCombo)

        self.substs = substs
        self.optimFolder = optimFolder
        self.solverParas = solverParas
        self.sessionParas = sessionParas
        tasks = self.generateTaskList(xVars, paraCombos, substs, sessionParas)
        ParallelTaskCollection.__init__(self, self.jobname, tasks)

        logger.debug("done idRiskParaSearchParallel.__init__")
Example #21
    def stage0(self):
        """
        Chunk input table and run chunks in parallel
        """
        tasks = []
        for (input_file, index_chunk) in generate_chunked_files_and_list(
                self.input_table_file, self.chunk_size):
            jobname = "gbugs-%s" % (str(index_chunk))
            extra_args = self.extra.copy()
            extra_args['index_chunk'] = str(index_chunk)
            extra_args['jobname'] = jobname

            # extra_args['output_dir'] = self.params.output
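            # fill every placeholder (NAME/SESSION/DATE/TIME) in the
            # output-dir template with this chunk's job name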
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'NAME', jobname)
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'SESSION', jobname)
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'DATE', jobname)
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'TIME', jobname)

            if self.driver_script:
                extra_args['driver_script'] = self.driver_script

            gc3libs.log.debug("Creating Task for index : %d - %d" %
                              (index_chunk, (index_chunk + self.chunk_size)))

            tasks.append(GBugsApplication(input_file, **extra_args))
        return ParallelTaskCollection(tasks)
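(Note: the stage0/stage1 methods in this and several later examples are,
presumably, hooks of a gc3libs StagedTaskCollection subclass; each stageN()
method returns the task to run as the N-th step of the workflow, here a
ParallelTaskCollection.)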
Example #22
    def new_tasks(self, extra):
        """
        For each line of the input .csv file generate
        an execution Task.
        """
        tasks = []
        l = 0
        for parameter in self._enumerate_csv(self.params.csv_input_file):
            parameter_str = '.'.join(str(x) for x in parameter)
            parlength = len(parameter)
            if not parlength == 11:
                raise gc3libs.exceptions.InvalidUsage(
                    "Parameter length not correct")
            l = l + 1
            run = l
            jobname = "run%s" % str(l)
            extra_args = extra.copy()
            extra_args['jobname'] = jobname

            # Everything in results folder on remote computer
            extra_args['output_dir'] = CLOUDNAME  # Not working
            #extra_args['output_dir'] = extra_args['output_dir'].replace('NAME', DEFAULT_REMOTE_OUTPUT_FOLDER) #save on local machine#
            extra_args['output_dir'] = "%s%s" % (extra_args['output_dir'],
                                                 jobname)

            tasks.append(
                MatlabApp(self.params.matlab_function, parameter,
                          self.params.matlab_source_folder, run, **extra_args))
        return [ParallelTaskCollection(tasks, **extra)]
Example #23
 def __init__(self, directory, basename, pattern, **extra_args):
     self.directory = directory
     self.basename = basename
     self.pattern = basename + pattern
     self.extra_args = extra_args
     ParallelTaskCollection.__init__(
         self,
         # jobname
         make_identifier("Stage 0 of Swath workflow in directory %s processing files %s" % (directory, pattern)),
         # tasks
         [
             ProcessFilesInParallel(directory, pattern, ChromaExtractLong, **extra_args),
             ChromaExtractShortPlusNormalization(directory, pattern, **extra_args),
         ],
         # boilerplate
         **extra_args)
Example #24
    def __init__(self, xVars, paraCombos, substs, optimFolder, solverParas, **sessionParas):

        logger.debug('entering idRiskParaSearchParallel.__init__')
        # create jobname
        self.jobname = 'evalSolverGuess' + '_' + sessionParas['jobname']
        for paraCombo in paraCombos:
            self.jobname += str(paraCombo)
            
        self.substs       = substs
        self.optimFolder  = optimFolder
        self.solverParas  = solverParas
        self.sessionParas = sessionParas
        forwardPremium.paraLoop_fp.__init__(self, verbosity = 'INFO')
        tasks = self.generateTaskList(xVars, paraCombos, substs, sessionParas)
        ParallelTaskCollection.__init__(self, self.jobname, tasks)

        logger.debug('done idRiskParaSearchParallel.__init__')
Example #25
    def __init__(self, directory, pattern, task_ctor, **extra_args):
        tasks = []
        for filename in os.listdir(directory):
            if not fnmatch.fnmatch(filename, pattern):
                continue
            pathname = os.path.join(directory, filename)
            tasks.append(task_ctor(pathname, **extra_args))

        ParallelTaskCollection.__init__(
            self,
            # job name
            make_identifier("Process %s files in directory %s" %
                            (pattern, directory)),
            # list of tasks to execute
            tasks,
            # boilerplate
            **extra_args)
Example #26
 def new_tasks(self, extra):
     fold_name = [os.path.basename(path) for path in self.params.input_dirs]
     apps = []
     for image in fold_name:
         output_dir = ("colorized-{name}.d".format(name=basename(image)))
         apps.append(GRunApplication(image, output_dir))
     task = ParallelTaskCollection(apps)
     return [task]
Example #27
    def stage1(self):
        """
        Run a RICC2 job for each valid CBAS/CABS basis combination,
        re-using the results from RIDFT in `stage0`.

        If RIDFT failed, exit immediately.
        """
        # terminate if first stage was unsuccessful
        rc = self.tasks[0].execution.returncode
        if rc is not None and rc != 0:
            return rc
        # else, proceed with 2nd pass
        pass2 = [ ]
        ridft_coord = os.path.join(self.tasks[0].turbomole_output_dir, 'coord')
        for ricc2_in in self.ricc2_ins:
            cbas = ricc2_in._keywords['CBAS_BASIS']
            cabs = ricc2_in._keywords['CABS_BASIS']
            ricc2_dir = os.path.join(self.work_dir,
                                     'cbas-%s/cabs-%s/ricc2' % (cbas, cabs))
            gc3libs.utils.mkdir(ricc2_dir)
            gc3libs.utils.copyfile(ridft_coord, ricc2_dir)
            ricc2_define_in = _make_define_in(ricc2_dir, ricc2_in)
            ricc2_output_dir = os.path.join(ricc2_dir, 'output')
            # guess duration of the RICC2 job
            extra = self.extra.copy()
            if ('aug-cc-pV5Z' == self.orb_basis
                or 'aug-cc-pV5Z' == self.rijk_basis
                or 'aug-cc-pV5Z' == cbas
                or 'aug-cc-pV5Z' == cabs):
                extra.setdefault('requested_walltime', 4*hours)
            else:
                extra.setdefault('requested_walltime', 1*hours)
            pass2.append(
                TurbomoleAndXmlProcessingPass(
                    # job name
                    ('ricc2-%s-%s-%s' % (self.name, cbas, cabs)),
                    # TURBOMOLE application to run
                    NonLocalTurbomoleDefineApplication(
                        'ricc2', ricc2_define_in,
                        # the second pass builds on files defined in the first one
                        os.path.join(ricc2_dir, 'coord'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'control'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'energy'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'mos'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'basis'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'auxbasis'),
                        output_dir = ricc2_output_dir,
                        stdout = 'ricc2.out',
                        **extra),
                    os.path.join(ricc2_output_dir, 'xml-processing'),
                    # DB parameters
                    # FIXME: make these settable on the command-line
                    db_dir='/db/home/fox/gricomp', db_user='******', db_pass='******',
                    # TaskCollection required params
                    **self.extra))
            gc3libs.log.debug("Created RICC2 task in directory '%s'", ricc2_dir)
        return (ParallelTaskCollection(self.name + '.pass2', pass2))
Example #28
    def __init__(self, param_value,
                 input_file_folder,
                 output_folder, **extra):

        self.jobname = "Gdemo_MainParal_" + str(param_value)

        gc3libs.log.info("\t\tCalling MainParallelIteration.__init__(%d, %s)" % (param_value, input_file_folder))

        self.tasks = []
        for input_file in os.listdir(input_file_folder):
            self.tasks.append(
                InnerParallelIteration(
                    param_value,
                    os.path.abspath(input_file),
                    output_folder
                    )
                )
        ParallelTaskCollection.__init__(self, self.tasks, **extra)
Example #29
    def __init__(self,
                 title,
                 coord,
                 bases,
                 jkbases,
                 cbases,
                 cabses,
                 work_dir,
                 valid1=acceptable_ridft_basis_set,
                 valid2=acceptable_ricc2_basis_set,
                 **extra_args):
        """
        Create a new tasks that runs several analyses in parallel, one
        for each accepted combination of orbital and RIJK basis.
        """
        extra_args.setdefault('memory', 2000)  # XXX: check with `requested_memory`

        ridft_define_in = Template(
            RIDFT_DEFINE_IN, valid1,
            TITLE=title,
            ORB_BASIS=bases,
            RIJK_BASIS=jkbases,
            RIDFT_MEMORY=[extra_args['memory']])  # end of RIDFT template

        ricc2_define_in = Template(
            RICC2_DEFINE_IN,
            valid2,
            # the ORB_BASIS will be derived from the RIDFT_DEFINE_IN template
            CBAS_BASIS=cbases,
            CABS_BASIS=cabses,
            RICC2_MEMORY=[extra_args['memory']],
        )  # end of RICC2 template

        tasks = []
        for ridft in expansions(ridft_define_in):
            orb_basis = ridft._keywords['ORB_BASIS']
            tasks.append(
                BasisSweepPasses(
                    title + '.seq', coord, ridft,
                    list(expansions(ricc2_define_in, ORB_BASIS=orb_basis)),
                    work_dir, **extra_args))
        ParallelTaskCollection.__init__(self, title, tasks)
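(Note: Template and expansions here are, presumably, the helpers from
gc3libs.template; expansions() enumerates one concrete template instance per
combination of the list-valued keywords, so this collection creates one
BasisSweepPasses task per expansion of the RIDFT template.)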
Example #30
    def __init__(self, pop, jobname, iteration, path_to_stage_dir,
                 cur_pop_file, task_constructor, **extra_args):

        gc3libs.log.debug('entering ComputeTargetVals.__init__')

        # Set up initial variables and set the correct methods.
        self.jobname = jobname + '-' + \
            'compute_target_vals' + '-' + str(iteration)
        self.iteration = iteration

        self.path_to_stage_dir = path_to_stage_dir
        # ComputeTargetVals produces no output.
        # But attribute needs to be specified.
        self.output_dir = path_to_stage_dir
        self.cur_pop_file = cur_pop_file
        self.verbosity = 'DEBUG'
        self.extra_args = extra_args

        # Log activity
        cDate = datetime.date.today()
        cTime = datetime.datetime.time(datetime.datetime.now())
        date_string = '%04d--%02d--%02d--%02d--%02d--%02d' % (
            cDate.year, cDate.month, cDate.day, cTime.hour, cTime.minute,
            cTime.second)
        gc3libs.log.debug('Establishing parallel task on %s', date_string)

        # Enter an iteration specific folder
        self.iteration_folder = os.path.join(
            self.path_to_stage_dir, 'Iteration-' + str(self.iteration))
        try:
            os.mkdir(self.iteration_folder)
        except OSError:
            print '%s already exists' % self.iteration_folder

        # save pop to file
        if cur_pop_file:
            np.savetxt(os.path.join(self.iteration_folder, cur_pop_file),
                       pop,
                       delimiter=' ')

        self.tasks = [
            task_constructor(pop_mem, self.iteration_folder) for pop_mem in pop
        ]
        ParallelTaskCollection.__init__(self, self.tasks, **extra_args)
Example #31
    def __init__(self, pop, jobname, iteration, path_to_stage_dir,
                 cur_pop_file, task_constructor, **extra_args):

        gc3libs.log.debug('entering ComputeTargetVals.__init__')

        # Set up initial variables and set the correct methods.
        self.jobname = jobname + '-' + \
            'compute_target_vals' + '-' + str(iteration)
        self.iteration = iteration

        self.path_to_stage_dir = path_to_stage_dir
        # ComputeTargetVals produces no output.
        # But attribute needs to be specified.
        self.output_dir = path_to_stage_dir
        self.cur_pop_file = cur_pop_file
        self.verbosity = 'DEBUG'
        self.extra_args = extra_args

        # Log activity
        cDate = datetime.date.today()
        cTime = datetime.datetime.time(datetime.datetime.now())
        date_string = '%04d--%02d--%02d--%02d--%02d--%02d' % (
            cDate.year, cDate.month, cDate.day, cTime.hour, cTime.minute, cTime.second)
        gc3libs.log.debug('Establishing parallel task on %s', date_string)

        # Enter an iteration specific folder
        self.iteration_folder = os.path.join(
            self.path_to_stage_dir, 'Iteration-' + str(self.iteration))
        try:
            os.mkdir(self.iteration_folder)
        except OSError:
            print '%s already exists' % self.iteration_folder

        # save pop to file
        if cur_pop_file:
            np.savetxt(os.path.join(self.iteration_folder, cur_pop_file),
                       pop, delimiter=' ')

        self.tasks = [
            task_constructor(pop_mem, self.iteration_folder) for pop_mem in pop
        ]
        ParallelTaskCollection.__init__(self, self.tasks, **extra_args)
Example #32
 def __init__(self, directory, basename, pattern, **extra_args):
     self.directory = directory
     self.basename = basename
     self.pattern = basename + pattern
     self.extra_args = extra_args
     ParallelTaskCollection.__init__(
         self,
         # jobname
         make_identifier(
             "Stage 0 of Swath workflow in directory %s processing files %s"
             % (directory, pattern)),
         # tasks
         [
             ProcessFilesInParallel(directory, pattern, ChromaExtractLong,
                                    **extra_args),
             ChromaExtractShortPlusNormalization(directory, pattern,
                                                 **extra_args),
         ],
         # boilerplate
         **extra_args)
Example #33
    def __init__(self, tests=None, **extra):
        """
        `tests` is a list of subdirectories which must match the
        `RunTestsInParallel` dictionary
        """
        if not tests:
            tests = self.applicationdirs
        else:
            tests = dict((k, v) for k, v in self.applicationdirs.iteritems()
                         if k in tests)
        tasks = []
        extra['output_dir'] = "RunTestAppsInParallel"
        for testdir, classes in tests.iteritems():
            appdir = os.path.abspath(testdir)

            tasks += [
                cls(appdir, **extra) for cls in classes
                if issubclass(cls, Task) and issubclass(cls, TestRunner)]
        if not tasks:
            raise RuntimeError("No tasks found")
        ParallelTaskCollection.__init__(self, tasks, **extra)
Example #34
    def __init__(self, **kwargs):

        config = kwargs["config"]
        self.c = config["sequencewise_parallel_flow"]

        # TODO: find all files in the input dir and build self.lSeq.
        # Warning: this should only be done once the preceding tasks
        # have finished.
        #self.lSeq = [re.findall(self.c['retag'], i)[0] for i in os.listdir(self.c['input'])]
        self.lSeq = [
            i for i in os.listdir(self.c['input']) if not i.endswith("fai")
        ]
        self.kwargs = kwargs

        gc3libs.log.info(
            "\t\tCalling SequencewiseParallelFlow.__init__({})".format(
                self.kwargs))

        self.tasks = [
            AnnotateTandemRepeats(name="annotate_tandem_repeats",
                                  param={"$N": iSeq},
                                  **kwargs) for iSeq in self.lSeq
        ]

        ParallelTaskCollection.__init__(self, self.tasks, **kwargs)
Example #35
    def __init__(self, title, coord, bases, jkbases, cbases, cabses, work_dir,
                 valid1=acceptable_ridft_basis_set,
                 valid2=acceptable_ricc2_basis_set,
                 **extra_args):
        """
        Create a new tasks that runs several analyses in parallel, one
        for each accepted combination of orbital and RIJK basis.
        """
        extra_args.setdefault('memory', 2000) # XXX: check with `requested_memory`

        ridft_define_in = Template(
            RIDFT_DEFINE_IN, valid1,
            TITLE=title,
            ORB_BASIS=bases,
            RIJK_BASIS=jkbases,
            RIDFT_MEMORY = [extra_args['memory']]
            ) # end of RIDFT template

        ricc2_define_in = Template(
            RICC2_DEFINE_IN, valid2,
            # the ORB_BASIS will be derived from the RIDFT_DEFINE_IN template
            CBAS_BASIS=cbases,
            CABS_BASIS=cabses,
            RICC2_MEMORY = [extra_args['memory']],
            ) # end of RICC2 template

        tasks = [ ]
        for ridft in expansions(ridft_define_in):
            orb_basis = ridft._keywords['ORB_BASIS']
            tasks.append(
                BasisSweepPasses(
                    title + '.seq', coord, ridft,
                    list(expansions(ricc2_define_in,
                                    ORB_BASIS=orb_basis)),
                    work_dir, **extra_args))
        ParallelTaskCollection.__init__(self, title, tasks)
Example #36
    def stage0(self):
        """
        Stage0: for each sample run GATK pipeline steps 1,2,3
        * 1 sample takes 24-72 hours on single core
        * GATK can be scripted to run individual steps
        * Output: 2 files per sample (g.vcf and g.vcf.idx size 1GB total)
        # 300 samples - see if we can allocate 150 cores for 2 days
        # 1 day each
        Example script:
java -jar -d64 ~/programs/GenomeAnalysisTK.jar\
     -T HaplotypeCaller\
     --emitRefConfidence GVCF\
     -minPruning 3 -stand_call_conf 30 \
     -stand_emit_conf 10 \
     -R ~/goat.genome/goat_scaffoldFG_V1.1.normalised.22.07.fa -I \
        $file -o ${samplename}.g.vcf
        """

        tasks = []

        for (bam_file, bai_file) in get_bams(self.input_bam_folder):
            extra_args = self.extra.copy()
            extra_args['sample_name'] = os.path.basename(bam_file).split('.')[0]
            extra_args['bam_filename'] = os.path.basename(bam_file)
            extra_args['bai_filename'] = os.path.basename(bai_file)
            extra_args['jobname'] = "gatk-s0-%s" % extra_args['bam_filename']

            extra_args['output_dir'] = extra_args['output_dir'].replace('NAME',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('SESSION',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('DATE',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('TIME',
                                                                        extra_args['jobname'])

            gc3libs.log.debug("Creating Stage0 task for : %s" %
                              (extra_args['bam_filename']))

            tasks.append(GATKS0Application(
                bam_file,
                bai_file,
                **extra_args))

        return ParallelTaskCollection(tasks)
Example #37
 def new_tasks(self, extra):
     if self.params.size:
         extra['size'] = self.params.size
     gc3libs.log.info("Creating main sequential task")
     tasks = []
     for (i, input_file) in enumerate(self.params.args):
         if not os.path.isfile(input_file):
             gc3libs.log.error("Argument `%s` is NOT a file. Ignoring" %
                               input_file)
             continue
         extra_args = extra.copy()
         extra_args['output_dir'] = 'Warholized.%s' % os.path.basename(
             input_file)
         tasks.append(
             WarholizeWorkflow(input_file, self.params.copies,
                               self.params.num_colors, **extra_args))
     if not tasks:
         raise gc3libs.exceptions.InvalidUsage(
             "Missing or invalid image file.")
     return [ParallelTaskCollection(tasks, **extra)]
Example #38
    def stage1(self):
        """
        Step 1: For each available statistical method, run an independent application.
        """
        tasks = []

        for method in STATS:
            extra_args = self.extra.copy()
            extra_args['jobname'] = method
            extra_args['results'] = self.s1_outputfolder
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'NAME', extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'SESSION', extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'DATE', extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                'TIME', extra_args['jobname'])

            tasks.append(
                GenREMDatasetApplication(method, [self.s0_outputfolder],
                                         self.source_folder, **extra_args))
        return ParallelTaskCollection(tasks)
Example #39
 def new_tasks(self, extra):
     if self.params.size:
         extra['size'] = self.params.size
     tasks = []
     for (i, input_file) in enumerate(self.params.args):
         if not os.path.isfile(input_file):
             gc3libs.log.error("Argument `%s` is NOT a file. Ignoring",
                               input_file)
             continue
         gc3libs.log.info(
             "Creating sequential task for processing file `%s`",
             input_file)
         extra_args = extra.copy()
         extra_args['output_dir'] = os.path.join(
             extra_args.get('output_dir', os.getcwd()),
             'Warholized.' + os.path.basename(input_file)).replace(
                 '/NAME/', '/')  ## yes, it's a bug
         tasks.append(
             WarholizeWorkflow(input_file, self.params.copies,
                               self.params.num_colors, **extra_args))
     if not tasks:
         raise gc3libs.exceptions.InvalidUsage(
             "Missing or invalid image file.")
     return [ParallelTaskCollection(tasks, **extra)]
Example #40
    def __init__(self, inParaCombos, iteration, pathToExecutable, pathToStageDir, architecture, baseDir, xVars,
                 solverVerb, problemType, analyzeResults, ctryList, **extra_args):
        
        '''
        Generate a list of tasks and initialize a ParallelTaskCollection with them.
        Uses the paraLoop class to generate a list of (description, substitution)
        pairs for the input files; descriptions are generated from variable names
        that are hard-coded in this method right now.
        Uses method generateTaskList to create a list of GPremiumApplications,
        which are invoked from a list of inputs (appropriately adjusted input
        files), the output directory and some further settings for each run.

        inParaCombos:      List of tuples defining the parameter combinations.
        iteration:         Current iteration number.
        pathToExecutable:  Path to the executable (the external program to be called).
        pathToStageDir:    Root path. Usually os.getcwd().
        architecture:      32 or 64 bit.
        baseDir:           Directory in which the input files are located.
        xVars:             Names of the x variables.
        solverVerb:        Logger verbosity.
        problemType:       Forward-premium-specific flag determining which case to look at.
        analyzeResults:    Function used to analyze the emerging output.
        ctryList:          Forward-premium-specific list of countries ("ctrys") to look at.
        '''

        logger.debug('entering gParaSearchParallel.__init__')

        # Set up initial variables and set the correct methods.
        self.pathToStageDir = pathToStageDir
        self.problemType = problemType
        self.executable = pathToExecutable
        self.architecture = architecture
        self.baseDir = baseDir
        self.verbosity = solverVerb.upper()
        self.xVars = xVars
        self.n = len(self.xVars.split())
        self.analyzeResults = analyzeResults
        self.ctryList = ctryList
        self.iteration = iteration
        self.jobname = 'evalSolverGuess' + '-' + extra_args['jobname'] + '-' + str(self.iteration)
        self.extra_args = extra_args
        tasks = []

        # --- createJobs_x ---

        # Log activity
        cDate = datetime.date.today()
        cTime = datetime.datetime.time(datetime.datetime.now())
        dateString = '{0:04d}-{1:02d}-{2:02d}-{3:02d}-{4:02d}-{5:02d}'.format(cDate.year, cDate.month, cDate.day, cTime.hour, cTime.minute, cTime.second)
        logger.debug('Establishing parallel task on %s' % dateString)

        # Enter an iteration specific folder
        self.iterationFolder = os.path.join(self.pathToStageDir, 'Iteration-' + str(self.iteration))
        try:
            os.mkdir(self.iterationFolder)
        except OSError:
            print '%s already exists' % self.iterationFolder

        # save population to file
        np.savetxt(os.path.join(self.iterationFolder, 'curPopulation'), inParaCombos, delimiter = '  ')

        # Take the list of parameter combinations and translate it into a
        # comma-separated list of values for each variable, to be fed into the
        # paraLoop file.  This could be done more elegantly with ','.join(),
        # but it works...
        vals = []
        nVariables = range(len(inParaCombos[0]))
        for ixVar in nVariables:
            varValString = ''
            for ixParaCombo, paraCombo in enumerate(inParaCombos):
                ### Should make more precise string conversion.
                varValString += str(paraCombo[ixVar])
                if ixParaCombo < len(inParaCombos) - 1:
                    varValString += ', '
            vals.append(varValString)

        # Make problem specific adjustments to the paraLoop file.
        if self.problemType == 'one4all':
            print 'one4all'
            variables = ['Ctry', 'Ctry', 'EA', 'EB', 'sigmaA', 'sigmaB']
            groups    = [ 0, 0, 1, 1, 1, 1 ]
            groupRestrs = [ 'lowerTr', 'lowerTr', 'diagnol', 'diagnol', 'diagnol', 'diagnol' ]
            writeVals = [ ", ".join(self.ctryList), ", ".join(self.ctryList), vals[0], vals[0], vals[1], vals[1] ]
            self.variables = ['EA','sigmaA']
            self.paraCombos = inParaCombos
            paraFiles = [ 'input/markovA.in', 'input/markovB.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in' ]
            paraFileRegex = [ 'space-separated', 'space-separated', 'bar-separated', 'bar-separated' , 'bar-separated' , 'bar-separated'  ]
            self.analyzeResults.tablePath = self.iterationFolder

        elif self.problemType == 'one4eachPair':
            print 'one4eachPair'
            # Check if EA or sigmaA are alone in the specified parameters. If so make diagnol adjustments
            writeVals = []
            if 'EA' in self.xVars and not 'EB' in self.xVars:
                variables = [ 'EA', 'EB' ]
                groups = [ '0', '0' ]
                groupRestrs = [ 'diagnol', 'diagnol' ]

                writeVals.append(vals[0])
                writeVals.append(vals[0])
                paraCombosEA = [  np.append(ele[0], ele[0]) for ele in inParaCombos ]
            if 'sigmaA' in self.xVars and not 'sigmaB' in self.xVars:
                variables.append( 'sigmaA')
                variables.append('sigmaB')
                groups.append( '0')
                groups.append('0')
                groupRestrs.append( 'diagnol')
                groupRestrs.append( 'diagnol' )
                writeVals.append(vals[1])
                writeVals.append(vals[1])
                paraCombosSigmaA = [  np.append(ele[1], ele[1]) for ele in inParaCombos ]

            # match ctry with val
            ctryVals = {}
            for ixCtry, ctry in enumerate(ctryList):
                ctryVals[ctry] = vals

            self.variables = variables

            # Prepare paraCombos matching to resulting table. Used in analyzeOverviewTable
            # !!! This should be dependent on problem type or on missing variables in xvars. !!!
            paraCombos = []
            for EA,sA in zip(paraCombosEA, paraCombosSigmaA):
                paraCombo = np.append(EA, sA)
                paraCombos.append(paraCombo)
            self.paraCombos = paraCombos
            paraFiles = [ 'input/parameters.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in' ]
            paraFileRegex = [  'bar-separated', 'bar-separated' , 'bar-separated' , 'bar-separated'  ]

        elif self.problemType == 'one4eachCtry':
            print 'one4eachCtry'

            ctry1List  = []
            ctry2List  = []
            EAList     = []
            EBList     = []
            sigmaAList = []
            sigmaBList = []
            self.paraCombos = []

            ctryIndices = getIndex([len(ctryList), len(ctryList)], 'lowerTr')
            for ixCombo in range(len(inParaCombos)):
                ctry1ListCombo = []
                ctry2ListCombo = []
                EAListCombo    = []
                EBListCombo    = []
                sigmaAListCombo = []
                sigmaBListCombo = []
                for ctryIndex in ctryIndices:
                    ctry1ListCombo.append(ctryList[ctryIndex[0]])
                    ctry2ListCombo.append(ctryList[ctryIndex[1]])
                    EAListCombo.append(inParaCombos[ixCombo][0 + 2 * ctryIndex[0]])
                    sigmaAListCombo.append(inParaCombos[ixCombo][1 + 2 * ctryIndex[0]])
                    EBListCombo.append(inParaCombos[ixCombo][0 + 2 *ctryIndex[1]])
                    sigmaBListCombo.append(inParaCombos[ixCombo][1 + 2 * ctryIndex[1]])
                self.paraCombos.append(zip(ctry1ListCombo, ctry2ListCombo, EAListCombo, sigmaAListCombo, EBListCombo, sigmaBListCombo))
                ctry1List.extend(ctry1ListCombo)
                ctry2List.extend(ctry2ListCombo)
                EAList.extend(map(str, EAListCombo))
                EBList.extend(map(str, EBListCombo))
                sigmaAList.extend(map(str, sigmaAListCombo))
                sigmaBList.extend(map(str, sigmaBListCombo))

            variables = ['Ctry', 'Ctry', 'EA', 'EB', 'sigmaA', 'sigmaB']
            groups    = [ 0, 0, 0, 0, 0, 0 ]
            groupRestrs = [ 'diagnol', 'diagnol', 'diagnol', 'diagnol', 'diagnol', 'diagnol' ]
            writeVals = [ ", ".join(ctry1List), ", ".join(ctry2List), ", ".join(EAList), ", ".join(EBList), ", ".join(sigmaAList),", ".join(sigmaBList)]
            paraFiles = [ 'input/markovA.in', 'input/markovB.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in' ]
            paraFileRegex = [ 'space-separated', 'space-separated', 'bar-separated', 'bar-separated' , 'bar-separated' , 'bar-separated'  ]
            #self.paraCombos = inParaCombos
            self.analyzeResults.tablePath = self.iterationFolder
            # variable list passed to analyzeOverviewTables
            self.variables = ['EA', 'sigmaA', 'EB', 'sigmaB']
            print 'Done setting up one4eachCtry.'

        # Write a para.loop file to generate grid jobs
        para_loop = self.writeParaLoop(variables = variables,
                                       groups = groups,
                                       groupRestrs = groupRestrs,
                                       vals = writeVals,
                                       desPath = os.path.join(self.iterationFolder, 'para.loopTmp'),
                                       paraFiles = paraFiles,
                                       paraFileRegex = paraFileRegex)

        paraLoop_fp.__init__(self, verbosity = self.verbosity)
        tasks = self.generateTaskList(para_loop, self.iterationFolder)
        ParallelTaskCollection.__init__(self, self.jobname, tasks)
Example #41
 def __init__(self, num_tasks, **extra_args):
     tasks = [
         SuccessfulApp('stage{n}'.format(n=n)) for n in range(num_tasks)
     ]
     ParallelTaskCollection.__init__(self, tasks, **extra_args)
Example #42
0
    def __init__(self, inParaCombos, iteration, pathToExecutable, pathToStageDir, architecture, baseDir, xVars,
                 solverVerb, problemType, analyzeResults, ctryList, **extra_args):
        
        '''
          Generate a list of tasks and initialize a ParallelTaskCollection with them. 
          Uses paraLoop class to generate a list of (descriptions, substitutions for the input files). Descriptions are generated from
          variable names that are hard coded in this method right now. 
          Uses method generateTaskList to create a list of GPremiumApplication's which are invoked from a list of inputs (appropriately adjusted input files), 
          the output directory and some further settings for each run. 
          
          inParaCombos:      List of tuples defining the parameter combinations.
          iteration:         Current iteration number. 
          pathToExecutable:  Path to the executable (the external program to be called). 
          pathToStageDir:    Root path. Usually os.getcwd()
          architecture:      32 or 64 bit.
          baseDir:           Directory in which the input files are located. 
          xVars:             Names of the x variables. 
          solverVerb:        Logger verbosity. 
          problemType:       Forward premium specific flag to determine which case to look at. 
          analyzeResults:    Function to use to analyze the emerging output. 
          ctryList:          Forward premium specific list of ctrys to look at. 
        '''

        logger.debug('entering gParaSearchParalell.__init__')


        # Set up initial variables and set the correct methods.
        self.pathToStageDir = pathToStageDir
        self.problemType = problemType
        self.executable = pathToExecutable
        self.architecture = architecture
        self.baseDir = baseDir
        self.verbosity = solverVerb.upper()
        self.xVars = xVars
        self.n = len(self.xVars.split())
        self.analyzeResults = analyzeResults
        self.ctryList = ctryList
        self.iteration = iteration
        self.jobname = 'evalSolverGuess' + '-' + extra_args['jobname'] + '-' + str(self.iteration)
        self.extra_args = extra_args
        tasks = []

        # --- createJobs_x ---

        # Log activity
        cDate = datetime.date.today()
        cTime = datetime.datetime.time(datetime.datetime.now())
        dateString = '{0:04d}-{1:02d}-{2:02d}-{3:02d}-{4:02d}-{5:02d}'.format(cDate.year, cDate.month, cDate.day, cTime.hour, cTime.minute, cTime.second)
        logger.debug('Establishing parallel task on %s' % dateString)

        # Enter an iteration specific folder
        self.iterationFolder = os.path.join(self.pathToStageDir, 'Iteration-' + str(self.iteration))
        try:
            os.mkdir(self.iterationFolder)
        except OSError:
            print '%s already exists' % self.iterationFolder

        # save population to file
        np.savetxt(os.path.join(self.iterationFolder, 'curPopulation'), inParaCombos, delimiter = '  ')

        # Take the list of parameter combinations and translate them in a comma separated list of values for each variable to be fed into paraLoop file.
        # This can be done much more elegantly with ','.join() but it works...
        vals = []
        nVariables = range(len(inParaCombos[0]))
        for ixVar in nVariables:
            varValString = ''
            for ixParaCombo, paraCombo in enumerate(inParaCombos):
                ### Should make more precise string conversion.
                varValString += str(paraCombo[ixVar])
                if ixParaCombo < len(inParaCombos) - 1:
                    varValString += ', '
            vals.append( varValString )


        # Make problem specific adjustments to the paraLoop file.
        if self.problemType == 'one4all':
            print 'one4all'
            variables = ['Ctry', 'Ctry', 'EA', 'EB', 'sigmaA', 'sigmaB']
            groups    = [ 0, 0, 1, 1, 1, 1 ]
            groupRestrs = [ 'lowerTr', 'lowerTr', 'diagnol', 'diagnol', 'diagnol', 'diagnol' ]
            writeVals = [ ", ".join(self.ctryList), ", ".join(self.ctryList), vals[0], vals[0], vals[1], vals[1] ]
            self.variables = ['EA','sigmaA']
            self.paraCombos = inParaCombos
            paraFiles = [ 'input/markovA.in', 'input/markovB.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in' ]
            paraFileRegex = [ 'space-separated', 'space-separated', 'bar-separated', 'bar-separated' , 'bar-separated' , 'bar-separated'  ]
            self.analyzeResults.tablePath = self.iterationFolder

        elif self.problemType == 'one4eachPair':
            print 'one4eachPair'
            # Check if EA or sigmaA are alone in the specified parameters. If so make diagnol adjustments
            writeVals = []
            if 'EA' in self.xVars and not 'EB' in self.xVars:
                variables = [ 'EA', 'EB' ]
                groups = [ '0', '0' ]
                groupRestrs = [ 'diagnol', 'diagnol' ]

                writeVals.append(vals[0])
                writeVals.append(vals[0])
                paraCombosEA = [  np.append(ele[0], ele[0]) for ele in inParaCombos ]
            if 'sigmaA' in self.xVars and not 'sigmaB' in self.xVars:
                variables.append( 'sigmaA')
                variables.append('sigmaB')
                groups.append( '0')
                groups.append('0')
                groupRestrs.append( 'diagnol')
                groupRestrs.append( 'diagnol' )
                writeVals.append(vals[1])
                writeVals.append(vals[1])
                paraCombosSigmaA = [  np.append(ele[1], ele[1]) for ele in inParaCombos ]

            # match ctry with val
            ctryVals = {}
            for ixCtry, ctry in enumerate(ctryList):
                ctryVals[ctry] = vals

            self.variables = variables

            # Prepare paraCombos matching to resulting table. Used in analyzeOverviewTable
            # !!! This should be dependent on problem type or on missing variables in xvars. !!!
            paraCombos = []
            for EA,sA in zip(paraCombosEA, paraCombosSigmaA):
                paraCombo = np.append(EA, sA)
                paraCombos.append(paraCombo)
            self.paraCombos = paraCombos
            paraFiles = [ 'input/parameters.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in' ]
            paraFileRegex = [  'bar-separated', 'bar-separated' , 'bar-separated' , 'bar-separated'  ]

        elif self.problemType == 'one4eachCtry':
            print 'one4eachCtry'

            ctry1List  = []
            ctry2List  = []
            EAList     = []
            EBList     = []
            sigmaAList = []
            sigmaBList = []
            self.paraCombos = []

            ctryIndices = getIndex([len(ctryList), len(ctryList)], 'lowerTr')
            for ixCombo in range(len(inParaCombos)):
                ctry1ListCombo = []
                ctry2ListCombo = []
                EAListCombo    = []
                EBListCombo    = []
                sigmaAListCombo = []
                sigmaBListCombo = []
                for ctryIndex in ctryIndices:
                    ctry1ListCombo.append(ctryList[ctryIndex[0]])
                    ctry2ListCombo.append(ctryList[ctryIndex[1]])
                    EAListCombo.append(inParaCombos[ixCombo][0 + 2 * ctryIndex[0]])
                    sigmaAListCombo.append(inParaCombos[ixCombo][1 + 2 * ctryIndex[0]])
                    EBListCombo.append(inParaCombos[ixCombo][0 + 2 *ctryIndex[1]])
                    sigmaBListCombo.append(inParaCombos[ixCombo][1 + 2 * ctryIndex[1]])
                self.paraCombos.append(list(zip(
                    ctry1ListCombo, ctry2ListCombo, EAListCombo,
                    sigmaAListCombo, EBListCombo, sigmaBListCombo)))
                ctry1List.extend(ctry1ListCombo)
                ctry2List.extend(ctry2ListCombo)
                EAList.extend(map(str, EAListCombo))
                EBList.extend(map(str, EBListCombo))
                sigmaAList.extend(map(str, sigmaAListCombo))
                sigmaBList.extend(map(str, sigmaBListCombo))

            variables = ['Ctry', 'Ctry', 'EA', 'EB', 'sigmaA', 'sigmaB']
            groups    = [ 0, 0, 0, 0, 0, 0 ]
            groupRestrs = [ 'diagnol', 'diagnol', 'diagnol', 'diagnol', 'diagnol', 'diagnol' ]
            writeVals = [ ", ".join(ctry1List), ", ".join(ctry2List), ", ".join(EAList), ", ".join(EBList), ", ".join(sigmaAList),", ".join(sigmaBList)]
            paraFiles = [ 'input/markovA.in', 'input/markovB.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in', 'input/parameters.in' ]
            paraFileRegex = [ 'space-separated', 'space-separated', 'bar-separated', 'bar-separated' , 'bar-separated' , 'bar-separated'  ]
            #self.paraCombos = inParaCombos
            self.analyzeResults.tablePath = self.iterationFolder
            # variable list passed to analyzeOverviewTables
            self.variables = ['EA', 'sigmaA', 'EB', 'sigmaB']
            print('Done setting up one4eachCtry.')

        # Write a para.loop file to generate grid jobs
        para_loop = self.writeParaLoop(variables = variables,
                                       groups = groups,
                                       groupRestrs = groupRestrs,
                                       vals = writeVals,
                                       desPath = os.path.join(self.iterationFolder, 'para.loopTmp'),
                                       paraFiles = paraFiles,
                                       paraFileRegex = paraFileRegex)

        paraLoop_fp.__init__(self, verbosity = self.verbosity)
        tasks = self.generateTaskList(para_loop, self.iterationFolder)
        ParallelTaskCollection.__init__(self, self.jobname, tasks)
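Note: the `getIndex([n, n], 'lowerTr')` call above is assumed to enumerate
index pairs from the lower triangle of an n-by-n country grid, so that each
unordered country pair is visited exactly once. A minimal sketch of such a
helper (hypothetical, not the original implementation):

def lower_triangle_indices(n):
    # yield (row, col) pairs from the strict lower triangle of an
    # n x n grid, i.e. each unordered pair exactly once
    for row in range(n):
        for col in range(row):
            yield (row, col)

# e.g. for three countries: (1, 0), (2, 0), (2, 1)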
Example #43
0
    def __init__(self, num_tasks, **extra_args):
        tasks = [SuccessfulApp('stage{n}'.format(n=n)) for n in range(num_tasks)]
        ParallelTaskCollection.__init__(self, tasks, **extra_args)
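A self-contained sketch of the same pattern, with explicit imports;
`EchoApp` is a hypothetical stand-in for the SuccessfulApp test stub,
which is not shown here:

from gc3libs import Application
from gc3libs.workflow import ParallelTaskCollection

class EchoApp(Application):
    # trivial task that runs /bin/echo and always succeeds
    def __init__(self, name):
        Application.__init__(
            self,
            arguments=['/bin/echo', name],
            inputs=[],
            outputs=[],
            output_dir=(name + '.d'),
            stdout='stdout.txt')

class DemoParallel(ParallelTaskCollection):
    # run `num_tasks` independent EchoApp tasks in parallel
    def __init__(self, num_tasks, **extra_args):
        tasks = [EchoApp('stage{n}'.format(n=n)) for n in range(num_tasks)]
        ParallelTaskCollection.__init__(self, tasks, **extra_args)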
Example #44
0
    def stage1(self):
        """
        Start this stage iff stage0 completed entirely (i.e. no failures).
        Combine all .g.vcf files together, grouped in blocks
        (e.g. 30 out of the total 300).
        * make grouping an option for stage1
        * use the same GATK and goat genome version as in stage0
        Run the "combine_gvcf" script; it can take an arbitrary number
        of .g.vcf files and produces a single combined .g.vcf file.
        End of stage1: 10 .g.vcf files.
        If a run fails because of heap size, re-run it with more memory.
        Walltime: 2 days each.
        Cores required: 10.
        Memory: 500 GB top (still needs checking); 128 GB.
        Example script:
java -jar  /home/dleigh/GenomeAnalysisTK-3.1-1/GenomeAnalysisTK-3.4-46/GenomeAnalysisTK.jar \
    -T CombineGVCFs \
    -R /home/dleigh/goatgenome/01.GENOME/scaffold/goat_scaffoldFG_V1.1.normalised.22.07.fa \
--variant /home/dleigh/demultiplexed.reads/GATK/GR0766.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1380.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1387.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1390.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1422.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1424.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1440.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1441.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1709.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1728.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1938.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1939.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1997.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2001.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2053.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2055.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2056.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0038.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0047.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0101.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0242.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0258.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0261.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0306.g.vcf \
-o /home/dleigh/demultiplexed.reads/GATK/combined3.g.vcf

        Get the list of all outputs in the 'outputs0' folder, group them
        in 's1_chunk' blocks, and run GATKS1Application for each group.
        """
        # XXX: add check if stage0 completed properly
        # Stop otherwise

        tasks = []

        for (vcf_group,index) in get_vcf_group(self.extra['S0_output'],
                                               int(self.extra['S1_group'])):
            extra_args = self.extra.copy()
            extra_args['jobname'] = "gatk-s1-%d" % index

            for placeholder in ('NAME', 'SESSION', 'DATE', 'TIME'):
                extra_args['output_dir'] = extra_args['output_dir'].replace(
                    placeholder, extra_args['jobname'])

            gc3libs.log.debug("Creating Stage1 task for index %d", index)

            tasks.append(GATKS1Application(
                vcf_group,
                index,
                **extra_args))

        return ParallelTaskCollection(tasks)
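The `get_vcf_group` helper used above is not shown in this example. A
plausible sketch, assuming `S0_output` is a directory holding the stage0
.g.vcf results and `S1_group` is the chunk size:

import os

def get_vcf_group(s0_output_dir, group_size):
    # hypothetical reconstruction: collect the stage0 .g.vcf outputs
    # and yield (chunk, index) pairs of at most `group_size` files each
    vcfs = sorted(os.path.join(s0_output_dir, name)
                  for name in os.listdir(s0_output_dir)
                  if name.endswith('.g.vcf'))
    for index, start in enumerate(range(0, len(vcfs), group_size)):
        yield vcfs[start:start + group_size], index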
Example #45
0
    def __init__(self, executable, input_values_file,
                 iteration, total_iterations,
                 slice_size=0, datadir=TMPDIR, extra={ },
                 parent=None):
        """
        Create a new task that runs `executable` over the set of
        values contained in file `input_values_file` (one
        floating-point number per line).

        If `slice_size` is a positive integer, chop the input into
        chunks of at most the given size and compute them as
        separate independent jobs.

        Any other argument is passed unchanged to the
        `ParallelTaskCollection` ctor.
        """
        assert slice_size >= 0, \
               "Argument `slice_size` to ValueFunctionIterationPass.__init__" \
               " must be a non-negative integer."
        assert isinstance(extra, dict), \
               "Argument `extra` to ValueFunctionIterationPass.__init__" \
               " must be a dictionary instance."

        self.input_values = input_values_file
        self.output_values = None

        total_input_values = _count_input_values(input_values_file)

        if slice_size < 1:
            # trick to make the for-loop below work in the case of one
            # slice only
            slice_size = total_input_values

        # pad numbers with correct amount of zeros, so they look
        # sorted in plain `ls -l` output
        fmt = '%%0%dd' % (1 + int(math.log10(float(total_iterations))))
        self.jobname = ("%s.%s"
                        % ((parent or gc3libs.utils.basename_sans(input_values_file)),
                           (fmt % iteration)))

        # create data sub-directory
        datasubdir = os.path.join(datadir, self.jobname)
        if not os.path.exists(datasubdir):
            os.makedirs(datasubdir)

        # build list of tasks
        tasks = [ ]
        for start in range(0, total_input_values, slice_size):
            # create new job to handle this slice of values
            extra_args = extra.copy()
            extra_args['parent'] = self.jobname
            tasks.append(
                ValueFunctionIterationApplication(
                    executable,
                    input_values_file,
                    iteration,
                    total_iterations,
                    # each task computes values with i in range
                    # `start..end` (inclusive), and `end` is
                    # generally `slice_size` elements after `start`
                    start,
                    end=min(start + slice_size - 1, total_input_values),
                    output_dir = datasubdir,
                    **extra_args
                    )
                )

        # actually init jobs
        ParallelTaskCollection.__init__(self, self.jobname, tasks)
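The zero-padding trick used for `self.jobname` above can be checked in
isolation; a minimal sketch:

import math

# the field width grows with the number of iterations, so job names
# sort correctly in plain `ls -l` output
total_iterations = 250
fmt = '%%0%dd' % (1 + int(math.log10(float(total_iterations))))
print(fmt % 7)    # -> '007'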