Example #1
    def _start_merger(self):
        '''Blocks until the two "subjobs" are done
        (with the new implementation they will already be done before this is called),
        creates the merger jobs and puts them in the job queue,
        cleans up the alignment subdirectories,
        signals the event indicating that the merge jobs are on the queue,
        and then returns.

        Called by wait()
        '''
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
        assert (self.subjob1 is not None)
        result1 = self.subjob1.get_results()
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
        self.subjob1 = None
        assert (self.subjob2 is not None)
        result2 = self.subjob2.get_results()
        self.subjob2 = None
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
        assert (result1.get_num_loci() == result2.get_num_loci())

        mj_list = []
        for n, r1 in enumerate(result1):
            r2 = result2[n]
            cs = self.context_str + " merger" + str(n)
            mj = self.pasta_team.merger.create_job(
                r1,
                r2,
                tmp_dir_par=self.tmp_dir_par,
                delete_temps=self.delete_temps,
                context_str=cs)
            mj.add_parent_tickable_job(self)
            self.add_child(mj)

            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            mj_list.append(mj)

        self.merge_job_list = mj_list
        for mj in mj_list:
            jobq.put(mj)

        if self.delete_temps:
            for d in self._dirs_to_cleanup:
                self.pasta_team.temp_fs.remove_dir(d)

        self._merge_queued_event.set()
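
The pattern in _start_merger above (queue a batch of jobs, then set an event so a waiter knows the whole batch is on the queue) can be reproduced with the standard library alone. The sketch below is not PASTA's scheduler; jobq, worker, and the job callables are simplified stand-ins used only to illustrate the queue-then-signal idea.

import queue
import threading

jobq = queue.Queue()
merge_queued_event = threading.Event()

def worker():
    while True:
        job = jobq.get()
        if job is None:              # sentinel: shut the worker down
            break
        job()
        jobq.task_done()

def start_merger(merge_jobs):
    for mj in merge_jobs:            # queue every merger job...
        jobq.put(mj)
    merge_queued_event.set()         # ...then signal that they are all queued

def wait_for_mergers():
    merge_queued_event.wait()        # block until the batch is on the queue
    jobq.join()                      # then until every queued job has run

threading.Thread(target=worker, daemon=True).start()
start_merger([lambda i=i: print("merging pair %d" % i) for i in range(3)])
wait_for_mergers()
jobq.put(None)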
Example #2
    def _start_merger(self):
        '''Blocks until the two "subjobs" are done
        (with the new implementation they will already be done before this is called),
        creates the merger jobs and puts them in the job queue,
        cleans up the alignment subdirectories,
        signals the event indicating that the merge jobs are on the queue,
        and then returns.

        Called by wait()
        '''
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
        assert(self.subjob1 is not None)
        result1 = self.subjob1.get_results()
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
        self.subjob1 = None
        assert(self.subjob2 is not None)
        result2 = self.subjob2.get_results()
        self.subjob2 = None
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
        assert(result1.get_num_loci() == result2.get_num_loci())
        
        mj_list = []
        for n, r1 in enumerate(result1):
            r2 = result2[n]
            cs = self.context_str + " merger" + str(n)
            mj = self.pasta_team.merger.create_job(r1,
                                                  r2,
                                                  tmp_dir_par=self.tmp_dir_par,
                                                  delete_temps=self.delete_temps,
                                                  context_str=cs)
            mj.add_parent_tickable_job(self)
            self.add_child(mj)
                        
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            mj_list.append(mj)

        self.merge_job_list = mj_list
        for mj in mj_list:
            jobq.put(mj)

        if self.delete_temps:
            for d in self._dirs_to_cleanup:
                self.pasta_team.temp_fs.remove_dir(d)

        self._merge_queued_event.set()
Example #3
    def run(self, tmp_dir_par, pasta_products=None):
        assert(os.path.exists(tmp_dir_par))

        self._reset_current_run_settings()
        self._reset_jobs()

        self.start_time = time.time()
        self.last_improvement_time = self.start_time

        num_non_update_iter = 0

        configuration = self.configuration()
        # If the cap implied by max_subproblem_frac exceeds max_subproblem_size, it takes precedence
        frac_max = int(math.ceil(self.max_subproblem_frac*self.tree.n_leaves))
        if frac_max > self.max_subproblem_size:
            configuration['max_subproblem_size'] = frac_max
        MESSENGER.send_info('Max subproblem set to {0}'.format(
                configuration['max_subproblem_size']))
        if configuration['max_subproblem_size'] >= self.tree.n_leaves:
            MESSENGER.send_warning('''\n
WARNING: you have specified a max subproblem ({0}) that is equal to or greater
    than the number of taxa ({1}). Thus, the PASTA algorithm will not be invoked
    under the current configuration (i.e., no tree decomposition will occur).
    If you did not intend for this behavior (which you probably did not since
    you are using PASTA) please adjust your settings for the max subproblem and
    try running PASTA again. If you intended to use PASTA to align your data with
    the specified aligner tool *without* any decomposition, you can ignore this
    message.\n'''.format(configuration['max_subproblem_size'],
                       self.tree.n_leaves))
        if configuration['max_subproblem_size'] == 1:
            MESSENGER.send_error(''' You have specified a max subproblem size of 1. PASTA requires a max subproblem size of at least 2.  ''')
            sys.exit(1)

        delete_iteration_temps = not self.keep_iteration_temporaries
        delete_realignment_temps = delete_iteration_temps or (not self.keep_realignment_temporaries)
        configuration['delete_temps'] = delete_realignment_temps

        while self._keep_iterating():
            record_timestamp(os.path.join(tmp_dir_par, 'start_pastaiter_timestamp.txt'))

            # create a subdirectory for this iteration
            curr_iter_tmp_dir_par = os.path.join(tmp_dir_par, 'step' + str(self.current_iteration))
            curr_iter_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(curr_iter_tmp_dir_par)
            _LOG.debug('directory %s created' % curr_iter_tmp_dir_par)
            break_strategy_index = 0
            this_iter_score_improved = False

            while True:
                break_strategy = self._get_break_strategy(break_strategy_index)
                if not bool(break_strategy):
                    break
                context_str = "iter%d-%s" % (self.current_iteration, break_strategy)
                # create a subdirectory for this iteration/break_strategy
                curr_tmp_dir_par = os.path.join(curr_iter_tmp_dir_par, break_strategy)
                curr_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(curr_tmp_dir_par)
                record_timestamp(os.path.join(curr_tmp_dir_par, 'start_align_timestamp.txt'))
                # Align (with decomposition...)
                self.status('Step %d. Realigning with decomposition strategy set to %s' % (self.current_iteration, break_strategy))
                if self.killed:
                    raise RuntimeError("PASTA Job killed")
                tree_for_aligner = self.get_tree_copy()
                aligner = PASTAAlignerJob(multilocus_dataset=self.multilocus_dataset,
                                         pasta_team=self.pasta_team,
                                         tree=tree_for_aligner,
                                         tmp_base_dir=curr_tmp_dir_par,
                                         reset_recursion_index=True,
                                         skip_merge=self.pastamerge,
                                         **configuration)
                self.pasta_aligner_job = aligner
                aligner.launch_alignment(break_strategy=break_strategy,
                                         context_str=context_str)                
                if self.pastamerge:
                    _LOG.debug("Build PASTA merge jobs")
                    subsets_tree = self.build_subsets_tree(curr_tmp_dir_par)
                    if len(self.pasta_team.subsets.values()) == 1:
                        # can happen if there are no decompositions
                        for job in self.pasta_team.alignmentjobs:
                            jobq.put(job)
                        new_multilocus_dataset = list(self.pasta_team.subsets.values())[0].get_results()
                    else:
                        pariwise_tmp_dir_par = os.path.join(curr_tmp_dir_par, "pw")
                        pariwise_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(pariwise_tmp_dir_par)    
                        pmj = PASTAMergerJob(multilocus_dataset=self.multilocus_dataset,
                                             pasta_team=self.pasta_team,
                                             tree=subsets_tree,
                                             tmp_base_dir=pariwise_tmp_dir_par,
                                             reset_recursion_index=True,   
                                             #delete_temps2=False,                                      
                                             **configuration)
                                                
                        pmj.launch_alignment(context_str=context_str)
                        
                        # Start alignment jobs
                        for job in self.pasta_team.alignmentjobs:
                            jobq.put(job)
                            
                            
                        new_multilocus_dataset = pmj.get_results()
                        del pmj  
                    
                    self.pasta_team.alignmentjobs = []
                    self.pasta_team.subsets = {}                                                                  
                else:          
                    new_multilocus_dataset = aligner.get_results()
                
                _LOG.debug("Alignment obtained. Preparing for tree.")
                self.pasta_aligner_job = None
                del aligner

                record_timestamp(os.path.join(curr_tmp_dir_par, 'start_treeinference_timestamp.txt'))
                # Tree inference
                if self.start_tree_search_from_current:
                    start_from = self.tree
                else:
                    start_from = None
                self.status('Step %d. Alignment obtained. Tree inference beginning...' % (self.current_iteration))
                if self.killed:
                    raise RuntimeError("PASTA Job killed")                             
            
                tbj = self.pasta_team.tree_estimator.create_job(new_multilocus_dataset,
                                                               starting_tree=start_from,
                                                               num_cpus=self.num_cpus,
                                                               context_str=context_str + " tree",
                                                               tmp_dir_par=curr_tmp_dir_par,
                                                               delete_temps=delete_iteration_temps,
                                                               pasta_products=pasta_products,
                                                               step_num=self.current_iteration,
                                                               mask_gappy_sites = self.mask_gappy_sites)
                prev_curr_align = self.curr_iter_align_tmp_filename
                prev_curr_tree = self.curr_iter_tree_tmp_filename
                self.curr_iter_align_tmp_filename = pasta_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
                self.curr_iter_tree_tmp_filename = pasta_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_TREE_TAG, allow_existing=True)

                self.tree_build_job = tbj
                jobq.put(tbj)
                new_score, new_tree_str = tbj.get_results()
                self.tree_build_job = None
                del tbj
                if self.killed:
                    raise RuntimeError("PASTA Job killed")

                record_timestamp(os.path.join(curr_tmp_dir_par, 'end_treeinference_timestamp.txt'))
                curr_timestamp = time.time()
                accept_iteration = False

                if self.score is None:
                    self.score = new_score

                if self.best_score is None or new_score > self.best_score:
                    self.store_optimum_results(new_multilocus_dataset,
                            new_tree_str,
                            new_score,
                            curr_timestamp)
                    this_iter_score_improved = True
                    accept_iteration = True

                if self._get_accept_mode(new_score=new_score, break_strategy_index=break_strategy_index) == AcceptMode.BLIND_MODE:
                    if self.blind_mode_is_final:
                        self.is_stuck_in_blind = True
                        if self.switch_to_blind_timestamp is None:
                            if self._blindmode_trigger:
                                _LOG.debug("Blind runmode trigger = %s" % self._blindmode_trigger)
                            self.switch_to_blind_iter = self.current_iteration
                            self.switch_to_blind_timestamp = curr_timestamp
                    accept_iteration = True

                if accept_iteration:
                    self.score = new_score
                    self.multilocus_dataset = new_multilocus_dataset
                    self.tree_str = new_tree_str
                    if this_iter_score_improved:
                        self.status('realignment accepted and score improved.')
                    else:
                        self.status('realignment accepted despite the score not improving.')
                    # we do not want to continue to try different breaking strategies for this iteration so we break
                    self.status('current score: %s, best score: %s' % (self.score, self.best_score) )
                    break
                else:
                    self.status('realignment NOT accepted.')
                    self.curr_iter_align_tmp_filename = prev_curr_align
                    self.curr_iter_tree_tmp_filename = prev_curr_tree 

                break_strategy_index += 1

                # self.status('current score: %s, best score: %s' % (self.score, self.best_score) )
                
            if not this_iter_score_improved:
                self.num_iter_since_imp += 1
            self.current_iteration += 1

        if self._termination_trigger:
            _LOG.debug("Termination trigger = %s" % self._termination_trigger)
        record_timestamp(os.path.join(tmp_dir_par, 'end_pastaiter_timestamp.txt'))

        ### TODO: if configuration is 'return_final_iter_TreeAndAlignpair', then skip the following three lines
        if not self.return_final_tree_and_alignment:
            self.multilocus_dataset = self.best_multilocus_dataset.new_with_shared_meta()
            for locus_alignment in self.best_multilocus_dataset:
                self.multilocus_dataset.append(copy.copy(locus_alignment))
            self.tree_str = self.best_tree_str
            self.score = self.best_score
        else:
            assert self.multilocus_dataset is not None
            assert self.tree_str is not None
            assert self.score is not None
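
The cap computed at the top of run() is easy to get backwards, so a small hedged sketch may help: frac_max is the ceiling of max_subproblem_frac times the leaf count, and it replaces max_subproblem_size only when it is larger. The helper and the numbers below are illustrative, not part of PASTA.

import math

def effective_max_subproblem_size(max_subproblem_size, max_subproblem_frac, n_leaves):
    # Mirrors the check at the top of run(): the fraction-based cap wins
    # only when it exceeds the configured absolute cap.
    frac_max = int(math.ceil(max_subproblem_frac * n_leaves))
    return frac_max if frac_max > max_subproblem_size else max_subproblem_size

# With 200 leaves, a 10% fraction (ceil(0.1 * 200) = 20) overrides a
# configured cap of 15 but not a configured cap of 50.
assert effective_max_subproblem_size(15, 0.1, 200) == 20
assert effective_max_subproblem_size(50, 0.1, 200) == 50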
Example #4
    def run(self, tmp_dir_par, pasta_products=None):
        assert (os.path.exists(tmp_dir_par))

        self._reset_current_run_settings()
        self._reset_jobs()

        self.start_time = time.time()
        self.last_improvement_time = self.start_time

        num_non_update_iter = 0

        configuration = self.configuration()
        # If the cap implied by max_subproblem_frac exceeds max_subproblem_size, it takes precedence
        frac_max = int(math.ceil(self.max_subproblem_frac *
                                 self.tree.n_leaves))
        if frac_max > self.max_subproblem_size:
            configuration['max_subproblem_size'] = frac_max
        MESSENGER.send_info('Max subproblem set to {0}'.format(
            configuration['max_subproblem_size']))
        if configuration['max_subproblem_size'] >= self.tree.n_leaves:
            MESSENGER.send_warning('''\n
WARNING: you have specified a max subproblem ({0}) that is equal to or greater
    than the number of taxa ({1}). Thus, the PASTA algorithm will not be invoked
    under the current configuration (i.e., no tree decomposition will occur).
    If you did not intend for this behavior (which you probably did not since
    you are using PASTA) please adjust your settings for the max subproblem and
    try running PASTA again. If you intended to use PASTA to align your data with
    the specified aligner tool *without* any decomposition, you can ignore this
    message.\n'''.format(configuration['max_subproblem_size'],
                         self.tree.n_leaves))
        if configuration['max_subproblem_size'] == 1:
            MESSENGER.send_error(
                ''' You have specified a max subproblem size of 1. PASTA requires a max subproblem size of at least 2.  '''
            )
            sys.exit(1)

        delete_iteration_temps = not self.keep_iteration_temporaries
        delete_realignment_temps = delete_iteration_temps or (
            not self.keep_realignment_temporaries)
        configuration['delete_temps'] = delete_realignment_temps

        while self._keep_iterating():
            record_timestamp(
                os.path.join(tmp_dir_par, 'start_pastaiter_timestamp.txt'))

            # create a subdirectory for this iteration
            curr_iter_tmp_dir_par = os.path.join(
                tmp_dir_par, 'step' + str(self.current_iteration))
            curr_iter_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(
                curr_iter_tmp_dir_par)
            _LOG.debug('directory %s created' % curr_iter_tmp_dir_par)
            break_strategy_index = 0
            this_iter_score_improved = False

            while True:
                break_strategy = self._get_break_strategy(break_strategy_index)
                if not bool(break_strategy):
                    break
                context_str = "iter%d-%s" % (self.current_iteration,
                                             break_strategy)
                # create a subdirectory for this iteration/break_strategy
                curr_tmp_dir_par = os.path.join(curr_iter_tmp_dir_par,
                                                break_strategy)
                curr_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(
                    curr_tmp_dir_par)
                record_timestamp(
                    os.path.join(curr_tmp_dir_par,
                                 'start_align_timestamp.txt'))
                # Align (with decomposition...)
                self.status(
                    'Step %d. Realigning with decomposition strategy set to %s'
                    % (self.current_iteration, break_strategy))
                if self.killed:
                    raise RuntimeError("PASTA Job killed")
                tree_for_aligner = self.get_tree_copy()
                aligner = PASTAAlignerJob(
                    multilocus_dataset=self.multilocus_dataset,
                    pasta_team=self.pasta_team,
                    tree=tree_for_aligner,
                    tmp_base_dir=curr_tmp_dir_par,
                    reset_recursion_index=True,
                    skip_merge=self.pastamerge,
                    **configuration)
                self.pasta_aligner_job = aligner
                aligner.launch_alignment(break_strategy=break_strategy,
                                         context_str=context_str)
                if self.pastamerge:
                    _LOG.debug("Build PASTA merge jobs")
                    subsets_tree = self.build_subsets_tree(
                        curr_tmp_dir_par, self.build_MST)
                    if len(self.pasta_team.subsets) == 1:
                        # can happen if there are no decompositions
                        for job in self.pasta_team.alignmentjobs:
                            jobq.put(job)
                        new_multilocus_dataset = list(
                            self.pasta_team.subsets.values())[0].get_results()
                    else:
                        pariwise_tmp_dir_par = os.path.join(
                            curr_tmp_dir_par, "pw")
                        pariwise_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(
                            pariwise_tmp_dir_par)
                        pmj = PASTAMergerJob(
                            multilocus_dataset=self.multilocus_dataset,
                            pasta_team=self.pasta_team,
                            tree=subsets_tree,
                            tmp_base_dir=pariwise_tmp_dir_par,
                            reset_recursion_index=True,
                            #delete_temps2=False,
                            **configuration)

                        pmj.launch_alignment(context_str=context_str)

                        # Start alignment jobs
                        for job in self.pasta_team.alignmentjobs:
                            jobq.put(job)

                        new_multilocus_dataset = pmj.get_results()
                        del pmj

                    self.pasta_team.alignmentjobs = []
                    self.pasta_team.subsets = {}
                else:
                    new_multilocus_dataset = aligner.get_results()

                _LOG.debug("Alignment obtained. Preparing for tree.")
                self.pasta_aligner_job = None
                del aligner

                record_timestamp(
                    os.path.join(curr_tmp_dir_par,
                                 'start_treeinference_timestamp.txt'))
                # Tree inference
                if self.start_tree_search_from_current:
                    start_from = self.tree
                else:
                    start_from = None
                self.status(
                    'Step %d. Alignment obtained. Tree inference beginning...'
                    % (self.current_iteration))
                if self.killed:
                    raise RuntimeError("PASTA Job killed")

                tbj = self.pasta_team.tree_estimator.create_job(
                    new_multilocus_dataset,
                    starting_tree=start_from,
                    num_cpus=self.num_cpus,
                    context_str=context_str + " tree",
                    tmp_dir_par=curr_tmp_dir_par,
                    delete_temps=delete_iteration_temps,
                    pasta_products=pasta_products,
                    step_num=self.current_iteration,
                    mask_gappy_sites=self.mask_gappy_sites)
                prev_curr_align = self.curr_iter_align_tmp_filename
                prev_curr_tree = self.curr_iter_tree_tmp_filename
                self.curr_iter_align_tmp_filename = pasta_products.get_abs_path_for_iter_output(
                    self.current_iteration,
                    TEMP_SEQ_ALIGNMENT_TAG,
                    allow_existing=True)
                self.curr_iter_tree_tmp_filename = pasta_products.get_abs_path_for_iter_output(
                    self.current_iteration, TEMP_TREE_TAG, allow_existing=True)

                self.tree_build_job = tbj
                jobq.put(tbj)
                new_score, new_tree_str = tbj.get_results()
                self.tree_build_job = None
                del tbj
                if self.killed:
                    raise RuntimeError("PASTA Job killed")

                record_timestamp(
                    os.path.join(curr_tmp_dir_par,
                                 'end_treeinference_timestamp.txt'))
                curr_timestamp = time.time()
                accept_iteration = False

                if self.score is None:
                    self.score = new_score

                if self.best_score is None or new_score > self.best_score:
                    self.store_optimum_results(new_multilocus_dataset,
                                               new_tree_str, new_score,
                                               curr_timestamp)
                    this_iter_score_improved = True
                    accept_iteration = True

                if self._get_accept_mode(
                        new_score=new_score,
                        break_strategy_index=break_strategy_index
                ) == AcceptMode.BLIND_MODE:
                    if self.blind_mode_is_final:
                        self.is_stuck_in_blind = True
                        if self.switch_to_blind_timestamp is None:
                            if self._blindmode_trigger:
                                _LOG.debug("Blind runmode trigger = %s" %
                                           self._blindmode_trigger)
                            self.switch_to_blind_iter = self.current_iteration
                            self.switch_to_blind_timestamp = curr_timestamp
                    accept_iteration = True

                if accept_iteration:
                    self.score = new_score
                    self.multilocus_dataset = new_multilocus_dataset
                    self.tree_str = new_tree_str
                    if this_iter_score_improved:
                        self.status('realignment accepted and score improved.')
                    else:
                        self.status(
                            'realignment accepted despite the score not improving.'
                        )
                    # we do not want to continue to try different breaking strategies for this iteration so we break
                    self.status('current score: %s, best score: %s' %
                                (self.score, self.best_score))
                    break
                else:
                    self.status('realignment NOT accepted.')
                    self.curr_iter_align_tmp_filename = prev_curr_align
                    self.curr_iter_tree_tmp_filename = prev_curr_tree

                break_strategy_index += 1

                # self.status('current score: %s, best score: %s' % (self.score, self.best_score) )

            if not this_iter_score_improved:
                self.num_iter_since_imp += 1
            self.current_iteration += 1

        if self._termination_trigger:
            _LOG.debug("Termination trigger = %s" % self._termination_trigger)
        record_timestamp(
            os.path.join(tmp_dir_par, 'end_pastaiter_timestamp.txt'))

        ### TODO: if configuration is 'return_final_iter_TreeAndAlignpair', then skip the following three lines
        if not self.return_final_tree_and_alignment:
            self.multilocus_dataset = self.best_multilocus_dataset.new_with_shared_meta(
            )
            for locus_alignment in self.best_multilocus_dataset:
                self.multilocus_dataset.append(copy.copy(locus_alignment))
            self.tree_str = self.best_tree_str
            self.score = self.best_score
        else:
            assert self.multilocus_dataset is not None
            assert self.tree_str is not None
            assert self.score is not None
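
The acceptance logic buried in the inner loop reduces to two conditions: a realignment is accepted when its score beats the best seen so far, or unconditionally when the run is in blind mode. Below is a minimal sketch of just that decision with made-up scores; the real loop also records best results, timestamps, and blind-mode bookkeeping.

def accept_realignment(new_score, best_score, blind_mode):
    # Returns (accept_iteration, score_improved), mirroring the flags
    # used inside the inner while loop above.
    score_improved = best_score is None or new_score > best_score
    return score_improved or blind_mode, score_improved

assert accept_realignment(-1250.0, -1300.0, blind_mode=False) == (True, True)
assert accept_realignment(-1400.0, -1300.0, blind_mode=False) == (False, False)
assert accept_realignment(-1400.0, -1300.0, blind_mode=True) == (True, False)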
Example #5
    def run(self, *args, **kwargs):
        start_worker(1)
        job = self.create_job(*args, **kwargs)
        jobq.put(job)
        return job.get_results()
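
This run() helper shows the blocking dispatch pattern: start a worker, enqueue a single job, and block on its results. The sketch below reproduces that shape with the standard library; Job, start_worker, and jobq here are simplified stand-ins, not PASTA's scheduler API.

import queue
import threading

jobq = queue.Queue()

class Job(object):
    # Minimal dispatchable job: start() computes and stores the result,
    # get_results() blocks until it is available.
    def __init__(self, fn, *args):
        self._fn, self._args = fn, args
        self._done = threading.Event()
        self._result = None

    def start(self):
        self._result = self._fn(*self._args)
        self._done.set()

    def get_results(self):
        self._done.wait()
        return self._result

def _worker_loop():
    while True:
        jobq.get().start()

def start_worker(num_workers):
    for _ in range(num_workers):
        threading.Thread(target=_worker_loop, daemon=True).start()

def run_blocking(fn, *args):
    # Same shape as the run() above: one worker, one job, block on results.
    start_worker(1)
    job = Job(fn, *args)
    jobq.put(job)
    return job.get_results()

print(run_blocking(sum, [1, 2, 3]))   # prints 6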
Example #6
    def launch_alignment(self,
                         tree=None,
                         break_strategy=None,
                         context_str=None):
        '''Puts alignment job(s) in the queue and then returns None.

        get_results() must be called to get the alignment. Note that this call
        may not be trivial in terms of time (the tree will be decomposed, and
        lots of temporary files may be written), but it does not block until the
        alignments complete. Rather, it queues the alignment jobs so that
        multiple processors can be exploited if they are available.
        '''
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")

        if break_strategy is not None:
            self.break_strategy = break_strategy
        break_strategy = self.break_strategy
        if tree is not None:
            self.tree = tree
        self.expected_number_of_taxa = self.multilocus_dataset.get_num_taxa(
        )  # for debugging purposes
        self._reset_jobs()
        prefix = "self.multilocus_dataset.get_num_taxa = %d" % self.expected_number_of_taxa
        self.context_str = context_str
        if self.context_str is None:
            self.context_str = ''
        _LOG.debug(
            "Comparing expected_number_of_taxa=%d and max_subproblem_size=%d\n"
            % (self.expected_number_of_taxa, self.max_subproblem_size))

        if self.expected_number_of_taxa <= self.max_subproblem_size:
            _LOG.debug("%s...Calling Aligner" % prefix)
            aj_list = []
            for index, single_locus_sd in enumerate(self.multilocus_dataset):
                aj = self.pasta_team.aligner.create_job(
                    single_locus_sd,
                    tmp_dir_par=self.tmp_dir_par,
                    delete_temps=self.delete_temps,
                    context_str=self.context_str + " align" + str(index))
                aj.add_parent_tickable_job(self)
                self.add_child(aj)

                aj_list.append(aj)
                if self.killed:
                    raise RuntimeError("PastaAligner Job killed")

                self.pasta_team.alignmentjobs.append(aj)

            self.align_job_list = aj_list

            if self.skip_merge:
                for taxa in self.tree.leaf_node_names():
                    self.pasta_team.subsets[taxa] = self
            else:
                for aj in aj_list:
                    jobq.put(aj)
        else:
            # added by uym2 on August 1st 2017
            subjob1, subjob2 = self.bipartition_by_tree(break_strategy)
            if subjob1 is None or subjob2 is None:
                return
            _LOG.debug("%s...Recursing" % prefix)
            # create the subjobs
            # the next line was modified by uym2 (August 1st 2017)
            self.subjob1 = subjob1
            self.subjob2 = subjob2
            # store this dir so we can use it in the merger
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")

            self.subjob1.add_parent(self)
            self.subjob2.add_parent(self)
            self.add_child(self.subjob1)
            self.add_child(self.subjob2)

            self.subjob1.launch_alignment(break_strategy=break_strategy)
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            self.subjob2.launch_alignment(break_strategy=break_strategy)
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
Example #7
    def run(self, tmp_dir_par, pasta_products=None):
        '''
        This is to be called from the main pasta method ONLY
        '''
        if self.tmp_dir_par is None:
            self.tmp_dir_par = tmp_dir_par
        else:
            tmp_dir_par = self.tmp_dir_par

        configuration, delete_iteration_temps = self.run_start(tmp_dir_par, pasta_products)
        # self.resumable=False


        while self._keep_iterating():
            if not self.resumable:
                record_timestamp(os.path.join(tmp_dir_par, 'start_pastaiter_timestamp.txt'))

                # create a subdirectory for this iteration
                curr_iter_tmp_dir_par = os.path.join(tmp_dir_par, 'step' + str(self.current_iteration))
                curr_iter_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(curr_iter_tmp_dir_par)
                _LOG.debug('directory %s created' % curr_iter_tmp_dir_par)
                self.break_strategy_index = 0
                this_iter_score_improved = False


                break_strategy = self._get_break_strategy(self.break_strategy_index)
                if not bool(break_strategy):
                    print("breaking from PastaInteruptableJob...")
                    break
                context_str = "iter%d-%s" % (self.current_iteration, break_strategy)
                self.context_str = context_str
                # create a subdirectory for this iteration/break_strategy
                curr_tmp_dir_par = os.path.join(curr_iter_tmp_dir_par, break_strategy)
                curr_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(curr_tmp_dir_par)
                self.curr_tmp_dir_par = curr_tmp_dir_par

                record_timestamp(os.path.join(curr_tmp_dir_par, 'start_align_timestamp.txt'))
                # Align (with decomposition...)
                self.status('Step %d. Realigning with decomposition strategy set to %s' % (self.current_iteration, break_strategy))
                if self.killed:
                    raise RuntimeError("PASTA Job killed")
                tree_for_aligner = self.get_tree_copy()
                aligner = PASTAInterruptibleAlignerJob(multilocus_dataset=self.multilocus_dataset,
                                         pasta_team=self.pasta_team,
                                         tree=tree_for_aligner,
                                         tmp_base_dir=curr_tmp_dir_par,
                                         reset_recursion_index=True,
                                         skip_merge=self.pastamerge,
                                         **configuration)
                self.pasta_aligner_job = aligner
                aligner.launch_alignment(break_strategy=break_strategy,
                                         context_str=context_str)

                # write jobs list
                aln_job_list = pasta_products.get_abs_path_for_iter_output(self.current_iteration, 'alnjoblist.txt')
                aln_job_list_file = open(aln_job_list, 'w')
                self.aln_job_list_dict = []
                for aj in self.pasta_team.alignmentjobs:
                    self.aln_job_list_dict.append({
                        'file_read_job':True,
                        'alignedfn':aj.alignedfn,
                        'seqfn': aj.seqfn,
                        'scratch_dir': aj.scratch_dir,
                        'datatype': aj.datatype,
                        'context_str': aj.context_str
                    })
                    aln_job_list_file.write('%s,%s,%s\n' % (aj.scratch_dir, aj.seqfn, aj.alignedfn))
                aln_job_list_file.close()
                self.resumable = True
                # self.pasta_team.alignmentjobs=[]
                return aln_job_list, self.resumable
            else:
                # for ajd in self.aln_job_list_dict:
                #     self.pasta_team.alignmentjobs.append(self.pasta_team.aligner.create_file_read_job(**ajd))
                # for aj in self.pasta_team.alignmentjobs:
                #     jobq.put(aj)
                self.resumable = False



            if not self.resumable:
                # re-initialize some variables from earlier
                aligner = self.pasta_aligner_job
                curr_tmp_dir_par = self.curr_tmp_dir_par
                context_str = self.context_str


                if self.pastamerge:
                    # pdb.set_trace()
                    _LOG.debug("Build PASTA merge jobs")
                    subsets_tree = self.build_subsets_tree(curr_tmp_dir_par)
                    if len(self.pasta_team.subsets.values()) == 1:
                        # can happen if there are no decompositions
                        for job in self.pasta_team.alignmentjobs:
                            jobq.put(job)
                        new_multilocus_dataset = list(self.pasta_team.subsets.values())[0].get_results()
                    else:
                        pariwise_tmp_dir_par = os.path.join(curr_tmp_dir_par, "pw")
                        pariwise_tmp_dir_par = self.pasta_team.temp_fs.create_subdir(pariwise_tmp_dir_par)
                        pmj = PASTAMergerJob(multilocus_dataset=self.multilocus_dataset,
                                             pasta_team=self.pasta_team,
                                             tree=subsets_tree,
                                             tmp_base_dir=pariwise_tmp_dir_par,
                                             reset_recursion_index=True,
                                             #delete_temps2=False,
                                             **configuration)

                        pmj.launch_alignment(context_str=context_str)

                        # Start alignment jobs
                        for job in self.pasta_team.alignmentjobs:
                            jobq.put(job)


                        new_multilocus_dataset = pmj.get_results()
                        del pmj

                    self.pasta_team.alignmentjobs = []
                    self.pasta_team.subsets = {}
                else:
                    new_multilocus_dataset = aligner.get_results()

                _LOG.debug("Alignment obtained. Preparing for tree.")
                self.pasta_aligner_job = None
                del aligner

                record_timestamp(os.path.join(curr_tmp_dir_par, 'start_treeinference_timestamp.txt'))
                # Tree inference
                if self.start_tree_search_from_current:
                    start_from = self.tree
                else:
                    start_from = None
                self.status('Step %d. Alignment obtained. Tree inference beginning...' % (self.current_iteration))
                if self.killed:
                    raise RuntimeError("PASTA Job killed")

                tbj = self.pasta_team.tree_estimator.create_job(new_multilocus_dataset,
                                                               starting_tree=start_from,
                                                               num_cpus=self.num_cpus,
                                                               context_str=context_str + " tree",
                                                               tmp_dir_par=curr_tmp_dir_par,
                                                               delete_temps=delete_iteration_temps,
                                                               pasta_products=pasta_products,
                                                               step_num=self.current_iteration,
                                                               mask_gappy_sites = self.mask_gappy_sites)
                prev_curr_align = self.curr_iter_align_tmp_filename
                prev_curr_tree = self.curr_iter_tree_tmp_filename
                self.curr_iter_align_tmp_filename = pasta_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
                self.curr_iter_tree_tmp_filename = pasta_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_TREE_TAG, allow_existing=True)

                self.tree_build_job = tbj
                jobq.put(tbj)
                new_score, new_tree_str = tbj.get_results()
                self.tree_build_job = None
                del tbj
                if self.killed:
                    raise RuntimeError("PASTA Job killed")

                record_timestamp(os.path.join(curr_tmp_dir_par, 'end_treeinference_timestamp.txt'))
                curr_timestamp = time.time()
                accept_iteration = False

                if self.score is None:
                    self.score = new_score

                if self.best_score is None or new_score > self.best_score:
                    self.store_optimum_results(new_multilocus_dataset,
                            new_tree_str,
                            new_score,
                            curr_timestamp)
                    this_iter_score_improved = True
                    accept_iteration = True

                if self._get_accept_mode(new_score=new_score, break_strategy_index=self.break_strategy_index) == AcceptMode.BLIND_MODE:
                    if self.blind_mode_is_final:
                        self.is_stuck_in_blind = True
                        if self.switch_to_blind_timestamp is None:
                            if self._blindmode_trigger:
                                _LOG.debug("Blind runmode trigger = %s" % self._blindmode_trigger)
                            self.switch_to_blind_iter = self.current_iteration
                            self.switch_to_blind_timestamp = curr_timestamp
                    accept_iteration = True

                if accept_iteration:
                    self.score = new_score
                    self.multilocus_dataset = new_multilocus_dataset
                    self.tree_str = new_tree_str
                    if this_iter_score_improved:
                        self.status('realignment accepted and score improved.')
                    else:
                        self.status('realignment accepted despite the score not improving.')
                    # we do not want to continue to try different breaking strategies for this iteration so we break
                    self.status('current score: %s, best score: %s' % (self.score, self.best_score) )
                    # break
                else:
                    self.status('realignment NOT accepted.')
                    self.curr_iter_align_tmp_filename = prev_curr_align
                    self.curr_iter_tree_tmp_filename = prev_curr_tree

                # break_strategy_index += 1

                # self.status('current score: %s, best score: %s' % (self.score, self.best_score) )

            if not this_iter_score_improved:
                self.num_iter_since_imp += 1
            self.current_iteration += 1

        if not self.resumable:
            if self._termination_trigger:
                _LOG.debug("Termination trigger = %s" % self._termination_trigger)
            record_timestamp(os.path.join(tmp_dir_par, 'end_pastaiter_timestamp.txt'))

            ### TODO: if configuration is 'return_final_iter_TreeAndAlignpair', then skip the following three lines
            if not self.return_final_tree_and_alignment:
                self.multilocus_dataset = self.best_multilocus_dataset.new_with_shared_meta()
                for locus_alignment in self.best_multilocus_dataset:
                    self.multilocus_dataset.append(copy.copy(locus_alignment))
                self.tree_str = self.best_tree_str
                self.score = self.best_score
            else:
                assert self.multilocus_dataset is not None
                assert self.tree_str is not None
                assert self.score is not None

            return (None, None)
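
The resumable variant checkpoints its pending alignment jobs to alnjoblist.txt and returns early; a later invocation is expected to read that list back and continue. Below is a minimal round-trip sketch of that checkpoint idea; the field names and the use of the csv module are illustrative (the method above writes the lines by hand).

import csv
import os

def checkpoint_jobs(path, jobs):
    # One line per pending alignment job: scratch dir, input, output.
    with open(path, 'w', newline='') as fh:
        writer = csv.writer(fh)
        for job in jobs:
            writer.writerow([job['scratch_dir'], job['seqfn'], job['alignedfn']])

def load_checkpoint(path):
    # Read the job list back so a later run can re-create and enqueue jobs.
    with open(path, newline='') as fh:
        return [{'scratch_dir': s, 'seqfn': q, 'alignedfn': a}
                for s, q, a in csv.reader(fh)]

jobs = [{'scratch_dir': '/tmp/step0', 'seqfn': 'locus0.fasta', 'alignedfn': 'locus0.aln'}]
checkpoint_jobs('alnjoblist.txt', jobs)
assert load_checkpoint('alnjoblist.txt') == jobs
os.remove('alnjoblist.txt')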
Example #8
    def launch_alignment(self, tree=None, break_strategy=None, context_str=None):
        '''Puts alignment job(s) in the queue and then returns None.

        get_results() must be called to get the alignment. Note that this call
        may not be trivial in terms of time (the tree will be decomposed, and
        lots of temporary files may be written), but it does not block until the
        alignments complete. Rather, it queues the alignment jobs so that
        multiple processors can be exploited if they are available.
        '''
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")

        if break_strategy is not None:
            self.break_strategy = break_strategy
        break_strategy = self.break_strategy
        if tree is not None:
            self.tree = tree
        self.expected_number_of_taxa = self.multilocus_dataset.get_num_taxa() # for debugging purposes
        self._reset_jobs()
        prefix = "self.multilocus_dataset.get_num_taxa = %d" % self.expected_number_of_taxa
        self.context_str = context_str
        if self.context_str is None:
            self.context_str = ''
        _LOG.debug("Comparing expected_number_of_taxa=%d and max_subproblem_size=%d\n" % (self.expected_number_of_taxa,  self.max_subproblem_size))
        if self.expected_number_of_taxa <= self.max_subproblem_size:
            _LOG.debug("%s...Calling Aligner" % prefix)
            aj_list = []
            for index, single_locus_sd in enumerate(self.multilocus_dataset):
                aj = self.pasta_team.aligner.create_job(single_locus_sd,
                                                       tmp_dir_par=self.tmp_dir_par,
                                                       delete_temps=self.delete_temps,
                                                       context_str=self.context_str + " align" + str(index))                
                aj.add_parent_tickable_job(self)
                self.add_child(aj)
                
                aj_list.append(aj)
                if self.killed:
                    raise RuntimeError("PastaAligner Job killed")
                
                self.pasta_team.alignmentjobs.append(aj)
            
            self.align_job_list = aj_list
            
            if self.skip_merge:
                for taxa in self.tree.leaf_node_names():
                    self.pasta_team.subsets[taxa] = self
            else:
                for aj in aj_list:
                    jobq.put(aj)
        else:
            _LOG.debug("%s...Recursing" % prefix)
            # create the subjobs
            self.subjob1, self.subjob2 = self.bipartition_by_tree(break_strategy)
            # store this dir so we can use it in the merger
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")

            self.subjob1.add_parent(self)
            self.subjob2.add_parent(self)
            self.add_child(self.subjob1)
            self.add_child(self.subjob2)

            self.subjob1.launch_alignment(break_strategy=break_strategy)
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            self.subjob2.launch_alignment(break_strategy=break_strategy)
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
        return