class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, rpt, force_restart=False, save_phyml=False, threads=-1):
        """Prepare the analysis workspace.

        cfg           -- validated configuration (paths, models, datatype, ...)
        rpt           -- reporter object used to write out summaries
        force_restart -- if True, delete the entire previous output folder
        save_phyml    -- if True, keep raw phyml output files after parsing
        threads       -- number of worker threads (-1 is stored unchanged here)
        """
        cfg.validate()
        self.cfg = cfg
        self.rpt = rpt
        self.threads = threads
        self.save_phyml = save_phyml
        self.results = results.AnalysisResults()
        log.info("Beginning Analysis")

        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'",
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder; subset data is kept so it can
            # be reused on restart
            if os.path.exists(self.cfg.schemes_path):
                log.info("Removing Schemes in '%s' (they will be "
                         "recalculated from existing subset data)",
                         self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # Progress counters, only used to keep the user informed
        self.subsets_analysed_set = set()
        self.subsets_analysed = 0
        self.total_subset_num = None
        self.schemes_analysed = 0
        self.total_scheme_num = None

    def analyse(self):
        """Run the full analysis, finalise and report, and return the results."""
        self.do_analysis()
        self.results.finalise()
        self.report()
        return self.results

    def report(self):
        """Write the best-scheme and all-scheme summaries via the reporter."""
        best = [
            ("Best scheme according to AIC", self.results.best_aic),
            ("Best scheme according to AICc", self.results.best_aicc),
            ("Best scheme according to BIC", self.results.best_bic),
        ]
        self.rpt.write_best_schemes(best)
        self.rpt.write_all_schemes(self.results)

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy differs from the
        current source (the user must then use force-restart).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. "
                          "You need to use the force-restart option.")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree with branch lengths.

        user_path -- optional path to a user-supplied topology; if empty or
        None a topology is estimated from the filtered alignment.
        Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = phyml.make_tree_path(self.filtered_alignment_path)
        if not os.path.exists(tree_path):
            # If we have a user tree, then use that, otherwise, create a topology
            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                phyml.dupfile(user_path, topology_path)
            else:
                topology_path = phyml.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype)

            # Now estimate branch lengths
            # NOTE(review): if datatype is neither "DNA" nor "protein",
            # tree_path silently stays pointing at a nonexistent file --
            # presumably cfg.validate() guarantees one of the two.
            if self.cfg.datatype == "DNA":
                tree_path = phyml.make_branch_lengths(
                    self.filtered_alignment_path, topology_path)
            elif self.cfg.datatype == "protein":
                tree_path = phyml.make_branch_lengths_protein(
                    self.filtered_alignment_path, topology_path)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s", self.tree_path)

    def analyse_subset(self, sub, models):
        """Analyse the subset using the models given.

        This is the core place where everything comes together.
        The results are placed into subset.result.
        """
        log.debug("About to analyse %s using models %s",
                  sub, ", ".join(list(models)))

        # Keep people informed about what's going on.
        # If we don't know the total subset number, we can usually get it
        # like this (assumes sub._cache holds one entry per subset -- TODO
        # confirm against the subset module)
        if self.total_subset_num is None:
            self.total_subset_num = len(sub._cache)
        old_num_analysed = self.subsets_analysed
        self.subsets_analysed_set.add(sub.name)
        self.subsets_analysed = len(self.subsets_analysed_set)
        if self.subsets_analysed > old_num_analysed:
            # We've just analysed a subset we haven't seen yet
            percent_done = float(self.subsets_analysed) * 100.0 / float(self.total_subset_num)
            log.info("Analysing subset %d/%d: %.2f%s done"
                     % (self.subsets_analysed, self.total_subset_num,
                        percent_done, r"%"))

        subset_cache_path = os.path.join(self.cfg.subsets_path, sub.name + '.bin')
        # We might have already saved a bunch of results, try there first
        if not sub.results:
            log.debug("Reading in cached data from the subsets file")
            sub.read_cache(subset_cache_path)

        # First, see if we've already got the results loaded. Then we can
        # shortcut all the other checks
        models_done = set(sub.results.keys())
        log.debug("These models have already been done: %s", models_done)
        models_required = set(models)
        models_to_do = models_required - models_done
        log.debug("Which leaves these models still to analyse: %s", models_to_do)

        # Empty set means we're done
        if not models_to_do:
            log.debug("All models already done, so using just the cached "
                      "results for subset %s", sub)
            sub.model_selection(self.cfg.model_selection, self.cfg.models)
            return

        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(self.alignment, sub)
        sub_path = os.path.join(self.cfg.phyml_path, sub.name + '.phy')
        # Add it into the sub, so we keep it around
        sub.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error("It looks like you have changed one or more of the"
                          "data_blocks in the configuration file, "
                          "so the new subset alignments"
                          " don't match the ones stored for this analysis."
                          "You'll need to run the program with --force-restart")
                raise AnalysisError
        else:
            # We need to write it
            sub_alignment.write(sub_path)

        # Try and read in some previous analyses
        log.debug("Checking for old results in the phyml folder")
        self.parse_results(sub, models_to_do)
        if not models_to_do:
            sub.model_selection(self.cfg.model_selection, self.cfg.models)
            return

        # What is left, we actually have to analyse...
        tasks = []

        # For efficiency, we rank the models by their difficulty - most
        # difficult first. sorted() replaces the old zip(...).sort() which
        # only worked on Python 2 (zip returns an iterator on Python 3).
        difficulty = [get_model_difficulty(m) for m in models_to_do]
        difficulty_and_m = sorted(zip(difficulty, models_to_do), reverse=True)
        sorted_difficulty, sorted_models_to_do = zip(*difficulty_and_m)
        log.debug("About to analyse these models, in this order: %s",
                  sorted_models_to_do)

        for m in sorted_models_to_do:
            tasks.append((phyml.analyse,
                          (m, sub_path, self.tree_path, self.cfg.branchlengths)))

        if self.threads == 1:
            self.run_models_concurrent(tasks)
        else:
            self.run_models_threaded(tasks)

        # Now parse the models we've just done
        self.parse_results(sub, models_to_do)

        # This should be empty NOW!
        if models_to_do:
            log.error("Failed to run models %s; not sure why",
                      ", ".join(list(models_to_do)))
            raise AnalysisError

        # Now we have analysed all models for this subset, we do model
        # selection but ONLY on the models specified in the cfg file.
        sub.model_selection(self.cfg.model_selection, self.cfg.models)

        # If we made it to here, we should write out the new summary
        self.rpt.write_subset_summary(sub)
        # We also need to update this
        sub.write_cache(subset_cache_path)

    def parse_results(self, sub, models_to_do):
        """Read in the results and parse them.

        Successfully parsed models are removed (in place) from models_to_do
        and their results attached to *sub*.
        """
        models_done = []
        # Iterate over a copy, since we remove parsed models from the set
        for m in list(models_to_do):
            stats_path, tree_path = phyml.make_output_path(sub.alignment_path, m)
            if os.path.exists(stats_path):
                # Context manager closes the handle (it used to be leaked)
                with open(stats_path, 'rb') as stats_file:
                    sub_output = stats_file.read()
                # Annotate with the parameters of the model
                try:
                    result = phyml.parse(sub_output)
                    sub.add_model_result(m, result)
                    # Remove the current model from remaining ones
                    models_to_do.remove(m)

                    # Just used for the log message below
                    models_done.append(m)

                    # Unless asked to keep them, clean up the raw phyml files
                    if not self.save_phyml:
                        os.remove(stats_path)
                        os.remove(tree_path)
                except phyml.PhymlError:
                    log.warning("Failed loading parse output from %s."
                                "Output maybe corrupted. I'll run it again.",
                                stats_path)

        if models_done:
            log.debug("Loaded analysis for %s, models %s",
                      sub, ", ".join(models_done))

    def run_models_concurrent(self, tasks):
        """Run (func, args) tasks serially in this thread."""
        for func, args in tasks:
            func(*args)

    def run_models_threaded(self, tasks):
        """Run (func, args) tasks in a thread pool and wait for completion."""
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch, models):
        """Analyse every subset in scheme *sch*, record and report the result."""
        self.schemes_analysed = self.schemes_analysed + 1
        log.info("Analysing scheme %d/%d"
                 % (self.schemes_analysed, self.total_scheme_num))
        for sub in sch:
            self.analyse_subset(sub, models)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq, self.cfg.branchlengths)
        self.results.add_scheme_result(result)

        # TODO: should put all paths into config. Then reporter should decide
        # whether to create stuff
        fname = os.path.join(self.cfg.schemes_path, sch.name + '.txt')
        # Close the summary file when done (it used to be leaked)
        with open(fname, 'w') as summary_file:
            self.rpt.write_scheme_summary(result, summary_file)

        return result
class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, force_restart, threads):
        """Prepare the analysis: folders, database, alignment and start tree.

        cfg           -- configuration object (paths come from the module-level
                         the_config; cfg is only used for alignment/tree paths)
        force_restart -- if True, wipe the whole output folder first
        threads       -- worker count; -1 means use the machine's CPU count
        """
        the_config.validate()

        # TODO: Remove -- put this all into "options"
        if threads == -1:
            threads = threadpool.get_cpu_count()
        self.threads = threads

        # TODO: Move these to the config validate and prepare
        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Make some folders for the analysis
        the_config.make_output_folders()
        the_config.database = Database(the_config)

        # Check for old analyses to see if we can use the old data
        the_config.check_for_old_config()

        # TODO: This is going to be in "Prepare"
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

        # Store the result in here
        self.results = results.AnalysisResults(the_config.model_selection)

    def process_restart(self, force_restart):
        """Clean out previous workings, wholly or partially, before a run."""
        if force_restart:
            # Remove everything
            if os.path.exists(the_config.output_path):
                log.warning("Deleting all previous workings in '%s'" %
                            the_config.output_path)
                shutil.rmtree(the_config.output_path)
        else:
            # Remove the schemes folder, and clean out the phylofiles folder
            if os.path.exists(the_config.schemes_path):
                log.debug("Removing files in '%s'" % the_config.schemes_path)
                shutil.rmtree(the_config.schemes_path)
            if os.path.exists(the_config.phylofiles_path):
                log.debug("Removing files in '%s'" % the_config.phylofiles_path)
                shutil.rmtree(the_config.phylofiles_path)

    def analyse(self):
        """Run the analysis, always closing the database, and return results."""
        try:
            self.do_analysis()
        finally:
            # TODO: Not really the right place for it?
            the_config.database.close()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy has different content
        or a different species set (force-restart is then required).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                    need to use the force-restart option.""")
                raise AnalysisError
            # Order-insensitive species comparison (Counter replaces the old
            # lambda assigned to a name)
            if collections.Counter(old_align.species) != \
                    collections.Counter(self.alignment.species):
                log.error("""Species names in alignment have changed since
                    previous run. You need to use the force-restart option.""")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file is treated as complete iff it contains a ';'.
        """
        if os.path.exists(tree_path):
            # with-statement closes the handle (it used to be leaked)
            with open(tree_path) as tree_file:
                complete = ';' in tree_file.read()
            if complete:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("""Starting tree file found but it is incomplete.
                    Re-estimating""")
                redo_tree = True
        else:
            log.info("Starting tree will be estimated from the data.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree; sets self.tree_path.

        user_path -- optional user topology; else an ML (RAxML) or quick
        topology is built depending on the_config.no_ml_tree.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(
            subset_with_everything, self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            else:
                # (was `elif no_ml_tree == False`, which could leave need_bl
                # unbound for non-bool values; an else guarantees it is set)
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" %
                    tree_path)
                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))
                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)
                # here we copy the ML tree topology so it can be used with
                # PhyML too
                # TODO: this is a hack, and it would be better to decide on a
                # universal name for the different types of tree we might have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)
                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)

    def run_task(self, model_name, sub):
        """Analyse one model for one subset; parse or fabricate the result.

        Runs the external processor, then, under the lock, attaches the
        parsed (or fabricated) result to *sub* and tries to finalise it.
        """
        # This bit should run in parallel (forking the processor)
        try:
            the_config.processor.analyse(
                model_name,
                sub.alignment_path,
                self.tree_path,
                the_config.branchlengths,
                the_config.cmdline_extras)
            fabricate = False
        except ExternalProgramError:
            if not the_config.suppress_errors:
                # In the Kmeans algorithm we suppress errors and "fabricate"
                # subsets (we assume the error is because the subset is too
                # small for analysis)
                raise
            # If it is kmeans we assume that the error is because the subset
            # is too small or unanalysable, so we fabricate it
            log.debug("New subset could not be analysed. It will be merged "
                      "at the end of the analysis")
            fabricate = True

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            if fabricate:
                sub.fabricate_model_result(the_config, model_name)
            else:
                sub.parse_model_result(the_config, model_name)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(the_config)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, (model, sub)) tuple per pending model."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread."""
        for func, args in tasks:
            log.debug("About to analyse subset %s", args[1].name)
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks in a thread pool and wait for completion."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_list_of_subsets(self, all_subsets):
        """Analyse a whole list of subsets, chunked, possibly in parallel.

        Raises AnalysisError if any subset still fails to finalise.
        """
        # Analyse bigger subsets first, for efficiency
        all_subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

        # Chunk the list into blocks of ~1000 tasks.
        # In empirical testing, this speeds things up a lot,
        # though we are not entirely sure why...
        n = 1000
        n = int(n / len(the_config.models))
        if n < 1:
            n = 1  # seems unlikely...
        log.debug("chunk size (in number of subsets) = %d", n)
        subset_chunks = [all_subsets[i:i + n]
                         for i in xrange(0, len(all_subsets), n)]

        for subsets in subset_chunks:
            # Prepare the list of tasks
            tasks = []
            for sub in subsets:
                if sub.is_done:
                    pass
                elif sub.is_prepared:
                    self.add_tasks_for_sub(tasks, sub)
                else:
                    sub.prepare(the_config, self.alignment)
                    self.add_tasks_for_sub(tasks, sub)

            if tasks:
                # Now do the analysis
                if self.threads == 1:
                    self.run_concurrent(tasks)
                else:
                    self.run_threaded(tasks)

        # Now see if we're done
        for sub in all_subsets:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(the_config):
                log.error("Failed to run models %s; not sure why" %
                          ", ".join(list(sub.models_not_done)))
                raise AnalysisError

    def analyse_scheme(self, sch):
        """Analyse the not-yet-done subsets of *sch* and record a SchemeResult."""
        # Progress
        the_config.progress.next_scheme()

        # Analyse the subsets in the scheme that aren't done.
        # NB for most schemes we will have all subsets done, so this saves time
        not_done = [sub for sub in sch if not sub.is_done]

        if not_done:
            self.analyse_list_of_subsets(not_done)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     the_config.branchlengths,
                                     the_config.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, force_restart=False, threads=-1):
        """Prepare the analysis workspace.

        cfg           -- validated configuration (paths, processor, models)
        force_restart -- if True, wipe the whole output folder first
        threads       -- worker thread count (-1 is stored unchanged here)
        """
        cfg.validate()
        self.cfg = cfg
        self.threads = threads
        self.results = results.AnalysisResults(self.cfg.model_selection)

        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

    def process_restart(self, force_restart):
        """Clean out previous workings, wholly or partially, before a run."""
        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'",
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder; subset data is reused
            if os.path.exists(self.cfg.schemes_path):
                log.info(
                    "Removing Schemes in '%s' (they will be recalculated from existing subset data)",
                    self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

    def analyse(self):
        """Run the full analysis and return the collected results."""
        self.do_analysis()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy differs from the
        current source (force-restart is then required).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error(
                    "Alignment file has changed since previous run. You need to use the force-restart option."
                )
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file is treated as complete iff it contains a ';'.
        """
        if os.path.exists(tree_path):
            # with-statement closes the handle (it used to be leaked)
            with open(tree_path) as tree_file:
                complete = ';' in tree_file.read()
            if complete:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info(
                    "Starting tree file found but incomplete. Re-estimating")
                redo_tree = True
        else:
            log.info("No starting tree file found.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree with branch lengths.

        user_path -- optional user topology; else one is estimated from the
        filtered alignment. Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = self.cfg.processor.make_tree_path(
            self.filtered_alignment_path)

        # (`== True` comparison dropped: need_new_tree always returns a bool)
        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype,
                    self.cfg.cmdline_extras)

            # Now estimate branch lengths
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s",
                 self.tree_path)

    def run_task(self, m, sub):
        """Analyse model *m* for subset *sub*, then parse/finalise under lock."""
        # This bit should run in parallel (forking the processor)
        self.cfg.processor.analyse(m, sub.alignment_path, self.tree_path,
                                   self.cfg.branchlengths,
                                   self.cfg.cmdline_extras)

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            sub.parse_model_result(self.cfg, m)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(self.cfg)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, (model, sub)) tuple per pending model."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread."""
        for func, args in tasks:
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks in a thread pool and wait for completion."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch):
        """Analyse every subset of scheme *sch* and record a SchemeResult."""
        # Progress
        self.cfg.progress.next_scheme()

        # Prepare by reading everything in first
        tasks = []
        for sub in sch:
            sub.prepare(self.cfg, self.alignment)
            self.add_tasks_for_sub(tasks, sub)

        # Now do the analysis
        if self.threads == 1:
            self.run_concurrent(tasks)
        else:
            self.run_threaded(tasks)

        # Now see if we're done
        for sub in sch:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(self.cfg):
                # NOTE(review): this reads sub.models_to_do while the rest of
                # this class uses models_to_process -- confirm the attribute
                # exists on the subset type
                log.error("Failed to run models %s; not sure why",
                          ", ".join(list(sub.models_to_do)))
                raise AnalysisError

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     self.cfg.branchlengths,
                                     self.cfg.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, force_restart=False, threads=-1):
        """Set up folders, alignment, starting tree and the thread lock.

        cfg           -- validated configuration (paths, processor, models)
        force_restart -- if True, delete all previous workings first
        threads       -- worker thread count (-1 is stored unchanged here)
        """
        cfg.validate()
        self.cfg = cfg
        self.threads = threads
        self.results = results.AnalysisResults(self.cfg.model_selection)

        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

    def process_restart(self, force_restart):
        """Remove old output: everything, or just the schemes folder."""
        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'",
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder; subset data is kept and reused
            if os.path.exists(self.cfg.schemes_path):
                log.info("Removing Schemes in '%s' (they will be recalculated from existing subset data)",
                         self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

    def analyse(self):
        """Run the full analysis and return the collected results."""
        self.do_analysis()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy differs from the
        current source (force-restart is then required).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. You need to use the force-restart option.")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file is treated as complete iff it contains a ';'.
        """
        if os.path.exists(tree_path):
            # with-statement closes the handle (it used to be leaked)
            with open(tree_path) as tree_file:
                complete = ';' in tree_file.read()
            if complete:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("Starting tree file found but incomplete. Re-estimating")
                redo_tree = True
        else:
            log.info("No starting tree file found.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree with branch lengths.

        user_path -- optional user topology; else one is estimated from the
        filtered alignment. Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = self.cfg.processor.make_tree_path(
            self.filtered_alignment_path)

        # (redundant `== True` dropped: need_new_tree always returns a bool)
        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype,
                    self.cfg.cmdline_extras)

            # Now estimate branch lengths
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s",
                 self.tree_path)

    def run_task(self, m, sub):
        """Analyse model *m* for subset *sub*, then parse/finalise under lock."""
        # This bit should run in parallel (forking the processor)
        self.cfg.processor.analyse(
            m,
            sub.alignment_path,
            self.tree_path,
            self.cfg.branchlengths,
            self.cfg.cmdline_extras
        )

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            sub.parse_model_result(self.cfg, m)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(self.cfg)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, (model, sub)) tuple per pending model."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread."""
        for func, args in tasks:
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks in a thread pool and wait for completion."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch):
        """Analyse every subset of scheme *sch* and record a SchemeResult."""
        # Progress
        self.cfg.progress.next_scheme()

        # Prepare by reading everything in first
        tasks = []
        for sub in sch:
            sub.prepare(self.cfg, self.alignment)
            self.add_tasks_for_sub(tasks, sub)

        # Now do the analysis
        if self.threads == 1:
            self.run_concurrent(tasks)
        else:
            self.run_threaded(tasks)

        # Now see if we're done
        for sub in sch:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(self.cfg):
                # NOTE(review): reads sub.models_to_do while the rest of this
                # class uses models_to_process -- confirm the attribute exists
                log.error("Failed to run models %s; not sure why",
                          ", ".join(list(sub.models_to_do)))
                raise AnalysisError

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     self.cfg.branchlengths,
                                     self.cfg.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
class Analysis(object):
    """Performs the analysis and collects the results.

    Drives a whole PartitionFinder run: cleans/prepares the output folders
    and results database, reads the source alignment, obtains a starting
    tree (user-supplied, quick topology, or RAxML ML tree), then analyses
    schemes by farming out per-subset model fitting, optionally across
    threads.

    NOTE(review): the class reads the module-level ``the_config`` global as
    well as the ``cfg`` argument passed to ``__init__`` -- mid-refactor,
    both are assumed to refer to the same configuration object (confirm
    against the caller).
    """

    def __init__(self, cfg, force_restart, threads):
        the_config.validate()

        # TODO: Remove -- put this all into "options"
        # threads == -1 means "use all available cores".
        if threads == -1:
            threads = threadpool.get_cpu_count()
        self.threads = threads

        # TODO: Move these to the config validate and prepare
        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Make some folders for the analysis
        the_config.make_output_folders()
        the_config.database = Database(the_config)

        # Check for old analyses to see if we can use the old data
        the_config.check_for_old_config()

        # TODO: This is going to be in "Prepare"
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

        # Store the result in here
        self.results = results.AnalysisResults(the_config.model_selection)

    def process_restart(self, force_restart):
        """Clean out previous workings.

        With force_restart, everything under the output path is deleted;
        otherwise only the schemes and phylofiles folders are removed so
        existing subset results can be reused.
        """
        if force_restart:
            # Remove everything
            if os.path.exists(the_config.output_path):
                log.warning("Deleting all previous workings in '%s'" %
                            the_config.output_path)
                shutil.rmtree(the_config.output_path)
        else:
            # Remove the schemes folder, and clean out the phylofiles folder
            if os.path.exists(the_config.schemes_path):
                log.debug("Removing files in '%s'" % the_config.schemes_path)
                shutil.rmtree(the_config.schemes_path)
            if os.path.exists(the_config.phylofiles_path):
                log.debug("Removing files in '%s'" %
                          the_config.phylofiles_path)
                shutil.rmtree(the_config.phylofiles_path)

    def analyse(self):
        """Run the analysis, always closing the database, and return results."""
        try:
            self.do_analysis()
        finally:
            # TODO: Not really the right place for it?
            the_config.database.close()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        If a copy from a previous run exists it must match the current
        alignment (content and species multiset), otherwise AnalysisError
        is raised and the user is told to force-restart.
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same as the one from the previous run
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run.
                You need to use the force-restart option.""")
                raise AnalysisError

            # Multiset comparison: same species names, order-insensitive.
            # (Replaces a lambda-assigned helper -- PEP 8 E731.)
            if (collections.Counter(old_align.species) !=
                    collections.Counter(self.alignment.species)):
                log.error("""Species names in alignment have changed since
                previous run. You need to use the force-restart option.""")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file without a terminating semicolon is treated as an
        incomplete leftover from an interrupted run.
        """
        if os.path.exists(tree_path):
            # FIX: close the file handle (was open(...).read(), which
            # leaked the descriptor until garbage collection).
            with open(tree_path) as tree_file:
                tree_data = tree_file.read()
            if ';' in tree_data:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("""Starting tree file found but it is incomplete.
                Re-estimating""")
                redo_tree = True
        else:
            log.info("Starting tree will be estimated from the data.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Create (or reuse) the starting tree with branch lengths.

        Writes a filtered alignment containing only the columns covered by
        the user subsets, then either reuses an existing complete tree or
        builds one from the user topology / a quick topology / an ML tree,
        estimating branch lengths where needed. Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those
        # columns that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(
            subset_with_everything, self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a
            # topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            else:
                # FIX: was `elif the_config.no_ml_tree == False:` -- a
                # plain else guarantees topology_path/need_bl are always
                # bound (no NameError for an unexpected flag value).
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML"
                    % tree_path)
                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))
                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)
                # here we copy the ML tree topology so it can be used with
                # PhyML too
                # TODO: this is a hack, and it would be better to decide on
                # a universal name for the different types of tree we might
                # have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)
                # The ML tree already has branch lengths.
                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)

    def run_task(self, model_name, sub):
        """Analyse one (model, subset) pair; parse/fabricate the result.

        Runs the external processor (this part may execute in parallel),
        then under the shared lock records the result on the subset.
        """
        # This bit should run in parallel (forking the processor)
        try:
            the_config.processor.analyse(
                model_name,
                sub.alignment_path,
                self.tree_path,
                the_config.branchlengths,
                the_config.cmdline_extras
            )
            fabricate = False
        except ExternalProgramError:
            if not the_config.suppress_errors:
                # In the Kmeans algorithm we suppress errors and "fabricate"
                # subsets (we assume the error is because the subset is too
                # small for analysis)
                raise
            # If it is kmeans we assume that the error is because the subset
            # is too small or unanalysable, so we fabricate it
            log.debug("New subset could not be analysed. It will be merged "
                      "at the end of the analysis")
            fabricate = True

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            if fabricate:
                sub.fabricate_model_result(the_config, model_name)
            else:
                sub.parse_model_result(the_config, model_name)

            # Try finalising, then the result will get written out earlier...
            sub.finalise(the_config)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, args) tuple per unprocessed model of sub."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread (single-threaded mode)."""
        for func, args in tasks:
            log.debug("About to analyse subset %s", args[1].name)
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks on a thread pool of self.threads workers."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_list_of_subsets(self, all_subsets):
        """Analyse a list of subsets, potentially in parallel.

        Bigger subsets are analysed first for efficiency, and work is
        submitted in chunks of roughly 1000 tasks.

        Raises AnalysisError if any subset still has unfinished models
        afterwards.
        """
        # analyse bigger subsets first, for efficiency
        all_subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

        # chunk the list into blocks of ~1000 tasks
        # in empirical testing, this speeds things up lot
        # though we are not entirely sure why...
        n = 1000
        n = int(n / len(the_config.models))
        if n < 1:
            n = 1  # seems unlikely...
        log.debug("chunk size (in number of subsets) = %d", n)
        subset_chunks = [all_subsets[i:i + n]
                         for i in xrange(0, len(all_subsets), n)]

        for subsets in subset_chunks:
            # prepare the list of tasks, skipping already-done subsets
            tasks = []
            for sub in subsets:
                if sub.is_done:
                    continue
                if not sub.is_prepared:
                    sub.prepare(the_config, self.alignment)
                self.add_tasks_for_sub(tasks, sub)

            if tasks:
                # Now do the analysis
                if self.threads == 1:
                    self.run_concurrent(tasks)
                else:
                    self.run_threaded(tasks)

        # Now see if we're done
        for sub in all_subsets:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(the_config):
                log.error("Failed to run models %s; not sure why" %
                          ", ".join(list(sub.models_not_done)))
                raise AnalysisError

    def analyse_scheme(self, sch):
        """Analyse one scheme and record/return its SchemeResult."""
        # Progress
        the_config.progress.next_scheme()

        # analyse the subsets in the scheme that aren't done
        # NB for most schemes we will have all subsets done, so this saves
        # time
        not_done = [sub for sub in sch if not sub.is_done]

        if not_done:
            self.analyse_list_of_subsets(not_done)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     the_config.branchlengths,
                                     the_config.model_selection)
        self.results.add_scheme_result(sch, result)

        return result