Example #1
0
    def _producer(self, replicated_num):
        """Create one bootstrap replicate and infer its tree with FastTree.

        The replicate is skipped when either its resampled alignment or
        its FastTree output file is already present and non-empty.
        Otherwise the alignment is resampled and FastTree is run on it.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; used to name the replicate's files.

        Returns
        -------
        bool
          Always True, whether the replicate was skipped or computed.
        """

        rep_tag = str(replicated_num)

        def replicate_path(stem, ext):
            # every file of a replicate lives in the replicate directory
            return os.path.join(self.replicate_dir, stem + rep_tag + ext)

        output_msa = replicate_path('bootstrap_msa.r_', '.fna')
        output_tree = replicate_path('bootstrap_tree.r_', '.tree')
        fast_tree_output = replicate_path('bootstrap_fasttree.r_', '.out')

        # NOTE(review): a non-empty replicate alignment is treated as a
        # finished replicate although the tree file itself is never
        # checked -- confirm this is what resumed runs should do.
        for previous in (output_msa, fast_tree_output):
            if os.path.exists(previous) and os.path.getsize(previous) > 0:
                self.logger.warning(
                    'Skipping {} as it already exists.'.format(previous))
                return True

        bootstrap_alignment(self.msa, output_msa, frac=self.frac)

        fast_tree = FastTree(multithreaded=False)
        fast_tree.run(output_msa, self.base_type, self.model, self.gamma,
                      output_tree, fast_tree_output)

        return True
Example #2
0
    def _producer(self, replicated_num):
        """Bootstrap the alignment once and infer a tree from the replicate.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; used to name the replicate's files.

        Returns
        -------
        bool
          Always True.
        """

        rep_tag = str(replicated_num)

        # resample the alignment for this replicate
        output_msa = os.path.join(self.replicate_dir,
                                  'bootstrap_msa.r_' + rep_tag + '.fna')
        bootstrap_alignment(self.msa, output_msa, frac=self.frac)

        # infer the replicate tree with single-threaded FastTree
        fast_tree = FastTree(multithreaded=False)
        output_tree = os.path.join(self.replicate_dir,
                                   'bootstrap_tree.r_' + rep_tag + '.tree')
        fast_tree_output = os.path.join(
            self.replicate_dir,
            'bootstrap_fasttree.r_' + rep_tag + '.out')
        fast_tree.run(output_msa, self.base_type, self.model, output_tree,
                      fast_tree_output)

        return True
    def _producer(self, replicated_num):
        """Jackknife the marker alignment once and infer a protein tree.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; used to name the replicate's files.

        Returns
        -------
        bool
          Always True.
        """

        rep_tag = str(replicated_num)

        # write the jackknifed alignment for this replicate
        output_msa = os.path.join(self.replicate_dir,
                                  'jk_markers.msa.' + rep_tag + '.faa')
        self.jackknife_alignment(self.msa, self.perc_markers_to_keep,
                                 self.marker_lengths, output_msa)

        # infer the replicate tree with single-threaded FastTree
        fast_tree = FastTree(multithreaded=False)
        output_tree = os.path.join(self.replicate_dir,
                                   'jk_markers.tree.' + rep_tag + '.tre')
        fast_tree_output = os.path.join(
            self.replicate_dir,
            'jk_markers.fasttree.' + rep_tag + '.out')
        fast_tree.run(output_msa, 'prot', self.model, output_tree,
                      fast_tree_output)

        return True
Example #4
0
    def infer(self, options):
        """Infer a protein tree from an MSA with FastTree.

        Parameters
        ----------
        options : object
          Parsed options; the attributes read here are msa_file, out_dir,
          cpus, prot_model, prefix and suffix.
        """

        self.logger.warning("Tree inference is still under development!")

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # the multithreaded binary is only required when using >1 CPU
        use_multithreading = options.cpus > 1
        if use_multithreading:
            required_binary = 'FastTreeMP'
        else:
            required_binary = 'FastTree'
        check_dependencies([required_binary])

        self.logger.info('Inferring tree with FastTree using %s+GAMMA.' %
                         options.prot_model)
        fasttree = FastTree(multithreaded=use_multithreading)

        out_dir = options.out_dir
        tree_unrooted_output = os.path.join(
            out_dir, options.prefix + options.suffix + '.unrooted.tree')
        tree_log = os.path.join(out_dir, options.prefix + '.tree.log')
        tree_output_log = os.path.join(out_dir, 'fasttree.log')

        fasttree.run(options.msa_file, 'prot', options.prot_model,
                     tree_unrooted_output, tree_log, tree_output_log)

        self.logger.info('Done.')
Example #5
0
    def infer_gene_trees(self, msa_dir, output_dir, extension):
        """Infer gene trees.

        Each alignment in msa_dir with the given extension is first
        rewritten in place with the trailing '*' removed from its sequence
        lines. Trees are then inferred with FastTree ('prot', 'wag'), and
        for every tree a copy whose leaf labels contain only the genome id
        is written as '<prefix>.genome_ids.tree'.

        Parameters
        ----------
        msa_dir : str
            Directory containing multiple sequence alignment of marker genes.
        output_dir : str
            Directory to store gene trees.
        extension : str
            Extension of multiple sequence alignment files.
        """

        msa_files = []
        for f in os.listdir(msa_dir):
            if not f.endswith(extension):
                continue

            msa_file = os.path.join(msa_dir, f)
            msa_files.append(msa_file)

            with open(msa_file) as fin:
                data = fin.readlines()

            # rewrite the alignment in place without trailing stars
            with open(msa_file, 'w') as fout:
                for line in data:
                    if not line.startswith('>'):
                        # Lines read with readlines() still end with their
                        # line terminator, so the star has to be looked
                        # for in front of it rather than at line[-1].
                        seq = line.rstrip('\r\n')
                        if seq.endswith('*'):
                            line = seq[:-1] + line[len(seq):]
                    fout.write(line)

        fasttree = FastTree(multithreaded=False)
        fasttree.parallel_run(msa_files, 'prot', 'wag', output_dir, self.cpus)

        # create gene tree without gene ids for visualization in ARB
        for msa_file in msa_files:
            tree_filename = ntpath.basename(msa_file)
            tree_prefix = tree_filename[0:tree_filename.find('.')]

            if tree_prefix.startswith('PF'):
                # patch up output file for Pfam trees: the prefix keeps the
                # first two dot-separated fields of the file name
                old_tree_prefix = tree_prefix
                tree_prefix = '.'.join(tree_filename.split('.')[0:2])
                shutil.move(
                    os.path.join(output_dir, old_tree_prefix + '.tree'),
                    os.path.join(output_dir, tree_prefix + '.tree'))

            gene_tree_file = os.path.join(output_dir, tree_prefix + '.tree')
            gene_tree = dendropy.Tree.get_from_path(
                gene_tree_file,
                schema='newick',
                rooting='force-unrooted',
                preserve_underscores=True)

            # rename nodes to contain only genome id
            for node in gene_tree.leaf_nodes():
                genome_id = node.taxon.label.split(
                    DefaultValues.SEQ_CONCAT_CHAR)[0]
                node.taxon.label = genome_id

            output_tree_file = os.path.join(
                output_dir, tree_prefix + '.genome_ids.tree')
            gene_tree.write_to_path(output_tree_file,
                                    schema='newick',
                                    suppress_rooting=True,
                                    unquoted_underscores=True)
Example #6
0
    def run(self, input_tree, msa_file, tree_program, prot_model,
            num_replicates, output_dir):
        """Calculate bootstraps.

        Calculate support for tree using the non-parametric
        bootstrap method. Nothing is done when tree_program is neither
        'fasttree' nor 'raxml'.

        Parameters
        ----------
        input_tree : str
            Tree requiring bootstrap support values.
        msa_file : str
            Multiple sequence alignment used to infer input tree (fasta format).
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG'].
        num_replicates : int
            Number of bootstrap replicates to perform.
        output_dir : str
            Output location handed to the tree program's bootstrap
            routine (presumably the directory receiving the results).
        """

        if tree_program == 'fasttree':
            message = 'Calculating bootstraps with FastTree under %s+GAMMA.'
            self.logger.info(message % prot_model)

            # replicates are run single-threaded; self.cpus is forwarded
            # to the bootstrap routine itself
            FastTree(multithreaded=False).bootstrap(
                input_tree, msa_file, 'prot', prot_model, num_replicates,
                output_dir, self.cpus)
            return

        if tree_program == 'raxml':
            message = 'Calculating bootstraps with RAxML under PROTGAMMA%s.'
            self.logger.info(message % prot_model)

            RAxML(cpus=1).bootstrap(
                input_tree, msa_file, prot_model, num_replicates,
                output_dir, self.cpus)
Example #7
0
    def run(self, genome_id_file, marker_id_file, model, output_dir):
        """Infer phylogenetic tree from concatenated marker genes.

        Marker HMMs are gathered, marker genes are aligned for the
        requested genomes, the alignments are concatenated, and a tree is
        inferred from the concatenated alignment with multithreaded
        FastTree. A short summary is written to 'infer_workflow.log' in
        the output directory.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes within the ingroup
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out,
                                  output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True,
                          output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(
            output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes,
                                      output_alignment_dir,
                                      concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir,
                                             'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model,
                      concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; the context manager closes the file
        # even if one of the writes fails
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())
Example #8
0
    def run(self, msa_file, tree_program, prot_model, skip_rooting,
            output_dir):
        """Infer tree.

        Parameters
        ----------
        msa_file : str
          Multiple sequence alignment in fasta format.
        tree_program : str
          Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
          Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : bool
          If True, the unrooted tree is returned as is; otherwise the tree
          is rewritten as '<prefix>.rooted.tree'.
        output_dir : str
          Directory to store results.

        Returns
        -------
        str
          Path to the final tree (rooted unless skip_rooting is set).

        Raises
        ------
        SystemExit
          If the MSA holds two or fewer sequences, or tree_program is not
          one of the supported programs.
        """

        num_seqs = sum(1 for _ in seq_io.read_seq(msa_file))
        if num_seqs <= 2:
            self.logger.error(
                'Insufficient number of sequences in MSA to infer tree.')
            raise SystemExit('Tree inference failed.')

        # output files are named after the MSA file without its extension
        output_file = ntpath.basename(msa_file)
        prefix = output_file[0:output_file.rfind('.')]

        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                prefix + '.unrooted.tree')
            tree_log = os.path.join(output_dir, prefix + '.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            # NOTE(review): the command is built by string interpolation and
            # the output name relies on '.faa' occurring in msa_file; paths
            # with spaces or without '.faa' look unsupported -- confirm.
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)
        else:
            # fail explicitly instead of hitting an unbound variable below
            self.logger.error('Unrecognized tree program: %s' % tree_program)
            raise SystemExit('Tree inference failed.')

        if skip_rooting:
            return tree_unrooted_output

        # The tree is loaded unconditionally so it is always available for
        # writing; midpoint rooting is only applied with more than two
        # sequences.
        tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        seqs = seq_io.read(msa_file)
        if len(seqs) > 2:
            self.logger.info('Rooting tree at midpoint.')
            tree.reroot_at_midpoint(update_bipartitions=False)

        tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
        tree.write_to_path(tree_output,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        return tree_output
Example #9
0
    def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program,
            prot_model, split_chars, output_dir):
        """Infer concatenated gene tree.

        The per-gene alignments are concatenated for all taxa passing the
        filters and written to disk with a matching taxonomy file. A tree
        is inferred, midpoint rooted and decorated with tax2tree, and an
        ARB metadata file is created.

        Parameters
        ----------
        gene_dirs : list
            GeneTreeTk output directories with information for individual genes.
        min_per_gene : float
            Minimum percentage of genes required to retain taxa.
        min_per_bps : float
            Minimum percentage of base pairs required to retain taxa.
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        split_chars : object
            Passed unchanged to self._split_ids, which splits a sequence
            identifier into its taxon id and gene id.
        output_dir : str
            Directory to store results.
        """

        # read MSA files
        concat = defaultdict(lambda: defaultdict(list))
        msa_length = 0
        gene_lengths = {}
        for gene_dir in gene_dirs:
            homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

            # Alignment length of this gene, taken from its last sequence.
            # Tracked explicitly so that an empty alignment contributes a
            # length of 0 instead of reusing the previous gene's sequence
            # or failing on an undefined loop variable.
            gene_length = 0
            for seq_id, seq in seq_io.read_seq(homologs):
                taxon_id, _ = self._split_ids(seq_id, split_chars)
                if not taxon_id:
                    self.logger.error('Failed to split identifier: %s' %
                                      seq_id)
                    sys.exit(-1)

                concat[taxon_id][gene_dir].append(seq)
                gene_length = len(seq)

            msa_length += gene_length
            gene_lengths[gene_dir] = gene_length

        # filter taxon
        mc_filter = set()
        min_per_gene_filter = set()
        min_per_bps_filter = set()
        max_missing_genes = len(gene_dirs) * (1.0 -
                                              float(min_per_gene) / 100.0)
        min_taxon_msa_len = msa_length * float(min_per_bps) / 100.0
        for taxon_id in concat:
            taxon_genes = concat[taxon_id]

            # check if multiple copy
            missing = 0
            taxon_msa_len = 0
            for gene_id in gene_dirs:
                if gene_id not in taxon_genes:
                    missing += 1
                    continue

                if len(taxon_genes[gene_id]) > 1:
                    mc_filter.add(taxon_id)
                    break

                taxon_msa_len += len(taxon_genes[gene_id][0])

            if taxon_id not in mc_filter:
                if missing > max_missing_genes:
                    min_per_gene_filter.add(taxon_id)
                elif taxon_msa_len < min_taxon_msa_len:
                    min_per_bps_filter.add(taxon_id)

        min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

        filtered_taxa = mc_filter.union(min_per_gene_filter).union(
            min_per_bps_filter)
        remaining_taxa = set(concat) - filtered_taxa
        self.logger.info('No. genes: %d' % len(gene_dirs))
        self.logger.info('No. taxa across all genes: %d' % len(concat))
        self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
        self.logger.info('  Due to multi-copy genes: %d' % len(mc_filter))
        self.logger.info('  Due to having <%d of the genes: %d' %
                         (min_req_genes, len(min_per_gene_filter)))
        self.logger.info('  Due to an insufficient number of base pairs: %d' %
                         len(min_per_bps_filter))
        self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
        self.logger.info('Length of concatenated MSA: %d' % msa_length)

        # create the multiple sequences alignment; genes missing from a
        # taxon are padded with gaps of that gene's alignment length
        msa_file = os.path.join(output_dir, 'concatenated.faa')
        with open(msa_file, 'w') as fout:
            for taxon_id in remaining_taxa:
                taxon_genes = concat[taxon_id]
                msa = ''.join(
                    taxon_genes[gene_id][0] if gene_id in taxon_genes
                    else '-' * gene_lengths[gene_id]
                    for gene_id in gene_dirs)

                fout.write('>%s\n' % taxon_id)
                fout.write('%s\n' % msa)

        # read all taxonomy files
        # (assumes taxonomy is the same for taxa across all genes)
        taxonomy = {}
        for gene_dir in gene_dirs:
            taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
            t = Taxonomy().read(taxonomy_file)
            # items() is available on Python 2 and 3 mappings alike
            for label, taxa_str in t.items():
                taxon_id, _ = self._split_ids(label, split_chars)
                taxonomy[taxon_id] = taxa_str

        # create taxonomy file for retained taxa
        self.logger.info('Creating taxonomy file for retained taxa.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        with open(output_taxonomy_file, 'w') as fout:
            for taxon_id in remaining_taxa:
                # query genomes will generally be missing
                if taxon_id in taxonomy:
                    fout.write('%s\t%s\n' %
                               (taxon_id, ';'.join(taxonomy[taxon_id])))

        # infer tree
        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                'concatenated.unrooted.tree')
            tree_log = os.path.join(output_dir, 'concatenated.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)
        else:
            # fail explicitly instead of hitting an unbound variable below
            self.logger.error('Unrecognized tree program: %s' % tree_program)
            sys.exit(-1)

        # root tree at midpoint
        self.logger.info('Rooting tree at midpoint.')
        tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        if len(remaining_taxa) > 2:
            tree.reroot_at_midpoint(update_bipartitions=False)
        tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
        tree.write_to_path(tree_output,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # create tax2tree consensus map and decorate tree
        t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # setup metadata for ARB file; the version is read from the VERSION
        # file located next to this source file
        src_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(src_dir, 'VERSION')) as version_file:
            version = version_file.read().strip()

        metadata = {}
        metadata['genetreetk_version'] = version
        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(msa_file, taxonomy, metadata,
                                 arb_metadata_file)