Example #1
0
    def remove_outliers(self, genome_file, outlier_file, out_genome):
        """Remove sequences specified as outliers.

        Any scaffold listed in the first column of the outlier
        file is removed from the specified genome.

        Parameters
        ----------
        genome_file : str
            Fasta file of binned scaffolds.
        outlier_file : str
            File specifying outlying scaffolds.
        out_genome : str
            Name of output genome.
        """

        genome_seqs = seq_io.read(genome_file)

        # drop every scaffold named in the outlier file
        with open(outlier_file) as f:
            f.readline()  # skip header

            for outlier_line in f:
                outlier_id = outlier_line.split('\t')[0]
                genome_seqs.pop(outlier_id, None)

        # write the remaining sequences as the modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #2
0
    def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # identify scaffolds marked as compatible with this genome;
        # the GC, tetranucleotide, and coverage columns parsed by the
        # original implementation were never used and are not read here
        scaffold_ids = set()
        with open(compatible_file) as f:
            f.readline()  # skip header

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                if bin_id == cur_bin_id:
                    scaffold_ids.add(scaffold_id)

        # add compatible sequences meeting the length criterion to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in scaffold_ids and len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #3
0
    def create_records(self, metadata_file, msa_file, genome_list,
                       output_file):
        """Create ARB records from GTDB metadata.

        Parameters
        ----------
        metadata_file : str
            CSV file with metadata; first column is the genome id.
        msa_file : str
            Optional fasta file with an aligned sequence per genome.
        genome_list : str
            Optional file listing the subset of genome ids to keep.
        output_file : str
            Name of output file containing the ARB records.
        """

        seqs = {}
        if msa_file:
            seqs = seq_io.read(msa_file)

        genomes_to_keep = set()
        if genome_list:
            for line in open(genome_list):
                genomes_to_keep.add(line.strip())

        fout = open(output_file, 'w')

        header = True
        # csv.reader requires a text-mode file opened with newline='';
        # the original binary 'rb' mode fails under Python 3
        for row in csv.reader(open(metadata_file, newline='')):
            if header:
                fields = row[1:]
                header = False
            else:
                genome_id = row[0]
                values = row[1:]
                aligned_seq = seqs.get(genome_id, '')

                if not genomes_to_keep or genome_id in genomes_to_keep:
                    self._record(fout, genome_id, fields, values, aligned_seq)

        fout.close()
Example #4
0
    def add_compatible_unique(self, scaffold_file, genome_file,
                              compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        Only sequences specified exactly once in the
        compatibility file are added.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine scaffolds compatible with genome; duplicates are
        # tracked while reading so the quadratic list.count() scan of
        # the original is avoided (dict.items also replaces the
        # Python 2-only iteritems)
        bin_ids = {}
        duplicate_ids = set()
        with open(compatible_file) as f:
            f.readline()  # skip header

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                if scaffold_id in bin_ids:
                    duplicate_ids.add(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_id not in duplicate_ids and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        self.logger.info('Identified %d compatible scaffolds.' %
                         len(compatible_scaffolds))

        # add compatible sequences meeting the length criterion to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds and len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

        self.logger.info('Added %d scaffolds meeting length criterion.' %
                         added_seqs)

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
    def generate(self, genome_file, contig_break):
        """Derive metadata across nucleotide sequences.

        Parameters
        ----------
        genome_file : str
            Name of fasta file containing nucleotide sequences.
        contig_break : int
            Minimum number of ambiguous bases for defining contigs.

        Returns
        -------
        dict : d[metadata_field] -> value
            Map of metadata fields to their respective values.
        dict : d[metadata_field] -> description
            Description of each metadata field.
        """

        scaffolds = seq_io.read(genome_file)

        nuc_stats = {}
        nuc_desc = {}

        def _set(field, value, description):
            # record a metadata value alongside its description
            nuc_stats[field] = value
            nuc_desc[field] = description

        # scaffold-level statistics
        _set('scaffold_count', len(scaffolds),
             "Number of scaffolds in genome.")
        _set('gc_count', genome_tk.gc_count(scaffolds),
             "Number of G or C bases in genome.")
        _set('gc_percentage', genome_tk.gc(scaffolds) * 100.0,
             "GC content of genome.")
        _set('genome_size', sum(len(seq) for seq in scaffolds.values()),
             "Total base pairs in genome including nucleotide bases, ambiguous bases, and gaps.")
        _set('n50_scaffolds', seq_tk.N50(scaffolds),
             "Scaffold length at which 50% of total bases in assembly are in scaffolds of that length or greater.")
        _set('l50_scaffolds', seq_tk.L50(scaffolds, nuc_stats['n50_scaffolds']),
             "Number of scaffolds longer than, or equal to, the scaffold N50 length.")
        _set('mean_scaffold_length', seq_tk.mean_length(scaffolds),
             "Mean length of scaffolds in base pairs.")
        _set('longest_scaffold', seq_tk.max_length(scaffolds),
             "Number of bases in longest scaffold.")

        # contig-level statistics; contigs are delimited by runs of at
        # least contig_break ambiguous bases
        contigs = seq_tk.identify_contigs(scaffolds, 'N' * contig_break)
        _set('contig_count', len(contigs),
             "Number of contigs in genome.")
        _set('ambiguous_bases', genome_tk.ambiguous_nucleotides(contigs),
             "Number of ambiguous bases in contigs.")
        _set('total_gap_length',
             genome_tk.ambiguous_nucleotides(scaffolds) - nuc_stats['ambiguous_bases'],
             "Number of ambiguous bases comprising gaps in scaffolds.")
        _set('n50_contigs', seq_tk.N50(contigs),
             "Contig length at which 50% of total bases in assembly are in contigs of that length or greater.")
        _set('l50_contigs', seq_tk.L50(contigs, nuc_stats['n50_contigs']),
             "Number of contigs longer than, or equal to, the contig N50 length.")
        _set('mean_contig_length', seq_tk.mean_length(contigs),
             "Mean length of contigs in base pairs.")
        _set('longest_contig', seq_tk.max_length(contigs),
             "Number of bases in longest contig.")

        return nuc_stats, nuc_desc
    def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep,
            num_replicates, model, output_dir):
        """Jackknife taxa.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        outgroup_file : str
          File indicating labels of outgroup taxa.
        perc_taxa_to_keep : float
          Percentage of taxa to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        output_dir : str
          Output directory for replicate trees.
        """

        assert model in ['wag', 'jtt']

        self.perc_taxa_to_keep = perc_taxa_to_keep
        self.model = model
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read labels of outgroup taxa, if provided
        self.outgroup_ids = set()
        if outgroup_file:
            for line in open(outgroup_file):
                self.outgroup_ids.add(line.strip())

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # replicate calculation is currently disabled; replicate trees
        # are assumed to already exist in the replicate directory
        #***self.logger.info('Calculating jackknife taxa replicates:')
        #***parallel = Parallel(self.cpus)
        #***parallel.run(self._producer, None, range(num_replicates), self._progress)

        # determine support over the replicate trees
        rep_tree_files = [
            os.path.join(self.replicate_dir,
                         'jk_taxa.tree.{}.tre'.format(rep_index))
            for rep_index in range(num_replicates)
        ]

        tree_support = TreeSupport()
        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_taxa.tree')
        tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #7
0
File: ssu.py Project: wwood/RefineM
    def extract(self, genome_files, best_hits, output_dir):
        """Extract 16S rRNA genes.

        Parameters
        ----------
        genome_files : iterable
            Path to genome files to process.
        best_hits : d[genome_id][seq_id] -> information about best hit
            Information about best hits for each genome.
        output_dir : str
            Output directory.

        Returns
        -------
        d[genome_id] -> str
            Fasta file containing SSU sequences for each genome.
        """

        self.logger.info('Extracting SSU rRNA genes.')
        ssu_seq_files = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)

            if len(best_hits[genome_id]) == 0:
                continue

            # write summary file and putative SSU rRNAs to file
            summary_file = os.path.join(genome_dir, 'ssu.hmm_summary.tsv')
            summary_out = open(summary_file, 'w')
            summary_out.write(
                'Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n'
            )

            ssu_seq_files[genome_id] = os.path.join(genome_dir, 'ssu.fna')
            seq_out = open(ssu_seq_files[genome_id], 'w')

            seqs = seq_io.read(genome_file)

            for seq_id in best_hits[genome_id]:
                orig_seq_id = seq_id
                if '-#' in seq_id:
                    # strip the '-#' suffix distinguishing multiple hits
                    # on the same source sequence
                    seq_id = seq_id[0:seq_id.rfind('-#')]

                seq_info = [orig_seq_id] + best_hits[genome_id][orig_seq_id]
                seq = seqs[seq_id]
                # append the sequence length so the row matches the
                # 'Sequence length' column declared in the header
                summary_out.write('\t'.join(seq_info) + '\t' +
                                  str(len(seq)) + '\n')

                seq_out.write('>' + seq_info[0] + '\n')
                seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] +
                              '\n')

            summary_out.close()
            seq_out.close()

        return ssu_seq_files
Example #8
0
    def run(self, input_tree, msa_file, num_replicates, model, base_type, frac,
            output_dir):
        """Bootstrap multiple sequence alignment.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        output_dir : str
          Directory for bootstrap trees.
        """

        assert model in ['wag', 'lg', 'jtt']
        assert base_type in ['nt', 'prot']

        self.model = model
        self.base_type = base_type
        self.frac = frac

        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # infer a tree for each bootstrapped alignment in parallel
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates),
                     self._progress)

        # decorate the input tree with support values from the replicates
        rep_tree_files = [
            os.path.join(self.replicate_dir,
                         'bootstrap_tree.r_{}.tree'.format(rep_index))
            for rep_index in range(num_replicates)
        ]

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
    def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
        """Jackknife taxa.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        outgroup_file : str
          File indicating labels of outgroup taxa.
        perc_taxa_to_keep : float
          Percentage of taxa to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        output_dir : str
          Output directory for replicate trees.
        """

        assert(model in ['wag', 'jtt'])

        self.perc_taxa_to_keep = perc_taxa_to_keep
        self.model = model
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)
        # read outgroup taxa
        self.outgroup_ids = set()
        if outgroup_file:
            for line in open(outgroup_file):
                self.outgroup_ids.add(line.strip())

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates (currently disabled; replicate trees are
        # assumed to already exist in the replicate directory)
        #***self.logger.info('Calculating jackknife taxa replicates:')
        #***parallel = Parallel(self.cpus)
        #***parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support; range replaces the Python 2-only xrange
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(rep_index) + '.tre'))

        tree_support = TreeSupport()
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
        tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #10
0
    def generate(self, genome_file, gff_file):
        """Derive metadata from gene sequences.

        Parameters
        ----------
        genome_file : str
            Name of fasta file containing nucleotide sequences.
        gff_file : str
            Name of generic feature file describing genes.

        Returns
        -------
        dict : d[metadata_field] -> value
            Map of metadata fields to their respective values.
        dict : d[metadata_field] -> description
            Description of each metadata field.
        """

        gff_parser = GenericFeatureParser(gff_file)
        coding_bases = gff_parser.total_coding_bases()

        # total assembly length is needed to express coding density
        scaffolds = seq_io.read(genome_file)
        genome_size = sum(len(seq) for seq in scaffolds.values())

        gene_stats = {}
        gene_desc = {}

        # (field, value, description) triples for each metadata item
        metadata = [
            ('protein_count', gff_parser.cds_count,
             "Number of protein coding genes."),
            ('tRNA_count', gff_parser.tRNA_count,
             "Number of tRNA genes."),
            ('ncRNA_count', gff_parser.ncRNA_count,
             "Number of ncRNA genes."),
            ('rRNA_count', gff_parser.rRNA_count,
             "Number of rRNA genes."),
            ('16S_count', gff_parser.rRNA_16S_count,
             "Number of 16S rRNA genes."),
            ('coding_bases', coding_bases,
             "Number of coding bases in genome."),
            ('coding_density', float(coding_bases) * 100.0 / genome_size,
             "Percentage of coding bases in genome."),
        ]
        for field, value, desc in metadata:
            gene_stats[field] = value
            gene_desc[field] = desc

        return gene_stats, gene_desc
Example #11
0
def modify(input_file, scaffold_file, seqs_to_add, seqs_to_remove,
           output_file):
    """Add or remove scaffolds from a fasta file.

    Parameters
    ----------
    input_file : str
        Fasta file to modify.
    scaffold_file : str
        Fasta file containing scaffolds to add.
    seqs_to_add: iterable
        Unique ids of scaffolds to add.
    seqs_to_remove : iterable
        Unique ids of scaffolds to remove.
    output_file : str
        Desired name of modified fasta file.

    Returns
    -------
    iterable, iterable
        Unique ids of sequences that could not be added,
        unique ids of sequences that could not be removed.
    """

    seqs = seq_io.read(input_file)

    # add sequences to bin; the redundant '!= None' checks of the
    # original are subsumed by the truthiness test
    failed_to_add = set()
    if seqs_to_add:
        failed_to_add = set(seqs_to_add)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in seqs_to_add:
                # discard() tolerates duplicate ids in scaffold_file,
                # where remove() would raise KeyError on a second hit
                failed_to_add.discard(seq_id)
                seqs[seq_id] = seq

    # remove sequences from bin
    failed_to_remove = set()
    if seqs_to_remove:
        failed_to_remove = set(seqs_to_remove)
        for seq_id in seqs_to_remove:
            if seq_id in seqs:
                failed_to_remove.discard(seq_id)
                seqs.pop(seq_id)

    # save modified bin
    seq_io.write_fasta(seqs, output_file)

    return failed_to_add, failed_to_remove
Example #12
0
    def bootstrap(self, input_tree, msa_file, seq_type, model_str, gamma,
                  num_replicates, output_dir, cpus):
        """Perform non-parametric bootstrapping.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        seq_type : str
            Specifies multiple sequences alignment is of 'nt' or 'prot'.
        model_str : str
            Specified either the 'wag' or 'jtt' model.
        gamma : bool
            Indicates if GAMMA model should be used.
        num_replicates : int
            Number of replicates to perform.
        output_dir: str
            Output directory to contain bootstrap trees.
        cpus : int
            Number of cpus to use.
        """

        assert seq_type.upper() in ['NT', 'PROT']
        assert model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR']

        self.output_dir = output_dir
        self.seq_type = seq_type
        self.model = model_str
        self.gamma = gamma
        self.msa = seq_io.read(msa_file)

        # infer a tree for each bootstrap replicate in parallel
        parallel = Parallel(cpus)
        replicate_numbers = list(range(num_replicates))
        parallel.run(self._bootstrap, None, replicate_numbers, None)

        # decorate the input tree with support values from the replicates
        rep_tree_files = [
            os.path.join(self.output_dir, 'rep_%d' % rep, 'bootstrap.tree')
            for rep in replicate_numbers
        ]

        tree_name = os.path.splitext(os.path.basename(input_tree))[0]
        output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #13
0
    def generate(self, genome_file, gff_file):
        """Derive metadata from gene sequences.

        Parameters
        ----------
        genome_file : str
            Name of fasta file containing nucleotide sequences.
        gff_file : str
            Name of generic feature file describing genes.

        Returns
        -------
        dict : d[metadata_field] -> value
            Map of metadata fields to their respective values.
        dict : d[metadata_field] -> description
            Description of each metadata field.
        """

        gff_parser = GenericFeatureParser(gff_file)
        coding_bases = gff_parser.total_coding_bases()

        # total assembly length is needed to express coding density
        scaffolds = seq_io.read(genome_file)
        genome_size = sum(len(seq) for seq in scaffolds.values())

        gene_stats = {}
        gene_desc = {}

        def _set(field, value, description):
            # record a metadata value alongside its description
            gene_stats[field] = value
            gene_desc[field] = description

        _set('protein_count', gff_parser.cds_count,
             "Number of protein coding genes.")
        _set('tRNA_count', gff_parser.tRNA_count,
             "Number of tRNA genes.")
        _set('ncRNA_count', gff_parser.ncRNA_count,
             "Number of ncRNA genes.")
        _set('rRNA_count', gff_parser.rRNA_count,
             "Number of rRNA genes.")
        _set('16S_count', gff_parser.rRNA_16S_count,
             "Number of 16S rRNA genes.")
        _set('coding_bases', coding_bases,
             "Number of coding bases in genome.")
        _set('coding_density', float(coding_bases) * 100.0 / genome_size,
             "Percentage of coding bases in genome.")

        return gene_stats, gene_desc
Example #14
0
    def _extract(self, genome_file, best_hits, output_dir):
        """Extract rRNA genes.

        Parameters
        ----------
        genome_file : str
            Name of fasta file containing nucleotide sequences.
        best_hits : d[seq_id] -> information about best hit
            Information about best hits.
        output_dir : str
            Output directory.

        Returns
        -------
        str
            Name of fasta file containing extracted sequences.
        """

        # write summary of best hits
        summary_file = os.path.join(output_dir,
                                    '%s.hmm_summary.tsv' % self.rna_name)
        summary_out = open(summary_file, 'w')
        summary_out.write(
            'Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n'
        )

        ssu_seq_file = os.path.join(output_dir, '%s.fna' % self.rna_name)
        seq_out = open(ssu_seq_file, 'w')

        seqs = seq_io.read(genome_file)

        for hit_id in best_hits:
            # hit ids may carry a '-#' suffix distinguishing multiple
            # hits on the same source sequence; strip it to recover
            # the underlying sequence id
            src_id = hit_id
            if '-#' in src_id:
                src_id = src_id[0:src_id.rfind('-#')]

            hit_info = [hit_id] + best_hits[hit_id]
            seq = seqs[src_id]
            summary_out.write('\t'.join(hit_info) + '\t' + str(len(seq)) + '\n')

            seq_out.write('>' + hit_info[0] + '\n')
            seq_out.write(seq[int(hit_info[3]) + 1:int(hit_info[4]) + 1] + '\n')

        summary_out.close()
        seq_out.close()

        return ssu_seq_file
Example #15
0
    def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
        """Add sequences specified as compatible.

        Only sequences specified exactly once in the
        compatibility file are added.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine scaffolds compatible with genome; duplicates are
        # tracked while reading so the quadratic list.count() scan of
        # the original is avoided (dict.items also replaces the
        # Python 2-only iteritems)
        bin_ids = {}
        duplicate_ids = set()
        with open(compatible_file) as f:
            f.readline()  # skip header

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                if scaffold_id in bin_ids:
                    duplicate_ids.add(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_id not in duplicate_ids and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        # add compatible sequences to genome
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                genome_seqs[seq_id] = seq

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #16
0
    def create_records(self, metadata_file, msa_file, taxonomy_file,
                       genome_list, output_file):
        """Create ARB records from GTDB metadata.

        Parameters
        ----------
        metadata_file : str
            CSV or TSV file with metadata; first column is the genome id.
        msa_file : str
            Optional fasta file with an aligned sequence per genome.
        taxonomy_file : str
            Optional file with the GTDB taxonomy of each genome.
        genome_list : str
            Optional file listing the subset of genome ids to keep.
        output_file : str
            Name of output file containing the ARB records.
        """

        seqs = {}
        if msa_file:
            seqs = seq_io.read(msa_file)

        taxonomy = {}
        if taxonomy_file:
            taxonomy = Taxonomy().read(taxonomy_file)

        genomes_to_keep = set()
        if genome_list:
            for line in open(genome_list):
                genomes_to_keep.add(line.strip())

        fout = open(output_file, 'w')

        # infer the delimiter from the file extension
        delimiter = ','
        if metadata_file.endswith('.tsv'):
            delimiter = '\t'

        header = True
        # csv.reader requires a text-mode file opened with newline='';
        # the original binary 'rb' mode fails under Python 3
        for row in csv.reader(open(metadata_file, newline=''), delimiter=delimiter):
            if header:
                fields = [
                    f.lower().replace(' ', '_').replace('-', '_')
                    for f in row[1:]
                ]
                if taxonomy:
                    fields.append('gtdb_taxonomy')
                header = False
            else:
                genome_id = row[0]
                values = row[1:]
                if taxonomy:
                    values.append('; '.join(taxonomy[genome_id]))
                aligned_seq = seqs.get(genome_id, '')

                if not genomes_to_keep or genome_id in genomes_to_keep:
                    self._record(fout, genome_id, fields, values, aligned_seq)

        fout.close()
Example #17
0
    def bootstrap(self, input_tree, msa_file, model_str, num_replicates, output_dir, cpus):
        """Perform non-parametric bootstrapping.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        model_str : str
            Specified either the 'WAG' or 'LG' model.
        num_replicates : int
            Number of replicates to perform.
        output_dir: str
            Output directory to contain bootstrap trees.
        cpus : int
            Number of cpus to use.
        """

        check_on_path('seqmagick')

        assert model_str.upper() in ['WAG', 'LG']

        self.output_dir = output_dir
        self.model = model_str
        self.msa = seq_io.read(msa_file)

        # infer a tree for each bootstrapped alignment in parallel
        parallel = Parallel(cpus)
        replicate_numbers = list(range(num_replicates))
        parallel.run(self._bootstrap, None, replicate_numbers, None)

        # decorate the input tree with support values from the replicates
        rep_tree_files = [
            os.path.join(output_dir, 'rep_%d' % rep, 'RAxML_bestTree.support')
            for rep in replicate_numbers
        ]

        tree_name = os.path.splitext(os.path.basename(input_tree))[0]
        output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #18
0
    def remove_outliers(self, genome_file, outlier_file, out_genome,
                        modified_only):
        """Remove sequences specified as outliers.

        Any scaffolds lists in the first column of
        the outlier file are removed from the specified
        genome.

        Parameters
        ----------
        genome_file : str
            Fasta file of binned scaffolds.
        outlier_file : str
            File specifying outlying scaffolds.
        out_genome : str
            Name of output genome.
        modified_only : bool
            Only create output file if genome is modified.
        """

        genome_seqs = seq_io.read(genome_file)
        if not genome_seqs:
            return

        # remove scaffolds
        modified = False
        with open(outlier_file) as f:
            f.readline()  # skip header

            for line in f:
                # skip blank and comment lines; indexing line[0] on an
                # empty line would raise IndexError
                if not line.strip() or line.startswith('#'):
                    continue

                scaffold_id = line.split('\t')[0]
                # 'is not None' (rather than truthiness) ensures a removed
                # scaffold with an empty sequence still marks the bin modified
                if genome_seqs.pop(scaffold_id, None) is not None:
                    modified = True

        # save modified bin
        if modified or not modified_only:
            seq_io.write_fasta(genome_seqs, out_genome)
Example #19
0
    def bootstrap(self, input_tree, msa_file, seq_type, model_str, num_replicates, output_tree, cpus):
        """Perform non-parametric bootstrapping.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        seq_type : str
            Specifies multiple sequences alignment is of 'nt' or 'prot'.
        model_str : str
            Specified either the 'wag' or 'jtt' model.
        num_replicates : int
            Number of replicates to perform.
        output_tree: str
            Output file containing tree with bootstrap values.
        cpus : int
            Number of cpus to use.
        """

        assert(seq_type in ['nt', 'prot'])
        assert(model_str in ['wag', 'jtt'])

        self.replicate_dir = tempfile.mkdtemp()
        self.seq_type = seq_type
        self.model = model_str
        self.msa = seq_io.read(msa_file)

        # calculate replicates; range replaces the Python 2-only xrange
        parallel = Parallel(cpus)
        parallel.run(self._bootstrap, None, range(num_replicates), None)

        # calculate support values
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap.tree.' + str(rep_index) + '.tre'))

        bootstrap_support(input_tree, rep_tree_files, output_tree)

        # remove temporary replicate trees
        shutil.rmtree(self.replicate_dir)
Example #20
0
    def create_records(self, metadata_file, msa_file, taxonomy_file, genome_list, output_file):
        """Create ARB records from GTDB metadata.

        Parameters
        ----------
        metadata_file : str
            CSV or TSV file with genome metadata; first column is the genome ID.
        msa_file : str
            Optional fasta file with aligned sequences for genomes.
        taxonomy_file : str
            Optional file with taxonomy strings for genomes.
        genome_list : str
            Optional file listing subset of genomes to retain.
        output_file : str
            Name of output file with ARB records.
        """

        seqs = {}
        if msa_file:
            seqs = seq_io.read(msa_file)

        taxonomy = {}
        if taxonomy_file:
            taxonomy = Taxonomy().read(taxonomy_file)

        genomes_to_keep = set()
        if genome_list:
            # context manager ensures the handle is closed promptly
            with open(genome_list) as f:
                for line in f:
                    genomes_to_keep.add(line.strip())

        # metadata may be comma or tab delimited
        delimiter = '\t' if metadata_file.endswith('.tsv') else ','

        # NOTE(review): 'rb' mode for csv.reader is Python 2 specific;
        # under Python 3 this should be open(metadata_file, newline='')
        with open(output_file, 'w') as fout, \
                open(metadata_file, 'rb') as f_metadata:
            header = True
            for row in csv.reader(f_metadata, delimiter=delimiter):
                if header:
                    # sanitize field names for use as ARB record keys
                    fields = [f.lower().replace(' ', '_').replace('-', '_') for f in row[1:]]
                    if taxonomy:
                        fields.append('gtdb_taxonomy')
                    header = False
                else:
                    genome_id = row[0]
                    values = row[1:]
                    if taxonomy:
                        values.append('; '.join(taxonomy[genome_id]))
                    aligned_seq = seqs.get(genome_id, '')

                    if not genomes_to_keep or genome_id in genomes_to_keep:
                        self._record(fout, genome_id, fields, values, aligned_seq)
Example #21
0
    def manual(self, options):
        """Manual command: partition scaffolds into clusters.

        Scaffolds are assigned to the cluster given in the second column
        of the cluster file; negative cluster IDs indicate scaffolds that
        should be left unbinned.
        """

        check_file_exists(options.cluster_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        genome_id = remove_extension(options.genome_file)

        seqs = seq_io.read(options.genome_file)
        fout = {}
        with open(options.cluster_file) as f:
            f.readline()  # skip header

            for line in f:
                line_split = line.rstrip().split('\t')
                scaffold_id = line_split[0]
                cluster_id = int(line_split[1])

                if cluster_id < 0:
                    # negative values indicate scaffolds that should
                    # not be placed in a cluster
                    continue

                if cluster_id not in fout:
                    fout[cluster_id] = open(
                        os.path.join(options.output_dir,
                                     genome_id + '_c%d.fna' % cluster_id), 'w')

                # use a distinct name for the output handle; the original
                # code rebound 'f', shadowing the input file handle
                cluster_out = fout[cluster_id]
                cluster_out.write('>' + scaffold_id + '\n')
                cluster_out.write(seqs[scaffold_id] + '\n')

        for cluster_out in fout.values():
            cluster_out.close()

        self.logger.info('Partitioned sequences written to: ' +
                         options.output_dir)
Example #22
0
    def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        A sequence is added to a bin if and only if it is
        closest to that bin in GC, tetranuclotide, and
        coverage space.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # gather GC, tetranucleotide, and coverage distances between each
        # potentially compatible scaffold and every candidate bin
        scaffold_ids = defaultdict(dict)
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                fields = line.split('\t')
                scaffold_id = fields[0]
                bin_id = fields[1].strip()

                gc_dist = abs(float(fields[scaffold_gc_index]) -
                              float(fields[genome_gc_index]))
                td_dist = float(fields[td_dist_index])
                cov_dist = abs(float(fields[scaffold_cov_index]) -
                               float(fields[genome_cov_index]))

                scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

        # a scaffold is compatible only when the current bin is the
        # closest bin under all three metrics simultaneously
        compatible_scaffolds = set()
        for scaffold_id, bin_stats in scaffold_ids.items():
            closest_bins = []
            for metric_index in range(3):
                best_bin = None
                best_dist = 1e9
                # strict '<' keeps the first bin encountered on ties,
                # matching dictionary iteration order
                for bin_id, dists in bin_stats.items():
                    if dists[metric_index] < best_dist:
                        best_dist = dists[metric_index]
                        best_bin = bin_id
                closest_bins.append(best_bin)

            if closest_bins[0] == closest_bins[1] == closest_bins[2] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

        # merge sufficiently long compatible sequences into the genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds and len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
    def run(self, input_tree, msa_file, marker_info_file, mask_file,
            perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        jk_dir : str
          Directory with previously computed jackknife trees; if given,
          replicate calculation is skipped and these trees are used.
        output_dir : str
          Output directory for jackkife trees.
        """

        assert(model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        # determine length of each marker gene in alignment
        rep_tree_files = []
        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as f:
                f.readline()  # skip header
                for line in f:
                    line_split = line.split('\t')
                    ml = int(line_split[3])
                    marker_lengths.append(ml)
                    total_len += ml

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask and determine post-masking length of each marker;
            # '0' columns in the mask are filtered from the alignment
            mask = open(mask_file).readline().strip()
            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end

                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros

            self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # sanity check: aligned sequences must match the masked length;
            # next(iter(...)) works under both Python 2 and 3, unlike
            # indexing dict.values()
            if len(next(iter(self.msa.values()))) != total_mask_len:
                self.logger.error('Length of MSA does not meet length of mask.')
                sys.exit()

            # calculate replicates
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' % num_replicates)
            for rep_index in range(num_replicates):
                rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
        else:
            # reuse previously computed jackknife trees
            for f in os.listdir(jk_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(jk_dir, f))
            self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #24
0
    def run(self, input_tree, msa_file, num_replicates, model, gamma,
            base_type, frac, boot_dir, output_dir):
        """Bootstrap multiple sequence alignment.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        gamma : boolean
          Flag passed through to replicate tree inference
          (rate heterogeneity model).
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        boot_dir : str
          Directory with previously computed bootstrap trees; if given,
          replicate calculation is skipped and these trees are used.
        output_dir : str
          Directory for bootstrap trees.
        """

        assert(model in ['wag', 'lg', 'jtt'])
        assert(base_type in ['nt', 'prot'])

        self.model = model
        self.gamma = gamma
        self.base_type = base_type
        self.frac = frac

        rep_tree_files = []
        if not boot_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # calculate replicates; range() works under both Python 2 and 3
            self.logger.info('Calculating bootstrap replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            for rep_index in range(num_replicates):
                rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(rep_index) + '.tree'))
        else:
            # reuse previously computed bootstrap trees
            for f in os.listdir(boot_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(boot_dir, f))
            self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

        # calculate support values
        self.logger.info('Calculating bootstrap support values.')
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #25
0
    def _dump_seqs(self, genomic_file, gtdb_taxonomy, genomes_of_interest,
                   prefix, min_ar_gene_len, min_bac_gene_len, min_contig_len,
                   output_prefix, output_dir):
        """Write genes passing length criteria to domain-specific files.

        Genes from each genome are routed to archaeal or bacterial
        summary, fasta, and taxonomy files based on the genome's
        GTDB taxonomy.

        Parameters
        ----------
        genomic_file : str
            File with whitespace-separated genome ID and genome path per line.
        gtdb_taxonomy : dict
            Taxonomy strings indexed by genome ID.
        genomes_of_interest : set
            If non-empty, restrict processing to these genome IDs.
        prefix : str
            Filename prefix of per-genome HMM summary and fasta files.
        min_ar_gene_len : int
            Minimum gene length for archaeal genes.
        min_bac_gene_len : int
            Minimum gene length for bacterial genes.
        min_contig_len : int
            Minimum length of contig containing a gene.
        output_prefix : str
            Prefix for output files.
        output_dir : str
            Directory to write output files.
        """

        fout_ar_summary = open(
            os.path.join(output_dir, output_prefix + '_ar.tsv'), 'w')
        fout_ar_fna = open(os.path.join(output_dir, output_prefix + '_ar.fna'),
                           'w')
        fout_ar_taxonomy = open(
            os.path.join(output_dir, output_prefix + '_ar_taxonomy.tsv'), 'w')

        fout_bac_summary = open(
            os.path.join(output_dir, output_prefix + '_bac.tsv'), 'w')
        fout_bac_fna = open(
            os.path.join(output_dir, output_prefix + '_bac.fna'), 'w')
        fout_bac_taxonomy = open(
            os.path.join(output_dir, output_prefix + '_bac_taxonomy.tsv'), 'w')

        write_header = True
        total_seq = 0
        for line in open(genomic_file):
            gid, genome_path = [t.strip() for t in line.split()]

            # normalize GenBank/RefSeq accessions to GTDB-style IDs
            if gid.startswith('GCA_'):
                gid = 'GB_' + gid
            elif gid.startswith('GCF_'):
                gid = 'RS_' + gid

            if genomes_of_interest and gid not in genomes_of_interest:
                continue

            # route output to archaeal or bacterial files
            if 'd__Archaea' in gtdb_taxonomy[gid]:
                fout_summary = fout_ar_summary
                fout_fna = fout_ar_fna
                fout_taxonomy = fout_ar_taxonomy
                min_gene_len = min_ar_gene_len
            else:
                fout_summary = fout_bac_summary
                fout_fna = fout_bac_fna
                fout_taxonomy = fout_bac_taxonomy
                min_gene_len = min_bac_gene_len

            # extract sequences; genomes without a HMM summary are skipped
            hmm_summary = os.path.join(genome_path,
                                       prefix + '.hmm_summary.tsv')
            if not os.path.exists(hmm_summary):
                continue

            seqs = seq_io.read(os.path.join(genome_path, prefix + '.fna'))
            gene_count = 0
            with open(hmm_summary) as f:
                header = f.readline()
                if write_header:
                    write_header = False
                    fout_summary.write('%s\t%s' % ('Gene ID', header))

                for line in f:
                    line_split = line.strip().split('\t')
                    gene_id = line_split[0]
                    gene_len = int(line_split[5])
                    contig_len = int(line_split[-1])

                    if gene_len >= min_gene_len and contig_len >= min_contig_len:
                        unique_gene_id = '%s~gene_%s' % (gid, gene_count)
                        fout_summary.write('%s\t%s' % (unique_gene_id, line))
                        fout_fna.write('>%s [%s]\n' %
                                       (unique_gene_id, gene_id))
                        fout_fna.write(seqs[gene_id] + '\n')
                        fout_taxonomy.write(
                            '%s\t%s\n' %
                            (unique_gene_id, '; '.join(gtdb_taxonomy[gid])))

                        gene_count += 1
                        total_seq += 1

        fout_ar_summary.close()
        fout_ar_fna.close()
        fout_ar_taxonomy.close()

        fout_bac_summary.close()
        fout_bac_fna.close()
        fout_bac_taxonomy.close()

        self.logger.info('Wrote %d sequences.' % total_seq)
Example #26
0
    def generate(self, genome_file, contig_break):
        """Derive metadata across nucleotide sequences.

        Parameters
        ----------
        genome_file : str
            Name of fasta file containing nucleotide sequences.
        contig_break : int
            Minimum number of ambiguous bases for defining contigs.

        Returns
        -------
        dict : d[metadata_field] -> value
            Map of metadata fields to their respective values.
        dict : d[metadata_field -> description
            Description of each metadata field.
        """

        # calculate nucleotide statistics
        scaffolds = seq_io.read(genome_file)

        nuc_stats = {}
        nuc_desc = {}

        def _set(field, value, desc):
            # record a metadata value together with its description
            nuc_stats[field] = value
            nuc_desc[field] = desc

        # scaffold-level statistics
        _set('scaffold_count', len(scaffolds),
             "Number of scaffolds in genome.")
        _set('gc_count', genome_tk.gc_count(scaffolds),
             "Number of G or C bases in genome.")
        _set('gc_percentage', genome_tk.gc(scaffolds) * 100.0,
             "GC content of genome.")
        _set('genome_size', sum(len(x) for x in list(scaffolds.values())),
             "Total base pairs in genome including nucleotide bases, ambiguous bases, and gaps.")

        n50_scaffolds = seq_tk.N50(scaffolds)
        _set('n50_scaffolds', n50_scaffolds,
             "Scaffold length at which 50% of total bases in assembly are in scaffolds of that length or greater.")
        _set('l50_scaffolds', seq_tk.L50(scaffolds, n50_scaffolds),
             "Number of scaffolds longer than, or equal to, the scaffold N50 length.")
        _set('mean_scaffold_length', seq_tk.mean_length(scaffolds),
             "Mean length of scaffolds in base pairs.")
        _set('longest_scaffold', seq_tk.max_length(scaffolds),
             "Number of bases in longest scaffold.")

        # contig-level statistics; contigs are defined by runs of
        # at least contig_break ambiguous bases
        contigs = seq_tk.identify_contigs(scaffolds, 'N' * contig_break)
        _set('contig_count', len(contigs),
             "Number of contigs in genome.")

        ambiguous_bases = genome_tk.ambiguous_nucleotides(contigs)
        _set('ambiguous_bases', ambiguous_bases,
             "Number of ambiguous bases in contigs.")
        _set('total_gap_length',
             genome_tk.ambiguous_nucleotides(scaffolds) - ambiguous_bases,
             "Number of ambiguous bases comprising gaps in scaffolds.")

        n50_contigs = seq_tk.N50(contigs)
        _set('n50_contigs', n50_contigs,
             "Contig length at which 50% of total bases in assembly are in contigs of that length or greater.")
        _set('l50_contigs', seq_tk.L50(contigs, n50_contigs),
             "Number of contigs longer than, or equal to, the contig N50 length.")
        _set('mean_contig_length', seq_tk.mean_length(contigs),
             "Mean length of contigs in base pairs.")
        _set('longest_contig', seq_tk.max_length(contigs),
             "Number of bases in longest contig.")

        return nuc_stats, nuc_desc
Example #27
0
    def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of cluster to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature.
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('')
        self.logger.info('  Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)

        # fix iteration order once so feature rows can be mapped back to
        # sequence IDs when writing clusters (works on Python 2 and 3,
        # unlike indexing dict.keys())
        seq_ids = list(seqs)
        for seq_id in seq_ids:
            seq = seqs[seq_id]
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                # tetranucleotide signatures are precomputed
                signature_matrix.append(stats.signature)
            else:
                # compute and normalize k-mer signature for this k
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of tetranucleotide signatures
        if K != 0:
            if not no_pca:
                self.logger.info('  Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info('    First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('  Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            # use logger for consistency (was a Python 2 print statement)
            self.logger.info('  Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster; kmeans2 raises ClusterError when a cluster loses all
        # members under missing='raise', so retry with new random points
        self.logger.info('  Partitioning genome into %d clusters.' % num_clusters)

        while True:
            try:
                _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
                break
            except ClusterError:
                continue

        for k in range(num_clusters):
            self.logger.info('    Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seq_ids[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
    def run(self, input_tree, msa_file, marker_info_file, mask_file,
            perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        jk_dir : str
          Directory with previously computed jackknife trees; if given,
          replicate calculation is skipped and these trees are used.
        output_dir : str
          Output directory for jackkife trees.
        """

        assert (model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        # determine length of each marker gene in alignment
        rep_tree_files = []
        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as f:
                f.readline()  # skip header
                for line in f:
                    line_split = line.split('\t')
                    ml = int(line_split[3])
                    marker_lengths.append(ml)
                    total_len += ml

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask; '0' columns are filtered from the alignment, so
            # compute the post-masking length of each marker
            mask = open(mask_file).readline().strip()
            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end

                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros

            self.logger.info('Concatenated length of filtered MSA: %d' %
                             total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # sanity check: aligned sequences must match the masked length;
            # next(iter(...)) works under both Python 2 and 3, unlike
            # indexing dict.values()
            if len(next(iter(self.msa.values()))) != total_mask_len:
                self.logger.error(
                    'Length of MSA does not meet length of mask.')
                sys.exit()

            # calculate replicates
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates),
                         self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' %
                             num_replicates)
            for rep_index in range(num_replicates):
                rep_tree_files.append(
                    os.path.join(self.replicate_dir,
                                 'jk_markers.tree.' + str(rep_index) + '.tre'))
        else:
            # reuse previously computed jackknife trees
            for f in os.listdir(jk_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(jk_dir, f))
            self.logger.info('Read %d jackknife replicates.' %
                             len(rep_tree_files))

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #29
0
    def split(self, scaffold_stats, criteria1, criteria2, genome_file,
              output_dir):
        """Split genome into two based on genomic feature.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        criteria1 : str
            First criteria used for splitting genome.
        criteria2 : str
           Second criteria used for splitting genome.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        seqs = seq_io.read(genome_file)

        # calculate PCA if necessary
        if 'pc' in criteria1 or 'pc' in criteria2:
            self.logger.info('Performing PCA.')
            # NOTE: the original constructed an unused GenomicSignature(K)
            # here, where K was undefined (NameError); the PCA only needs
            # the precomputed signatures in scaffold_stats
            signature_matrix = []
            for seq_id in seqs:
                stats = scaffold_stats.stats[seq_id]
                signature_matrix.append(stats.signature)

            pc, _variance = self.pca(signature_matrix)
            for i, seq_id in enumerate(seqs):
                scaffold_stats.stats[seq_id].pc1 = pc[i][0]
                scaffold_stats.stats[seq_id].pc2 = pc[i][1]
                scaffold_stats.stats[seq_id].pc3 = pc[i][2]

        # split bin
        genome_id = remove_extension(genome_file)
        fout1 = open(os.path.join(output_dir, genome_id + '_c1.fna'), 'w')
        fout2 = open(os.path.join(output_dir, genome_id + '_c2.fna'), 'w')

        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            meet_criteria = True
            for criteria in [criteria1, criteria2]:
                # SECURITY: criteria strings are eval'd (builtins removed);
                # callers must never pass untrusted input here
                if 'gc' in criteria:
                    v = eval(criteria.replace('gc', str(stats.gc)),
                             {"__builtins__": {}})
                elif 'coverage' in criteria:
                    v = eval(criteria.replace('coverage', str(stats.coverage)),
                             {"__builtins__": {}})
                elif 'pc1' in criteria:
                    v = eval(criteria.replace('pc1', str(stats.pc1)),
                             {"__builtins__": {}})
                elif 'pc2' in criteria:
                    v = eval(criteria.replace('pc2', str(stats.pc2)),
                             {"__builtins__": {}})
                elif 'pc3' in criteria:
                    v = eval(criteria.replace('pc3', str(stats.pc3)),
                             {"__builtins__": {}})
                else:
                    # previously 'v' was left unbound here, causing a
                    # NameError (or reuse of a stale value) downstream
                    raise ValueError('Unrecognized splitting criteria: %s' % criteria)

                meet_criteria = meet_criteria and v

            if meet_criteria:
                fout1.write('>' + seq_id + '\n')
                fout1.write(seqs[seq_id] + '\n')
            else:
                fout2.write('>' + seq_id + '\n')
                fout2.write(seqs[seq_id] + '\n')

        fout1.close()
        fout2.close()
Example #30
0
    def kmeans(self, scaffold_stats, num_clusters, num_components, K,
               no_coverage, no_pca, iterations, genome_file, output_dir):
        """Cluster genome with k-means.

        Builds a feature vector for each scaffold from its mean coverage
        and/or its genomic (k-mer) signature, optionally reduces the
        signature with PCA, and partitions scaffolds with k-means.
        One FASTA file per cluster is written to the output directory
        (named <genome_id>_c<k>.fna).

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature
            (0 disables signatures; 4 uses precomputed tetranucleotide
            signatures from scaffold_stats).
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations : int
            Iterations to perform during clustering.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get mean coverage and genomic signature for each scaffold in genome
        self.logger.info('Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                # empty tuple: signature features appended below via np_append
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                # tetranucleotide signatures are precomputed in scaffold_stats
                signature_matrix.append(stats.signature)
            else:
                # compute k-mer signature and normalize to relative frequencies
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of signatures
        if K != 0:
            if not no_pca:
                self.logger.info('Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info(
                    'First {:,} PCs capture {:.1f}% of the variance.'.format(
                        num_components,
                        sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            self.logger.info('Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster; retry on ClusterError, which kmeans2 raises when a
        # cluster loses all of its members (missing='raise')
        self.logger.info(
            'Partitioning genome into {:,} clusters.'.format(num_clusters))

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats,
                                             num_clusters,
                                             iterations,
                                             minit='points',
                                             missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
                sum(labels == k), (k + 1)))

        # write out clusters; materialize scaffold ids once since dict
        # views are not subscriptable in Python 3 (seqs.keys()[i] fails)
        seq_ids = list(seqs)
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(
                os.path.join(output_dir,
                             genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seq_ids[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
Example #31
0
    def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
        """Add sequences specified as compatible.

        A sequence is added to a bin if and only if it is
        closest to that bin in GC, tetranucleotide, and
        coverage space.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        # scaffold_id -> bin_id -> [gc_dist, td_dist, cov_dist]
        scaffold_ids = defaultdict(dict)
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Mean genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Mean scaffold coverage')
            genome_cov_index = headers.index('Mean genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

        # determine scaffolds that are closest to a single bin
        # in terms of GC, tetranucleotide distance, and coverage
        # (Python 3: use items(); iteritems() was removed)
        compatible_scaffolds = set()
        for scaffold_id, bin_stats in scaffold_ids.items():
            best_gc = [1e9, None]
            best_td = [1e9, None]
            best_cov = [1e9, None]
            for bin_id, stats in bin_stats.items():
                gc, td, cov = stats
                if gc < best_gc[0]:
                    best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin,
            # and that bin is the one currently being processed
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        # add compatible sequences to genome
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                genome_seqs[seq_id] = seq

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #32
0
    def run(self, msa_file, tree_program, prot_model, skip_rooting,
            output_dir):
        """Infer tree.

        Parameters
        ----------
        msa_file : str
          Multiple sequence alignment in fasta format.
        tree_program : str
          Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
          Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : boolean
          Flag indicating if midpoint rooting should be skipped.
        output_dir : str
          Directory to store results.

        Returns
        -------
        str
            Path to inferred (rooted or unrooted) tree.

        Raises
        ------
        SystemExit
            If the MSA contains too few sequences to infer a tree.
        ValueError
            If an unrecognized tree program is specified.
        """

        num_seqs = sum([1 for _, _ in seq_io.read_seq(msa_file)])
        if num_seqs <= 2:
            self.logger.error(
                'Insufficient number of sequences in MSA to infer tree.')
            raise SystemExit('Tree inference failed.')

        output_file = ntpath.basename(msa_file)
        prefix = output_file[0:output_file.rfind('.')]
        suffix = output_file[output_file.rfind('.') + 1:]

        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                prefix + '.unrooted.tree')
            tree_log = os.path.join(output_dir, prefix + '.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file required by RAxML
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
            tree_output_log = os.path.join(output_dir, 'raxml.log')

            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)
        else:
            # fail fast: otherwise tree_unrooted_output is unbound below
            raise ValueError('Unrecognized tree program: %s' % tree_program)

        # root tree at midpoint; only write the rooted tree when one was
        # actually built (previously 'tree' could be unbound here)
        if not skip_rooting:
            seqs = seq_io.read(msa_file)
            if len(seqs) > 2:
                self.logger.info('Rooting tree at midpoint.')
                tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                                   schema='newick',
                                                   rooting="force-rooted",
                                                   preserve_underscores=True)
                tree.reroot_at_midpoint(update_bipartitions=False)

                tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
                tree.write_to_path(tree_output,
                                   schema='newick',
                                   suppress_rooting=True,
                                   unquoted_underscores=True)
            else:
                # too few sequences to root meaningfully
                tree_output = tree_unrooted_output
        else:
            tree_output = tree_unrooted_output

        return tree_output