コード例 #1
0
    def codon_usage(self, seqs, genome_id):
        """Calculate codon usage statistics for genome.

        Codons are tallied per gene and across all genes, each gene is
        compared with the genome-wide profile via ``self._manhattan``, and
        the result is written as a tab-separated table to
        ``<self.output_dir>/<genome_id>.codon_usage.tsv``. The first data row
        summarises the complete genome; gene rows follow in descending order
        of the value ``self._manhattan`` reports for each gene.

        Parameters
        ----------
        seqs : dict[seq_id] -> seq
            Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate codon usage for each gene and the genome as a whole
        gene_codon_usage = defaultdict(lambda: defaultdict(int))
        genome_codon_usage = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            # The loop stops before the last three bases, so when the length
            # is a multiple of three the final codon is not counted
            # (presumably to skip the stop codon -- TODO confirm).
            for i in range(0, len(seq) - 3, 3):
                codon = seq[i:i + 3].upper()
                if 'N' not in codon:
                    gene_codon_usage[gene_id][codon] += 1
                    genome_codon_usage[codon] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage)

        # report codon usage of each gene; columns follow the sorted set of
        # codons observed anywhere in the genome
        codon_set_sorted = sorted(genome_codon_usage.keys())
        gene_ids_sorted = sorted(manhattan_dist.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_length = sum(len(x) for x in seqs.values())
        genome_sum_codon = sum(genome_codon_usage.values())

        output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv')

        # the context manager closes the file even if a write raises
        with open(output_file, 'w') as fout:
            fout.write('Gene Id\tGC\tLength (bp)\t# codons\tManhattan distance\tDeviations from mean')
            for codon in codon_set_sorted:
                fout.write('\t' + codon)
            fout.write('\n')

            # genome-wide summary row; its distance columns are fixed at zero
            fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, genome_length, genome_sum_codon))
            fout.write('\t%.1f\t%.1f' % (0, 0))
            for codon in codon_set_sorted:
                fout.write('\t%.2f' % (genome_codon_usage.get(codon, 0) * 100.0 / genome_sum_codon))
            fout.write('\n')

            # one row per gene, with codon usage expressed as a percentage
            for gene_id, dists in gene_ids_sorted:
                codons = gene_codon_usage[gene_id]
                sum_codons = sum(codons.values())
                fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_codons))
                fout.write('\t%.2f\t%.2f' % (dists[0], dists[1]))

                for codon in codon_set_sorted:
                    fout.write('\t%.2f' % (codons.get(codon, 0) * 100.0 / sum_codons))
                fout.write('\n')
コード例 #2
0
ファイル: lgt_codon.py プロジェクト: ctSkennerton/CompareM
    def codon_usage(self, seqs, genome_id):
        """Calculate codon usage statistics for genome.

        Per-gene and genome-wide codon counts are collected, each gene is
        scored against the genome profile with ``self._manhattan``, and a
        tab-separated report is written to
        ``<self.output_dir>/<genome_id>.codon_usage.tsv``. The report starts
        with a header and a ``<complete genome>`` row, followed by one row
        per gene sorted in descending order of its ``self._manhattan`` value.

        Parameters
        ----------
        seqs : dict[seq_id] -> seq
            Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate codon usage for each gene and the genome as a whole
        gene_codon_usage = defaultdict(lambda: defaultdict(int))
        genome_codon_usage = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            # Iteration ends before the last three bases; for sequences whose
            # length is a multiple of three this leaves out the final codon
            # (presumably the stop codon -- TODO confirm).
            for i in range(0, len(seq) - 3, 3):
                codon = seq[i:i + 3].upper()
                if 'N' not in codon:
                    gene_codon_usage[gene_id][codon] += 1
                    genome_codon_usage[codon] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage)

        # report codon usage of each gene; one column per codon seen in the
        # genome, in sorted order
        codon_set_sorted = sorted(genome_codon_usage.keys())
        gene_ids_sorted = sorted(manhattan_dist.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_length = sum(len(x) for x in seqs.values())
        genome_sum_codon = sum(genome_codon_usage.values())

        output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv')

        # 'with' guarantees the handle is released if formatting or writing fails
        with open(output_file, 'w') as fout:
            fout.write('Gene Id\tGC\tLength (bp)\t# codons\tManhattan distance\tDeviations from mean')
            for codon in codon_set_sorted:
                fout.write('\t' + codon)
            fout.write('\n')

            # summary row for the whole genome; distance columns are zero
            fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, genome_length, genome_sum_codon))
            fout.write('\t%.1f\t%.1f' % (0, 0))
            for codon in codon_set_sorted:
                fout.write('\t%.2f' % (genome_codon_usage.get(codon, 0) * 100.0 / genome_sum_codon))
            fout.write('\n')

            # per-gene rows with codon usage as a percentage of that gene's codons
            for gene_id, dists in gene_ids_sorted:
                codons = gene_codon_usage[gene_id]
                sum_codons = sum(codons.values())
                fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_codons))
                fout.write('\t%.2f\t%.2f' % (dists[0], dists[1]))

                for codon in codon_set_sorted:
                    fout.write('\t%.2f' % (codons.get(codon, 0) * 100.0 / sum_codons))
                fout.write('\n')
コード例 #3
0
ファイル: scaffold_stats.py プロジェクト: AlexRBaker/RefineM
    def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file):
        """Calculate statistics for scaffolds.

        Writes one tab-separated row per scaffold in `scaffold_file` holding
        its id, the genome (bin) it was found in, GC percentage, length,
        optional per-BAM coverage values, and its tetranucleotide signature.
        Scaffolds absent from every genome file are labelled with
        ``self.unbinned``. Signatures (and coverage profiles, when given) are
        looked up by direct indexing on the scaffold id.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds.
        genome_files : list of str
            Fasta files with binned scaffolds.
        tetra_file : str
            Tetranucleotide signatures for scaffolds.
        coverage_file : str
            Coverage profiles for scaffolds; a falsy value omits the
            coverage columns.
        output_file : str
            Output file for scaffolds statistics.
        """

        tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        self.logger.info('')
        self.logger.info('  Determining scaffold statistics.')

        # determine bin assignment for each scaffold
        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # Coverage column names come from the first profile. next(iter(...))
        # works on both Python 2 and 3, unlike indexing dict.keys().
        bam_ids = []
        if cov_profiles:
            first_scaffold_id = next(iter(cov_profiles))
            bam_ids = sorted(cov_profiles[first_scaffold_id].keys())

        # write out scaffold statistics; the context manager closes the file
        # even if a lookup or write raises part-way through
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)
            for kmer in tetra.canonical_order():
                fout.write('\t' + kmer)
            fout.write('\n')

            for scaffold_id, seq in seq_io.read_seq(scaffold_file):
                fout.write(scaffold_id)
                fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
                fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
                fout.write('\t%d' % len(seq))

                # bam_ids is empty when no coverage profiles were loaded
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

                fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
                fout.write('\n')
コード例 #4
0
ファイル: lgt_dinucleotide.py プロジェクト: swmuyu/CompareM
    def dinucleotide_usage(self, seqs, genome_id):
        """Calculate dinucleotide (n3:n1) usage statistics for genome.

        Dinucleotides are sampled every three bases starting at index 2 and
        tallied per gene and genome-wide. Each gene is then scored with
        ``self._manhattan`` and ``self._hotelling_statistic``, and a
        tab-separated report is written to
        ``<self.output_dir>/<genome_id>.di_usage.tsv``. Gene rows are sorted
        by descending Hotelling T-squared statistic and flagged as deviant
        when that statistic exceeds a threshold derived from
        ``self.critical_value``.

        Parameters
        ----------
        seqs : dict[seq_id] -> seq
            Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate dinucleotide usage for each gene and the genome as a whole
        gene_di_usage = defaultdict(lambda: defaultdict(int))
        gene_n1 = defaultdict(lambda: defaultdict(int))
        gene_n3 = defaultdict(lambda: defaultdict(int))

        genome_di_usage = defaultdict(int)
        genome_n1 = defaultdict(int)
        genome_n3 = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            # indices 2, 5, 8, ...: the first base of each pair is tallied as
            # n3 and the second as n1
            for i in range(2, len(seq) - 2, 3):
                dinucleotide = seq[i:i + 2].upper()
                if 'N' not in dinucleotide:
                    gene_di_usage[gene_id][dinucleotide] += 1
                    gene_n3[gene_id][dinucleotide[0]] += 1
                    gene_n1[gene_id][dinucleotide[1]] += 1

                    genome_di_usage[dinucleotide] += 1
                    genome_n3[dinucleotide[0]] += 1
                    genome_n1[dinucleotide[1]] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage)

        # identify deviant genes under Hotelling T-squared statistic
        t2_stats = self._hotelling_statistic(gene_di_usage, gene_n3, gene_n1,
                                             genome_di_usage, genome_n3,
                                             genome_n1)

        # correction from T-squared distribution to F-distribution approximation
        # http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution
        # NOTE(review): 15 is presumably the 16 possible dinucleotides minus
        # one -- confirm.
        num_genes = len(gene_di_usage)
        f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes)
        deviant_threshold = f.isf(self.critical_value, 15,
                                  num_genes - 15 + 1) / f_dist_correction

        # report dinucleotide usage of each gene
        di_set_sorted = sorted(genome_di_usage.keys())
        gene_ids_sorted = sorted(t2_stats.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_length = sum(len(x) for x in seqs.values())
        genome_sum_di = sum(genome_di_usage.values())

        output_file = os.path.join(self.output_dir,
                                   genome_id + '.di_usage.tsv')

        # the context manager closes the file even if a write raises
        with open(output_file, 'w') as fout:
            fout.write(
                'Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean'
            )
            for di in di_set_sorted:
                fout.write('\t' + di)
            fout.write('\n')

            # genome-wide summary row; the per-gene statistics do not apply
            fout.write('%s\t%.2f\t%d\t%d' %
                       ('<complete genome>', genome_gc * 100.0,
                        genome_length, genome_sum_di))
            fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0))
            for di in di_set_sorted:
                fout.write('\t%.2f' %
                           (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di))
            fout.write('\n')

            # one row per gene, with usage as a percentage of its dinucleotides
            for gene_id, t2_stat in gene_ids_sorted:
                dinucleotides = gene_di_usage[gene_id]
                sum_di = sum(dinucleotides.values())
                fout.write(
                    '%s\t%.2f\t%d\t%d' %
                    (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di))
                fout.write(
                    '\t%.2f\t%s' %
                    (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no'))
                fout.write(
                    '\t%.2f\t%.2f' %
                    (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1]))

                for di in di_set_sorted:
                    fout.write('\t%.2f' %
                               (dinucleotides.get(di, 0) * 100.0 / sum_di))
                fout.write('\n')
コード例 #5
0
    def dinucleotide_usage(self, seqs, genome_id):
        """Calculate dinucleotide (n3:n1) usage statistics for genome.

        Dinucleotides taken every three bases from index 2 onward are counted
        per gene and across the genome. Genes are scored with
        ``self._manhattan`` and ``self._hotelling_statistic`` and the results
        are written as a tab-separated table to
        ``<self.output_dir>/<genome_id>.di_usage.tsv``, sorted by descending
        Hotelling T-squared statistic. A gene is marked deviant when its
        statistic exceeds a threshold computed from ``self.critical_value``.

        Parameters
        ----------
        seqs : dict[seq_id] -> seq
            Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate dinucleotide usage for each gene and the genome as a whole
        gene_di_usage = defaultdict(lambda: defaultdict(int))
        gene_n1 = defaultdict(lambda: defaultdict(int))
        gene_n3 = defaultdict(lambda: defaultdict(int))

        genome_di_usage = defaultdict(int)
        genome_n1 = defaultdict(int)
        genome_n3 = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            # Start at index 2 and step by 3; the first base of each pair is
            # tallied as n3 and the second as n1. range() is used instead of
            # xrange() so the loop runs on both Python 2 and 3.
            for i in range(2, len(seq) - 2, 3):
                dinucleotide = seq[i:i + 2].upper()
                if 'N' not in dinucleotide:
                    gene_di_usage[gene_id][dinucleotide] += 1
                    gene_n3[gene_id][dinucleotide[0]] += 1
                    gene_n1[gene_id][dinucleotide[1]] += 1

                    genome_di_usage[dinucleotide] += 1
                    genome_n3[dinucleotide[0]] += 1
                    genome_n1[dinucleotide[1]] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage)

        # identify deviant genes under Hotelling T-squared statistic
        t2_stats = self._hotelling_statistic(gene_di_usage, gene_n3, gene_n1,
                                             genome_di_usage, genome_n3, genome_n1)

        # correction from T-squared distribution to F-distribution approximation
        # http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution
        # NOTE(review): 15 is presumably the 16 possible dinucleotides minus
        # one -- confirm.
        num_genes = len(gene_di_usage)
        f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes)
        deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction

        # report dinucleotide usage of each gene
        di_set_sorted = sorted(genome_di_usage.keys())
        gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True)

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_length = sum(len(x) for x in seqs.values())
        genome_sum_di = sum(genome_di_usage.values())

        output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv')

        # 'with' guarantees the handle is released if formatting or writing fails
        with open(output_file, 'w') as fout:
            fout.write('Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean')
            for di in di_set_sorted:
                fout.write('\t' + di)
            fout.write('\n')

            # summary row for the whole genome; per-gene statistics do not apply
            fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, genome_length, genome_sum_di))
            fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0))
            for di in di_set_sorted:
                fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di))
            fout.write('\n')

            # per-gene rows with usage as a percentage of that gene's dinucleotides
            for gene_id, t2_stat in gene_ids_sorted:
                dinucleotides = gene_di_usage[gene_id]
                sum_di = sum(dinucleotides.values())
                fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di))
                fout.write('\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no'))
                fout.write('\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1]))

                for di in di_set_sorted:
                    fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di))
                fout.write('\n')
コード例 #6
0
    def run(self, scaffold_file, genome_files, tetra_file, coverage_file,
            output_file):
        """Calculate statistics for scaffolds.

        For every scaffold in `scaffold_file` a tab-separated row is written
        with its id, assigned genome (bin), GC percentage, length, optional
        per-BAM coverage values, and tetranucleotide signature. A scaffold
        not present in any genome file receives the ``self.unbinned`` label.
        Signatures (and coverage profiles, when given) are looked up by
        direct indexing on the scaffold id.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds.
        genome_files : list of str
            Fasta files with binned scaffolds.
        tetra_file : str
            Tetranucleotide signatures for scaffolds.
        coverage_file : str
            Coverage profiles for scaffolds; a falsy value omits the
            coverage columns.
        output_file : str
            Output file for scaffolds statistics.
        """

        tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        self.logger.info('Determining scaffold statistics.')

        # determine bin assignment for each scaffold
        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # Coverage column names come from the first profile; next(iter(...))
        # fetches that key without building a list of every key.
        bam_ids = []
        if cov_profiles:
            first_key = next(iter(cov_profiles))
            bam_ids = sorted(cov_profiles[first_key].keys())

        # write out scaffold statistics; the context manager closes the file
        # even if a lookup or write raises part-way through
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)
            for kmer in tetra.canonical_order():
                fout.write('\t' + kmer)
            fout.write('\n')

            for scaffold_id, seq in seq_io.read_seq(scaffold_file):
                fout.write(scaffold_id)
                fout.write('\t' +
                           scaffold_id_genome_id.get(scaffold_id, self.unbinned))
                fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
                fout.write('\t%d' % len(seq))

                # bam_ids is empty when no coverage profiles were loaded
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

                fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
                fout.write('\n')