def codon_usage(self, seqs, genome_id): """Calculate codon usage statistics for genome. Parameters ---------- seqs : dict[seq_id] -> seq Sequences indexed by sequence id. genome_id : str Unique id of genome used to create output file. """ # calculate codon usage for each gene and the genome as a while gene_codon_usage = defaultdict(lambda: defaultdict(int)) genome_codon_usage = defaultdict(int) gc = {} for gene_id, seq in seqs.iteritems(): gc[gene_id] = seq_tk.gc(seq) for i in xrange(0, len(seq) - 3, 3): codon = seq[i:i + 3].upper() if 'N' not in codon: gene_codon_usage[gene_id][codon] += 1 genome_codon_usage[codon] += 1 # calculate Manhattan distance for each gene manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage) # report dinucleotide usage of each gene codon_set_sorted = sorted(genome_codon_usage.keys()) gene_ids_sorted = sorted(manhattan_dist.items(), key=operator.itemgetter(1), reverse=True) output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv') fout = open(output_file, 'w') fout.write('Gene Id\tGC\tLength (bp)\t# codons\tManhattan distance\tDeviations from mean') for di in codon_set_sorted: fout.write('\t' + di) fout.write('\n') genome_gc = seq_tk.gc(''.join(seqs.values())) genome_sum_codon = sum(genome_codon_usage.values()) fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_codon)) fout.write('\t%.1f\t%.1f' % (0, 0)) for di in codon_set_sorted: fout.write('\t%.2f' % (genome_codon_usage.get(di, 0) * 100.0 / genome_sum_codon)) fout.write('\n') for gene_id, dists in gene_ids_sorted: codons = gene_codon_usage[gene_id] sum_codons = sum(codons.values()) fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_codons)) fout.write('\t%.2f\t%.2f' % (dists[0], dists[1])) for di in codon_set_sorted: fout.write('\t%.2f' % (codons.get(di, 0) * 100.0 / sum_codons)) fout.write('\n') fout.close()
def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file): """Calculate statistics for scaffolds. Parameters ---------- scaffold_file : str Fasta file containing scaffolds. genome_files : list of str Fasta files with binned scaffolds. tetra_file : str Tetranucleotide signatures for scaffolds. coverage_file : str Coverage profiles for scaffolds output_file : str Output file for scaffolds statistics. """ tetra = Tetranucleotide(self.cpus) signatures = tetra.read(tetra_file) cov_profiles = None if coverage_file: coverage = Coverage(self.cpus) cov_profiles, _ = coverage.read(coverage_file) # determine bin assignment for each scaffold self.logger.info('') self.logger.info(' Determining scaffold statistics.') scaffold_id_genome_id = {} for gf in genome_files: genome_id = remove_extension(gf) for scaffold_id, _seq in seq_io.read_seq(gf): scaffold_id_genome_id[scaffold_id] = genome_id # write out scaffold statistics fout = open(output_file, 'w') fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)') if cov_profiles: bam_ids = sorted(cov_profiles[cov_profiles.keys()[0]].keys()) for bam_id in bam_ids: fout.write('\t' + bam_id) for kmer in tetra.canonical_order(): fout.write('\t' + kmer) fout.write('\n') for scaffold_id, seq in seq_io.read_seq(scaffold_file): fout.write(scaffold_id) fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned)) fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0)) fout.write('\t%d' % len(seq)) if cov_profiles: for bam_id in bam_ids: fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id]) fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id]))) fout.write('\n') fout.close()
def dinucleotide_usage(self, seqs, genome_id): """Calculate dinucleotide (n3:n1) usage statistics for genome. Parameters ---------- seqs : dict[seq_id] -> seq Sequences indexed by sequence id. genome_id : str Unique id of genome used to create output file. """ # calculate dinucleotide usage for each gene and the genome as a while gene_di_usage = defaultdict(lambda: defaultdict(int)) gene_n1 = defaultdict(lambda: defaultdict(int)) gene_n3 = defaultdict(lambda: defaultdict(int)) genome_di_usage = defaultdict(int) genome_n1 = defaultdict(int) genome_n3 = defaultdict(int) gc = {} for gene_id, seq in seqs.items(): gc[gene_id] = seq_tk.gc(seq) for i in range(2, len(seq) - 2, 3): dinucleotide = seq[i:i + 2].upper() if 'N' not in dinucleotide: gene_di_usage[gene_id][dinucleotide] += 1 gene_n3[gene_id][dinucleotide[0]] += 1 gene_n1[gene_id][dinucleotide[1]] += 1 genome_di_usage[dinucleotide] += 1 genome_n3[dinucleotide[0]] += 1 genome_n1[dinucleotide[1]] += 1 # calculate Manhattan distance for each gene manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage) # identify deviant genes under Hotelling T-squared statistic t2_stats = self._hotelling_statistic(gene_di_usage, gene_n3, gene_n1, genome_di_usage, genome_n3, genome_n1) # correction from T-squared distribution to F-distribution approximation # http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution num_genes = len(gene_di_usage) f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes) deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction # report dinucleotide usage of each gene di_set_sorted = sorted(genome_di_usage.keys()) gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True) output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv') fout = open(output_file, 'w') fout.write( 'Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean' ) for di in di_set_sorted: fout.write('\t' + di) fout.write('\n') genome_gc = seq_tk.gc(''.join(seqs.values())) genome_sum_di = sum(genome_di_usage.values()) fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_di)) fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0)) for di in di_set_sorted: fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di)) fout.write('\n') for gene_id, t2_stat in gene_ids_sorted: dinucleotides = gene_di_usage[gene_id] sum_di = sum(dinucleotides.values()) fout.write( '%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di)) fout.write( '\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no')) fout.write( '\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1])) for di in di_set_sorted: fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di)) fout.write('\n') fout.close()
def dinucleotide_usage(self, seqs, genome_id): """Calculate dinucleotide (n3:n1) usage statistics for genome. Parameters ---------- seqs : dict[seq_id] -> seq Sequences indexed by sequence id. genome_id : str Unique id of genome used to create output file. """ # calculate dinucleotide usage for each gene and the genome as a while gene_di_usage = defaultdict(lambda: defaultdict(int)) gene_n1 = defaultdict(lambda: defaultdict(int)) gene_n3 = defaultdict(lambda: defaultdict(int)) genome_di_usage = defaultdict(int) genome_n1 = defaultdict(int) genome_n3 = defaultdict(int) gc = {} for gene_id, seq in seqs.items(): gc[gene_id] = seq_tk.gc(seq) for i in xrange(2, len(seq) - 2, 3): dinucleotide = seq[i:i + 2].upper() if 'N' not in dinucleotide: gene_di_usage[gene_id][dinucleotide] += 1 gene_n3[gene_id][dinucleotide[0]] += 1 gene_n1[gene_id][dinucleotide[1]] += 1 genome_di_usage[dinucleotide] += 1 genome_n3[dinucleotide[0]] += 1 genome_n1[dinucleotide[1]] += 1 # calculate Manhattan distance for each gene manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage) # identify deviant genes under Hotelling T-squared statistic t2_stats = self._hotelling_statistic(gene_di_usage, gene_n3, gene_n1, genome_di_usage, genome_n3, genome_n1) # correction from T-squared distribution to F-distribution approximation # http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution num_genes = len(gene_di_usage) f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes) deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction # report dinucleotide usage of each gene di_set_sorted = sorted(genome_di_usage.keys()) gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True) output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv') fout = open(output_file, 'w') fout.write('Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean') for di in di_set_sorted: fout.write('\t' + di) fout.write('\n') genome_gc = seq_tk.gc(''.join(seqs.values())) genome_sum_di = sum(genome_di_usage.values()) fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_di)) fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0)) for di in di_set_sorted: fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di)) fout.write('\n') for gene_id, t2_stat in gene_ids_sorted: dinucleotides = gene_di_usage[gene_id] sum_di = sum(dinucleotides.values()) fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di)) fout.write('\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no')) fout.write('\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1])) for di in di_set_sorted: fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di)) fout.write('\n') fout.close()
def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file): """Calculate statistics for scaffolds. Parameters ---------- scaffold_file : str Fasta file containing scaffolds. genome_files : list of str Fasta files with binned scaffolds. tetra_file : str Tetranucleotide signatures for scaffolds. coverage_file : str Coverage profiles for scaffolds output_file : str Output file for scaffolds statistics. """ tetra = Tetranucleotide(self.cpus) signatures = tetra.read(tetra_file) cov_profiles = None if coverage_file: coverage = Coverage(self.cpus) cov_profiles, _ = coverage.read(coverage_file) # determine bin assignment for each scaffold self.logger.info('Determining scaffold statistics.') scaffold_id_genome_id = {} for gf in genome_files: genome_id = remove_extension(gf) for scaffold_id, _seq in seq_io.read_seq(gf): scaffold_id_genome_id[scaffold_id] = genome_id # write out scaffold statistics fout = open(output_file, 'w') fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)') if cov_profiles: first_key = list(cov_profiles.keys())[0] bam_ids = sorted(cov_profiles[first_key].keys()) for bam_id in bam_ids: fout.write('\t' + bam_id) for kmer in tetra.canonical_order(): fout.write('\t' + kmer) fout.write('\n') for scaffold_id, seq in seq_io.read_seq(scaffold_file): fout.write(scaffold_id) fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned)) fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0)) fout.write('\t%d' % len(seq)) if cov_profiles: for bam_id in bam_ids: fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id]) fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id]))) fout.write('\n') fout.close()