def generate_variabile_codons_table(self):
    """Profile the collected codons and store them as codon-frequency rows.

    Does nothing when ``self.skip_SNV_profiling`` is set or
    ``self.profile_SCVs`` is not set.

    Otherwise, the ``(gene_callers_id, codon_order)`` pairs found in
    ``self.codons_in_genes_to_profile_SCVs`` are grouped by gene, each gene
    is handed to ``bamops.CodonFrequencies.process_gene_call`` together with
    ``self.bam`` and the sequence of the gene's contig, and one entry per
    reported codon is appended to a ``TableForCodonFrequencies`` that is
    stored at the end.

    Side effects:
        * ``self.codons_in_genes_to_profile_SCVs`` is emptied.
        * ``self.run.warning`` is called when the codon profiler has
          anything in its ``not_reported_items``.
    """
    if self.skip_SNV_profiling or not self.profile_SCVs:
        return

    variable_codons_table = TableForCodonFrequencies(self.profile_db_path,
                                                     progress=self.progress)
    codon_frequencies = bamops.CodonFrequencies()

    # gene_callers_id -> set of codon orders to profile in that gene
    codons_by_gene = {}
    for gene_callers_id, codon_order in self.codons_in_genes_to_profile_SCVs:
        codons_by_gene.setdefault(gene_callers_id, set()).add(codon_order)

    for gene_callers_id, codons_to_profile in codons_by_gene.items():
        gene_call = self.genes_in_contigs_dict[gene_callers_id]
        contig_name = gene_call['contig']

        codon_frequencies_dict = codon_frequencies.process_gene_call(
            self.bam,
            gene_call,
            self.contig_sequences[contig_name]['sequence'],
            codons_to_profile)

        for codon_order, e in codon_frequencies_dict.items():
            db_entry = {'sample_id': self.sample_id,
                        'corresponding_gene_call': gene_callers_id}
            db_entry['reference'] = e['reference']
            db_entry['coverage'] = e['coverage']
            db_entry['departure_from_reference'] = e['departure_from_reference']
            db_entry['codon_order_in_gene'] = codon_order

            # one value per codon known to `constants.codon_to_AA`
            for codon in constants.codon_to_AA:
                db_entry[codon] = e['frequencies'][codon]

            variable_codons_table.append(db_entry)

    variable_codons_table.store()

    # clear contents of set
    self.codons_in_genes_to_profile_SCVs.clear()

    items = codon_frequencies.not_reported_items
    if items:
        # NOTE(review): the element type of `not_reported_items` is not visible
        # from here; str() keeps the join from raising if they are not strings.
        self.run.warning(
            "The profiler of single-codon variants failed to report anything for a "
            "total of %d items, because they looked weird to anvi'o :( Here is a list "
            "of those that ended up being ignored: '%s'."
            % (len(items), ', '.join(str(item) for item in items)))
def profile_contig_worker(self, available_index_queue, output_queue):
    """Worker loop that profiles contigs pulled from a queue.

    Opens ``self.input_file_path`` with ``pysam.Samfile`` and then, forever:

    1. blocks on ``available_index_queue`` for the index of the next contig,
    2. builds a ``contigops.Contig`` with one ``contigops.Split`` per split
       listed in ``self.contig_name_to_splits``,
    3. runs the coverage analysis; when the contig's mean coverage is below
       ``self.min_mean_coverage`` it puts ``None`` on ``output_queue`` and
       moves on to the next index,
    4. unless ``self.skip_SNV_profiling`` is set, runs the auxiliary
       analysis, annotates every column profile with gene-call information,
       and fills ``contig.codon_frequencies_dict`` for the codons that fall
       in exactly one complete gene call,
    5. puts the contig on ``output_queue`` and drops the bulky attributes.

    The loop has no exit condition of its own; per the original comment in
    this code, the worker is killed by its parent. The ``try/finally``
    guarantees the BAM handle is closed if an exception escapes the loop.

    Args:
        available_index_queue: queue-like object; ``get(True)`` must return
            an index into ``self.contig_names`` / ``self.contig_lengths``.
        output_queue: queue-like object that receives either a profiled
            contig or ``None`` for contigs that fail the coverage cutoff.
    """
    bam_file = pysam.Samfile(self.input_file_path, 'rb')

    try:
        while True:
            index = available_index_queue.get(True)
            contig_name = self.contig_names[index]

            contig = contigops.Contig(contig_name)
            contig.length = self.contig_lengths[index]
            contig.split_length = self.a_meta['split_length']
            contig.min_coverage_for_variability = self.min_coverage_for_variability
            contig.skip_SNV_profiling = self.skip_SNV_profiling
            contig.report_variability_full = self.report_variability_full
            contig.ignore_orphans = not self.include_orphans
            contig.max_coverage_depth = self.max_coverage_depth

            # populate contig with empty split objects
            for split_name in self.contig_name_to_splits[contig_name]:
                s = self.splits_basic_info[split_name]
                split_sequence = self.contig_sequences[contig_name]['sequence'][s['start']:s['end']]
                split = contigops.Split(split_name,
                                        split_sequence,
                                        contig_name,
                                        s['order_in_parent'],
                                        s['start'],
                                        s['end'])
                contig.splits.append(split)

            # analyze coverage for each split
            contig.analyze_coverage(bam_file)

            # test the mean coverage of the contig.
            if contig.coverage.mean < self.min_mean_coverage:
                output_queue.put(None)
                continue

            if not self.skip_SNV_profiling:
                contig.analyze_auxiliary(bam_file)

                # gene_callers_id -> set of codon orders to profile in that gene
                codons_by_gene = {}

                for split in contig.splits:
                    for column_profile in split.column_profiles.values():
                        pos_in_contig = column_profile['pos_in_contig']

                        (column_profile['in_partial_gene_call'],
                         column_profile['in_complete_gene_call'],
                         column_profile['base_pos_in_codon']) = \
                            self.get_nt_position_info(contig.name, pos_in_contig)

                        column_profile['sample_id'] = self.sample_id

                        # -1 means there is no gene call that corresponds to this
                        # nt position, which will be updated in the following lines.
                        # yeah, we use '-1', because genecaller ids start from 0 :/
                        column_profile['corresponding_gene_call'] = -1
                        column_profile['codon_order_in_gene'] = -1

                        # if this particular position (`pos_in_contig`) falls within a
                        # COMPLETE gene call, we would like to find out which unique
                        # gene caller id(s) match to this position.
                        if not column_profile['in_complete_gene_call']:
                            continue

                        corresponding_gene_caller_ids = \
                            self.get_corresponding_gene_caller_ids_for_base_position(contig.name, pos_in_contig)

                        # if there are more than one corresponding gene call, this
                        # usually indicates an assembly error. just to be on the safe
                        # side, we will not report a corresponding unique gene callers
                        # id for this position
                        if len(corresponding_gene_caller_ids) != 1:
                            continue

                        # if we are here, this nucleotide position is in a single
                        # complete gene call. we store the gene_callers_id that
                        # corresponds to this nt position, and the order of the
                        # corresponding codon in the gene.
                        gene_callers_id = corresponding_gene_caller_ids[0]
                        codon_order = self.get_corresponding_codon_order_in_gene(
                            gene_callers_id, contig.name, pos_in_contig)

                        column_profile['corresponding_gene_call'] = gene_callers_id
                        column_profile['codon_order_in_gene'] = codon_order

                        # save this information for later use; the set removes
                        # duplicates when several positions hit the same codon
                        codons_by_gene.setdefault(gene_callers_id, set()).add(codon_order)

                codon_frequencies = bamops.CodonFrequencies()

                for gene_callers_id, codons_to_profile in codons_by_gene.items():
                    gene_call = self.genes_in_contigs_dict[gene_callers_id]
                    gene_contig_sequence = self.contig_sequences[gene_call['contig']]['sequence']

                    contig.codon_frequencies_dict[gene_callers_id] = codon_frequencies.process_gene_call(
                        bam_file, gene_call, gene_contig_sequence, codons_to_profile)

            output_queue.put(contig)

            # drop the large per-split / per-contig data before blocking on the
            # next index
            for split in contig.splits:
                del split.coverage
                del split.auxiliary
                del split
            del contig.splits[:]
            del contig.coverage
            del contig
    finally:
        # the loop above never finishes on its own (the worker is killed by
        # its parent), so this only runs when an exception escapes the loop
        bam_file.close()