def insert_additional_fields(self, keys=None):
    """Add the derived consensus fields to entries of `self.data`, in place.

    For every selected entry the per-item frequencies are ranked (nucleotides
    'A', 'T', 'C', 'G', 'N' when `self.engine` is 'NT'; the distinct values of
    `codon_to_AA` when it is 'AA') and three keys are written into the entry:

      - 'consensus': the most frequent item,
      - 'n2n1ratio': frequency of the second most frequent item divided by the
        frequency of the consensus, or -1 when the consensus frequency is 0,
      - 'departure_from_consensus': summed frequency of every item other than
        the consensus divided by the summed frequency of all items, or -1
        when that total is 0.

    Parameters
    ==========
    keys : sequence of entry ids, optional
        Keys of `self.data` to process. When omitted or empty, every entry in
        `self.data` is processed.

    Raises
    ======
    ValueError
        If `self.engine` is neither 'NT' nor 'AA'.
    """
    if keys is None or len(keys) == 0:
        keys = self.data.keys()

    # the items to rank are the same for every entry, so resolve them once
    # instead of rebuilding the set inside the loop
    if self.engine == 'NT':
        items = 'ATCGN'
    elif self.engine == 'AA':
        items = set(codon_to_AA.values())
    else:
        raise ValueError("insert_additional_fields: unknown engine '%s' "
                         "(expected 'NT' or 'AA')" % self.engine)

    for key in keys:
        e = self.data[key]

        # (frequency, item) tuples, most frequent first; equal frequencies are
        # ordered by the item itself, also descending
        freqs_list = sorted([(e[item], item) for item in items], reverse=True)

        frequency_of_consensus = freqs_list[0][0]
        total_frequency_of_all_but_the_consensus = sum(tpl[0] for tpl in freqs_list[1:])
        coverage = total_frequency_of_all_but_the_consensus + frequency_of_consensus

        e['consensus'] = freqs_list[0][1]

        # -1 marks ratios that cannot be computed because the divisor is 0
        if frequency_of_consensus:
            e['n2n1ratio'] = freqs_list[1][0] / frequency_of_consensus
        else:
            e['n2n1ratio'] = -1

        if coverage:
            e['departure_from_consensus'] = total_frequency_of_all_but_the_consensus / coverage
        else:
            e['departure_from_consensus'] = -1
def insert_additional_fields(self, keys=None):
    """Add the derived consensus fields to entries of `self.data`, in place.

    For every selected entry the per-item frequencies are ranked (nucleotides
    'A', 'T', 'C', 'G', 'N' when `self.engine` is 'NT'; the distinct values of
    `codon_to_AA` when it is 'AA') and three keys are written into the entry:

      - 'consensus': the most frequent item,
      - 'n2n1ratio': frequency of the second most frequent item divided by the
        frequency of the consensus, or -1 when the consensus frequency is 0,
      - 'departure_from_consensus': summed frequency of every item other than
        the consensus divided by the summed frequency of all items, or -1
        when that total is 0.

    Parameters
    ==========
    keys : sequence of entry ids, optional
        Keys of `self.data` to process. When omitted or empty, every entry in
        `self.data` is processed.

    Raises
    ======
    ValueError
        If `self.engine` is neither 'NT' nor 'AA'.
    """
    if keys is None or len(keys) == 0:
        keys = self.data.keys()

    # the items to rank are the same for every entry, so resolve them once
    # instead of rebuilding the set inside the loop
    if self.engine == 'NT':
        items = 'ATCGN'
    elif self.engine == 'AA':
        items = set(codon_to_AA.values())
    else:
        raise ValueError("insert_additional_fields: unknown engine '%s' "
                         "(expected 'NT' or 'AA')" % self.engine)

    for key in keys:
        e = self.data[key]

        # (frequency, item) tuples, most frequent first; equal frequencies are
        # ordered by the item itself, also descending
        freqs_list = sorted([(e[item], item) for item in items], reverse=True)

        frequency_of_consensus = freqs_list[0][0]
        total_frequency_of_all_but_the_consensus = sum(tpl[0] for tpl in freqs_list[1:])
        coverage = total_frequency_of_all_but_the_consensus + frequency_of_consensus

        e['consensus'] = freqs_list[0][1]

        # -1 marks ratios that cannot be computed because the divisor is 0
        if frequency_of_consensus:
            e['n2n1ratio'] = freqs_list[1][0] / frequency_of_consensus
        else:
            e['n2n1ratio'] = -1

        if coverage:
            e['departure_from_consensus'] = total_frequency_of_all_but_the_consensus / coverage
        else:
            e['departure_from_consensus'] = -1
# #################################################################################################### clusterings_table_name = 'clusterings' clusterings_table_structure = ['clustering', 'newick'] clusterings_table_types = ['str', 'str'] states_table_name = 'states' states_table_structure = ['name', 'content', 'last_modified'] states_table_types = ['text', 'text', 'text'] variable_aas_table_name = 'variable_amino_acid_frequencies' variable_aas_table_structure = [ 'entry_id', 'sample_id', 'corresponding_gene_call', 'codon_order_in_gene', 'consensus', 'departure_from_consensus', 'coverage' ] + sorted(list(set(codon_to_AA.values()))) variable_aas_table_types = [ 'numeric', 'text', 'numeric', 'numeric', 'text', 'numeric', 'numeric' ] + ['numeric'] * len(list(set(codon_to_AA.values()))) variable_nts_table_name = 'variable_nucleotide_positions' variable_nts_table_structure = [ 'entry_id', 'sample_id', 'split_name', 'pos', 'pos_in_contig', 'corresponding_gene_call', 'in_partial_gene_call', 'in_complete_gene_call', 'base_pos_in_codon', 'codon_order_in_gene', 'coverage', 'cov_outlier_in_split', 'cov_outlier_in_contig', 'departure_from_consensus', 'competing_nts', 'consensus', 'A', 'T', 'C', 'G', 'N' ] variable_nts_table_types = [ 'numeric', 'text', 'text', 'numeric', 'numeric', 'numeric', 'numeric',
####################################################################################################
#
#     TABLE DESCRIPTIONS FOR THE PROFILE DATABASE
#
####################################################################################################

# Each table is described by a `*_table_name`, a `*_table_structure` (column
# names) and a `*_table_types` list of the same length (presumably the column
# type of the entry at the same position -- confirm against the code that
# creates the tables).

clusterings_table_name = 'clusterings'
clusterings_table_structure = ['clustering', 'newick']
clusterings_table_types = ['str', 'str']

states_table_name = 'states'
states_table_structure = ['name', 'content', 'last_modified']
states_table_types = ['text', 'text', 'text']

# fixed columns followed by one numeric column per distinct value of
# `codon_to_AA`, in sorted order
variable_aas_table_name = 'variable_amino_acid_frequencies'
variable_aas_table_structure = [
    'entry_id',
    'sample_id',
    'corresponding_gene_call',
    'codon_order_in_gene',
    'reference',
    'departure_from_reference',
    'coverage',
] + sorted(set(codon_to_AA.values()))
variable_aas_table_types = [
    'numeric',
    'text',
    'numeric',
    'numeric',
    'text',
    'numeric',
    'numeric',
] + ['numeric'] * len(set(codon_to_AA.values()))

variable_nts_table_name = 'variable_nucleotide_positions'
variable_nts_table_structure = [
    'entry_id',
    'sample_id',
    'split_name',
    'pos',
    'pos_in_contig',
    'corresponding_gene_call',
    'in_partial_gene_call',
    'in_complete_gene_call',
    'base_pos_in_codon',
    'codon_order_in_gene',
    'coverage',
    'cov_outlier_in_split',
    'cov_outlier_in_contig',
    'departure_from_reference',
    'competing_nts',
    'reference',
    'A',
    'T',
    'C',
    'G',
    'N',
]
variable_nts_table_types = [
    'numeric',
    'text',
    'text',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'bool',
    'bool',
    'numeric',
    'text',
    'text',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
]

gene_coverages_table_name = 'gene_coverages'
gene_coverages_table_structure = ['entry_id', 'gene_callers_id', 'sample_id', 'mean_coverage']
gene_coverages_table_types = ['numeric', 'numeric', 'text', 'numeric']

views_table_name = 'views'
views_table_structure = ['view_id', 'target_table']
views_table_types = ['str', 'str']

# Note that the atomic data table is the only table that doesn't have a name,
# because how we use this table is a bit tricky.
def recover_base_frequencies_for_all_samples(self):
    """Add entries to `self.data` for samples that lack one at a reported position.

    Every (gene call, codon order) pair found in `self.data` is checked against
    the wanted samples (`self.samples_of_interest`, or `self.sample_ids` when
    that is empty). Each wanted sample with no entry for a pair gets a new
    entry under the next free integer key. In the new entry, every distinct
    value of `codon_to_AA` has frequency 0, except the consensus already
    recorded for that position, which is set to the coverage of the codon in
    that sample.

    Does nothing, apart from opening and closing the progress display, when
    `self.data` is empty.
    """
    self.progress.new('Recovering AA frequencies for all')

    # without any entry there is no position to recover, and max() below
    # would raise on an empty sequence
    if not self.data:
        self.progress.end()
        return

    samples_wanted = self.samples_of_interest if self.samples_of_interest else self.sample_ids
    splits_wanted = self.splits_of_interest if self.splits_of_interest else set(self.splits_basic_info.keys())
    next_available_entry_id = max(self.data.keys()) + 1

    unique_pos_identifier_str_to_consenus_codon = {}
    unique_pos_identifier_str_to_unique_pos_identifier = {}
    for e in self.data.values():
        upi = e['unique_pos_identifier_str']
        unique_pos_identifier_str_to_consenus_codon[upi] = e['consensus']
        unique_pos_identifier_str_to_unique_pos_identifier[upi] = e['unique_pos_identifier']

    self.progress.update('creating a dict to track missing AA frequencies for each sample / split / pos')
    splits_to_consider = {}
    for split_name in splits_wanted:
        splits_to_consider[split_name] = {}

    self.progress.update('populating the dict to track missing AA frequencies for each sample / split / pos')
    for v in self.data.values():
        gene_codon_key = '%d_%d' % (v['corresponding_gene_call'], v['codon_order_in_gene'])
        d = splits_to_consider[v['split_name']]

        # start from all wanted samples, then cross off each sample that
        # already has an entry; what remains is the list of missing samples
        if gene_codon_key not in d:
            d[gene_codon_key] = copy.deepcopy(samples_wanted)
        d[gene_codon_key].remove(v['sample_id'])

    # identical for every new entry, so build it once
    all_items = set(codon_to_AA.values())

    counter = 0
    for split_name in splits_to_consider:
        counter += 1
        self.progress.update('accessing split coverages and updating variable positions dict :: %s' % pp(counter))

        split_coverage_across_samples = self.merged_split_coverage_values.get(split_name)

        split_info = self.splits_basic_info[split_name]
        contig_name = split_info['parent']

        for gene_codon_key, missing_samples in splits_to_consider[split_name].items():
            if not missing_samples:
                continue

            corresponding_gene_call, codon_order_in_gene = [int(k) for k in gene_codon_key.split('_')]

            unique_pos_identifier_str = '_'.join([split_name, str(corresponding_gene_call), str(codon_order_in_gene)])
            consensus_codon = unique_pos_identifier_str_to_consenus_codon[unique_pos_identifier_str]
            unique_pos_identifier = unique_pos_identifier_str_to_unique_pos_identifier[unique_pos_identifier_str]

            # DEALING WITH COVERAGE ##################################################################
            # we want the coverage of this codon in each missing sample, but all we
            # have is the gene call id and the order of the codon in the gene. none of
            # the steps below depend on the sample, so this (expensive) work is done
            # once per codon rather than once per sample:
            #
            # learn the gene call
            gene_call = self.genes_in_contigs_dict[corresponding_gene_call]

            # the following dict converts codon orders into nt positions in contig for a given gene call
            codon_order_to_nt_positions_in_contig = utils.get_codon_order_to_nt_positions_dict(gene_call)

            # so the nucleotide positions for this codon in the contig are the following:
            nt_positions_for_codon_in_contig = codon_order_to_nt_positions_in_contig[codon_order_in_gene]

            # map nt positions from the contig context to the split context using the
            # start position of the split
            split_start = split_info['start']
            nt_positions_for_codon_in_split = [p - split_start for p in nt_positions_for_codon_in_contig]

            for sample_name in missing_samples:
                # coverages that match these positions; the sum is divided by 3
                # (assumes three nt positions per codon)
                coverages = split_coverage_across_samples[sample_name][nt_positions_for_codon_in_split]
                coverage = int(round(sum(coverages) / 3))

                entry = {'unique_pos_identifier_str': unique_pos_identifier_str,
                         'unique_pos_identifier': unique_pos_identifier,
                         'sample_id': sample_name,
                         'split_name': split_name,
                         'contig_name': contig_name,
                         'corresponding_gene_call': corresponding_gene_call,
                         'codon_order_in_gene': codon_order_in_gene,
                         'departure_from_consensus': 0,
                         'coverage': coverage,
                         'consensus': consensus_codon}

                # DEALING WITH AAs ##################################################################
                # every item gets a frequency of 0 in this sample ...
                for item in all_items:
                    entry[item] = 0

                # ... except the consensus, which gets the coverage (WHICH IS VERY BAD, WE HAVE
                # NO CLUE WHAT IS THE ACTUAL COVERAGE OF TRIPLICATE LINKMERS)
                entry[consensus_codon] = coverage

                self.data[next_available_entry_id] = entry
                next_available_entry_id += 1

    self.progress.end()
def recover_base_frequencies_for_all_samples(self):
    """Add entries to `self.data` for samples that lack one at a reported position.

    Every (gene call, codon order) pair found in `self.data` is checked against
    the wanted samples (`self.samples_of_interest`, or `self.sample_ids` when
    that is empty). Each wanted sample with no entry for a pair gets a new
    entry under the next free integer key. In the new entry, every distinct
    value of `codon_to_AA` has frequency 0, except the reference already
    recorded for that position, which is set to the coverage of the codon in
    that sample. `self.insert_additional_fields` is then called for the new
    entry.

    Does nothing, apart from opening and closing the progress display, when
    `self.data` is empty.
    """
    self.progress.new('Recovering AA frequencies for all')

    # without any entry there is no position to recover, and max() below
    # would raise on an empty sequence
    if not self.data:
        self.progress.end()
        return

    samples_wanted = self.samples_of_interest if self.samples_of_interest else self.sample_ids
    splits_wanted = self.splits_of_interest if self.splits_of_interest else set(self.splits_basic_info.keys())
    next_available_entry_id = max(self.data.keys()) + 1

    unique_pos_identifier_str_to_consenus_codon = {}
    unique_pos_identifier_str_to_unique_pos_identifier = {}
    for e in self.data.values():
        upi = e['unique_pos_identifier_str']
        unique_pos_identifier_str_to_consenus_codon[upi] = e['reference']
        unique_pos_identifier_str_to_unique_pos_identifier[upi] = e['unique_pos_identifier']

    self.progress.update('creating a dict to track missing AA frequencies for each sample / split / pos')
    splits_to_consider = {}
    for split_name in splits_wanted:
        splits_to_consider[split_name] = {}

    self.progress.update('populating the dict to track missing AA frequencies for each sample / split / pos')
    for v in self.data.values():
        gene_codon_key = '%d_%d' % (v['corresponding_gene_call'], v['codon_order_in_gene'])
        d = splits_to_consider[v['split_name']]

        # start from all wanted samples, then cross off each sample that
        # already has an entry; what remains is the list of missing samples
        if gene_codon_key not in d:
            d[gene_codon_key] = copy.deepcopy(samples_wanted)
        d[gene_codon_key].remove(v['sample_id'])

    # identical for every new entry, so build it once
    all_items = set(codon_to_AA.values())

    counter = 0
    for split_name in splits_to_consider:
        counter += 1
        self.progress.update('accessing split coverages and updating variable positions dict :: %s' % pp(counter))

        split_coverage_across_samples = self.merged_split_coverage_values.get(split_name)

        split_info = self.splits_basic_info[split_name]
        contig_name = split_info['parent']

        for gene_codon_key, missing_samples in splits_to_consider[split_name].items():
            if not missing_samples:
                continue

            corresponding_gene_call, codon_order_in_gene = [int(k) for k in gene_codon_key.split('_')]

            unique_pos_identifier_str = '_'.join([split_name, str(corresponding_gene_call), str(codon_order_in_gene)])
            reference_codon = unique_pos_identifier_str_to_consenus_codon[unique_pos_identifier_str]
            unique_pos_identifier = unique_pos_identifier_str_to_unique_pos_identifier[unique_pos_identifier_str]

            # DEALING WITH COVERAGE ##################################################################
            # we want the coverage of this codon in each missing sample, but all we
            # have is the gene call id and the order of the codon in the gene. none of
            # the steps below depend on the sample, so this (expensive) work is done
            # once per codon rather than once per sample:
            #
            # learn the gene call
            gene_call = self.genes_in_contigs_dict[corresponding_gene_call]

            # the following dict converts codon orders into nt positions in contig for a given gene call
            codon_order_to_nt_positions_in_contig = utils.get_codon_order_to_nt_positions_dict(gene_call)

            # so the nucleotide positions for this codon in the contig are the following:
            nt_positions_for_codon_in_contig = codon_order_to_nt_positions_in_contig[codon_order_in_gene]

            # map nt positions from the contig context to the split context using the
            # start position of the split
            split_start = split_info['start']
            nt_positions_for_codon_in_split = [p - split_start for p in nt_positions_for_codon_in_contig]

            for sample_name in missing_samples:
                # coverages that match these positions; the sum is divided by 3
                # (assumes three nt positions per codon)
                coverages = split_coverage_across_samples[sample_name][nt_positions_for_codon_in_split]
                coverage = int(round(sum(coverages) / 3))

                entry = {'unique_pos_identifier_str': unique_pos_identifier_str,
                         'unique_pos_identifier': unique_pos_identifier,
                         'sample_id': sample_name,
                         'split_name': split_name,
                         'contig_name': contig_name,
                         'corresponding_gene_call': corresponding_gene_call,
                         'codon_order_in_gene': codon_order_in_gene,
                         'departure_from_reference': 0,
                         'coverage': coverage,
                         'reference': reference_codon}

                # DEALING WITH AAs ##################################################################
                # every item gets a frequency of 0 in this sample ...
                for item in all_items:
                    entry[item] = 0

                # ... except the reference, which gets the coverage (WHICH IS VERY BAD, WE HAVE
                # NO CLUE WHAT IS THE ACTUAL COVERAGE OF TRIPLICATE LINKMERS)
                entry[reference_codon] = coverage

                self.data[next_available_entry_id] = entry

                # insert additional fields for this newly added data point
                self.insert_additional_fields([next_available_entry_id])

                next_available_entry_id += 1

    self.progress.end()
####################################################################################################
#
#     TABLE DESCRIPTIONS FOR THE PROFILE DATABASE
#
####################################################################################################

# Each table is described by a `*_table_name`, a `*_table_structure` (column
# names) and a `*_table_types` list of the same length (presumably the column
# type of the entry at the same position -- confirm against the code that
# creates the tables).

clusterings_table_name = 'clusterings'
clusterings_table_structure = ['clustering', 'newick']
clusterings_table_types = ['str', 'str']

states_table_name = 'states'
states_table_structure = ['name', 'content', 'last_modified']
states_table_types = ['text', 'text', 'text']

# fixed columns followed by one numeric column per distinct value of
# `codon_to_AA`, in sorted order
variable_aas_table_name = 'variable_amino_acid_frequencies'
variable_aas_table_structure = [
    'entry_id',
    'sample_id',
    'corresponding_gene_call',
    'codon_order_in_gene',
    'reference',
    'departure_from_reference',
    'coverage',
] + sorted(set(codon_to_AA.values()))
variable_aas_table_types = [
    'numeric',
    'text',
    'numeric',
    'numeric',
    'text',
    'numeric',
    'numeric',
] + ['numeric'] * len(set(codon_to_AA.values()))

variable_nts_table_name = 'variable_nucleotide_positions'
variable_nts_table_structure = [
    'entry_id',
    'sample_id',
    'split_name',
    'pos',
    'pos_in_contig',
    'corresponding_gene_call',
    'in_partial_gene_call',
    'in_complete_gene_call',
    'base_pos_in_codon',
    'codon_order_in_gene',
    'coverage',
    'cov_outlier_in_split',
    'cov_outlier_in_contig',
    'departure_from_reference',
    'competing_nts',
    'reference',
    'A',
    'T',
    'C',
    'G',
    'N',
]
variable_nts_table_types = [
    'numeric',
    'text',
    'text',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'bool',
    'bool',
    'numeric',
    'text',
    'text',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
    'numeric',
]

views_table_name = 'views'
views_table_structure = ['view_id', 'target_table']
views_table_types = ['str', 'str']

# Note that the atomic data table is the only table that doesn't have a name,
# because how we use this table is a bit tricky. For single profiles, the
# contents of this table are stored as "atomic data"; for merged profiles,
# however, each column of the atomic data table becomes its own table, where
# the row names remain identical, yet the columns become sample names.
atomic_data_table_structure = [
    'contig',
    'std_coverage',
    'mean_coverage',
    'mean_coverage_Q2Q3',
    'max_normalized_ratio',
    'relative_abundance',
    'detection',
    'abundance',
    'variability',
    '__parent__',
]