Beispiel #1
0
    def insert_additional_fields(self, keys=None):
        """Annotate entries of `self.data` with consensus-related fields.

        For every selected entry the per-item frequencies are ranked from the
        most to the least frequent and three fields are written back into the
        entry:

        - 'consensus': the most frequent item (ties are resolved in favour of
          the item that sorts last, since (frequency, item) tuples are sorted
          in reverse).
        - 'n2n1ratio': frequency of the second most frequent item divided by
          the frequency of the consensus, or -1 if the consensus frequency is
          zero.
        - 'departure_from_consensus': summed frequency of every item except
          the consensus divided by the summed frequency of all items, or -1
          if that total is zero.

        The items are the characters of 'ATCGN' when `self.engine` is 'NT',
        and the distinct values of `codon_to_AA` when it is 'AA'.

        Args:
            keys: keys of `self.data` to process. `None` or an empty
                collection means "process every entry".

        Raises:
            ValueError: if there are entries to process and `self.engine` is
                neither 'NT' nor 'AA'.
        """
        # `None` replaces the former mutable `[]` default; an explicitly passed
        # empty collection keeps its historical meaning of "all entries".
        if keys is None or not len(keys):
            keys = self.data.keys()

        # nothing to annotate, so the engine is never consulted (as before)
        if not len(keys):
            return

        # the set of items depends only on the engine, so resolve it once
        # rather than once per entry
        if self.engine == 'NT':
            items = 'ATCGN'
        elif self.engine == 'AA':
            items = set(codon_to_AA.values())
        else:
            raise ValueError("insert_additional_fields: unknown engine %r (expected 'NT' or 'AA')" % (self.engine,))

        for key in keys:
            e = self.data[key]

            freqs_list = sorted([(e[item], item) for item in items], reverse=True)
            frequency_of_consensus = freqs_list[0][0]

            e['n2n1ratio'] = freqs_list[1][0] / frequency_of_consensus if frequency_of_consensus else -1
            e['consensus'] = freqs_list[0][1]

            total_frequency_of_all_but_the_consensus = sum([tpl[0] for tpl in freqs_list[1:]])
            coverage = total_frequency_of_all_but_the_consensus + frequency_of_consensus
            e['departure_from_consensus'] = total_frequency_of_all_but_the_consensus / coverage if coverage else -1
Beispiel #2
0
    def insert_additional_fields(self, keys=None):
        """Add 'consensus', 'n2n1ratio' and 'departure_from_consensus' to entries of `self.data`.

        Each selected entry has its item frequencies sorted in descending
        order as (frequency, item) tuples. From that ranking:

        - 'consensus' is the item ranked first (on equal frequencies the item
          that compares greatest wins, because the tuples are reverse-sorted).
        - 'n2n1ratio' is the second-ranked frequency divided by the consensus
          frequency; -1 when the consensus frequency is zero.
        - 'departure_from_consensus' is the sum of all non-consensus
          frequencies divided by the sum of all frequencies; -1 when that sum
          is zero.

        With `self.engine == 'NT'` the items are the characters of 'ATCGN';
        with `self.engine == 'AA'` they are the distinct values of
        `codon_to_AA`.

        Args:
            keys: keys of `self.data` to annotate; `None` or an empty
                collection selects every entry.

        Raises:
            ValueError: if at least one entry is selected and `self.engine`
                is neither 'NT' nor 'AA'.
        """
        # default is `None` instead of a shared mutable list; passing an empty
        # collection still means "everything", exactly as before
        if keys is None or not len(keys):
            keys = self.data.keys()

        # no entries at all: return without ever looking at the engine
        if not len(keys):
            return

        # engine-dependent item set is loop-invariant, so build it only once
        if self.engine == 'NT':
            items = 'ATCGN'
        elif self.engine == 'AA':
            items = set(codon_to_AA.values())
        else:
            raise ValueError("insert_additional_fields: unknown engine %r (expected 'NT' or 'AA')" % (self.engine,))

        for key in keys:
            e = self.data[key]

            freqs_list = sorted([(e[item], item) for item in items],
                                reverse=True)
            frequency_of_consensus = freqs_list[0][0]

            e['n2n1ratio'] = freqs_list[1][0] / frequency_of_consensus if frequency_of_consensus else -1
            e['consensus'] = freqs_list[0][1]

            total_frequency_of_all_but_the_consensus = sum(
                [tpl[0] for tpl in freqs_list[1:]])
            coverage = total_frequency_of_all_but_the_consensus + frequency_of_consensus
            e['departure_from_consensus'] = total_frequency_of_all_but_the_consensus / coverage if coverage else -1
Beispiel #3
0
#
####################################################################################################

# Tables with a fixed set of columns are written as (column name, column type)
# pairs and unzipped into the parallel `*_table_structure` / `*_table_types`
# lists, so a column and its type always sit on the same line.

clusterings_table_name = 'clusterings'
clusterings_table_structure, clusterings_table_types = map(list, zip(*[
    ('clustering', 'str'),
    ('newick', 'str'),
]))

states_table_name = 'states'
states_table_structure, states_table_types = map(list, zip(*[
    ('name', 'text'),
    ('content', 'text'),
    ('last_modified', 'text'),
]))

variable_aas_table_name = 'variable_amino_acid_frequencies'
variable_aas_table_structure, variable_aas_table_types = map(list, zip(*[
    ('entry_id', 'numeric'),
    ('sample_id', 'text'),
    ('corresponding_gene_call', 'numeric'),
    ('codon_order_in_gene', 'numeric'),
    ('consensus', 'text'),
    ('departure_from_consensus', 'numeric'),
    ('coverage', 'numeric'),
]))
# followed by one numeric column per distinct value of codon_to_AA, in sorted order
variable_aas_table_structure += sorted(set(codon_to_AA.values()))
variable_aas_table_types += ['numeric'] * len(set(codon_to_AA.values()))

variable_nts_table_name = 'variable_nucleotide_positions'
variable_nts_table_structure = [
    'entry_id',
    'sample_id',
    'split_name',
    'pos',
    'pos_in_contig',
    'corresponding_gene_call',
    'in_partial_gene_call',
    'in_complete_gene_call',
    'base_pos_in_codon',
    'codon_order_in_gene',
    'coverage',
    'cov_outlier_in_split',
    'cov_outlier_in_contig',
    'departure_from_consensus',
    'competing_nts',
    'consensus',
    # per-nucleotide columns
    'A',
    'T',
    'C',
    'G',
    'N',
]
variable_nts_table_types = [
    'numeric', 'text', 'text', 'numeric', 'numeric', 'numeric', 'numeric',
Beispiel #4
0
####################################################################################################
#
#     TABLE DESCRIPTIONS FOR THE PROFILE DATABASE
#
####################################################################################################

# Every table below is declared as (column name, column type) pairs; the pairs
# are unzipped into the parallel `*_table_structure` / `*_table_types` lists.

clusterings_table_name = 'clusterings'
clusterings_table_structure, clusterings_table_types = map(list, zip(*[
    ('clustering', 'str'),
    ('newick', 'str'),
]))

states_table_name = 'states'
states_table_structure, states_table_types = map(list, zip(*[
    ('name', 'text'),
    ('content', 'text'),
    ('last_modified', 'text'),
]))

variable_aas_table_name = 'variable_amino_acid_frequencies'
variable_aas_table_structure, variable_aas_table_types = map(list, zip(*[
    ('entry_id', 'numeric'),
    ('sample_id', 'text'),
    ('corresponding_gene_call', 'numeric'),
    ('codon_order_in_gene', 'numeric'),
    ('reference', 'text'),
    ('departure_from_reference', 'numeric'),
    ('coverage', 'numeric'),
]))
# followed by one numeric column per distinct value of codon_to_AA, in sorted order
variable_aas_table_structure += sorted(set(codon_to_AA.values()))
variable_aas_table_types += ['numeric'] * len(set(codon_to_AA.values()))

variable_nts_table_name = 'variable_nucleotide_positions'
variable_nts_table_structure, variable_nts_table_types = map(list, zip(*[
    ('entry_id', 'numeric'),
    ('sample_id', 'text'),
    ('split_name', 'text'),
    ('pos', 'numeric'),
    ('pos_in_contig', 'numeric'),
    ('corresponding_gene_call', 'numeric'),
    ('in_partial_gene_call', 'numeric'),
    ('in_complete_gene_call', 'numeric'),
    ('base_pos_in_codon', 'numeric'),
    ('codon_order_in_gene', 'numeric'),
    ('coverage', 'numeric'),
    ('cov_outlier_in_split', 'bool'),
    ('cov_outlier_in_contig', 'bool'),
    ('departure_from_reference', 'numeric'),
    ('competing_nts', 'text'),
    ('reference', 'text'),
    ('A', 'numeric'),
    ('T', 'numeric'),
    ('C', 'numeric'),
    ('G', 'numeric'),
    ('N', 'numeric'),
]))

gene_coverages_table_name = 'gene_coverages'
gene_coverages_table_structure, gene_coverages_table_types = map(list, zip(*[
    ('entry_id', 'numeric'),
    ('gene_callers_id', 'numeric'),
    ('sample_id', 'text'),
    ('mean_coverage', 'numeric'),
]))

views_table_name = 'views'
views_table_structure, views_table_types = map(list, zip(*[
    ('view_id', 'str'),
    ('target_table', 'str'),
]))

# Note that the atomic data table is the only table that does not have a name, because the way we use this table is a bit tricky.
Beispiel #5
0
    def recover_base_frequencies_for_all_samples(self):
        """Add placeholder entries to `self.data` for samples missing at known positions.

        Entries of `self.data` are grouped by split, gene call and codon order.
        For every such position, each wanted sample that has no entry yet
        receives a new one: all frequency columns (the distinct values of
        `codon_to_AA`) are zero except the position's consensus, which is set
        to the sample's coverage averaged over the codon's three nucleotide
        positions. The consensus is copied from an existing entry with the same
        'unique_pos_identifier_str'. New entries get consecutive ids starting
        after the current largest key of `self.data`.

        Wanted samples are `self.samples_of_interest` if non-empty, otherwise
        `self.sample_ids`; wanted splits are `self.splits_of_interest` if
        non-empty, otherwise every key of `self.splits_basic_info`.

        NOTE(review): an existing entry whose split or sample is not among the
        wanted ones makes the bookkeeping below raise (dict lookup / `remove`);
        presumably `self.data` is filtered before this is called -- confirm
        against callers.
        """
        self.progress.new('Recovering AA frequencies for all')

        samples_wanted = self.samples_of_interest if self.samples_of_interest else self.sample_ids
        splits_wanted = self.splits_of_interest if self.splits_of_interest else set(self.splits_basic_info.keys())

        # max() raises on an empty sequence; with no entries there is nothing to
        # recover, so the starting id is irrelevant.
        next_available_entry_id = max(self.data.keys()) + 1 if self.data else 0

        consensus_by_position = {}
        identifier_by_position = {}
        for e in self.data.values():
            upi = e['unique_pos_identifier_str']
            consensus_by_position[upi] = e['consensus']
            identifier_by_position[upi] = e['unique_pos_identifier']

        self.progress.update('creating a dict to track missing AA frequencies for each sample / split / pos')

        # split name -> {'<gene call>_<codon order>': samples still lacking an entry}
        splits_to_consider = {}
        for split_name in splits_wanted:
            splits_to_consider[split_name] = {}

        self.progress.update('populating the dict to track missing AA frequencies for each sample / split / pos')
        for entry_id in self.data:
            v = self.data[entry_id]
            gene_codon_key = '%d_%d' % (v['corresponding_gene_call'], v['codon_order_in_gene'])
            d = splits_to_consider[v['split_name']]

            # start from every wanted sample the first time a position is seen
            # (`in` instead of dict.has_key, which Python 3 no longer provides)
            if gene_codon_key not in d:
                d[gene_codon_key] = copy.deepcopy(samples_wanted)

            d[gene_codon_key].remove(v['sample_id'])

        counter = 0
        for split_name in splits_to_consider:
            counter += 1
            self.progress.update('accessing split coverages and updating variable positions dict :: %s' % pp(counter))

            split_coverage_across_samples = self.merged_split_coverage_values.get(split_name)

            split_info = self.splits_basic_info[split_name]
            contig_name = split_info['parent']

            for gene_codon_key, missing_samples in splits_to_consider[split_name].items():
                corresponding_gene_call, codon_order_in_gene = [int(k) for k in gene_codon_key.split('_')]

                # every sample already has an entry here: skip the lookups below
                if not missing_samples:
                    continue

                # Everything up to the sample loop depends only on the position,
                # not on the sample, so it is computed once per position.
                unique_pos_identifier_str = '_'.join([split_name, str(corresponding_gene_call), str(codon_order_in_gene)])
                consensus_codon = consensus_by_position[unique_pos_identifier_str]
                unique_pos_identifier = identifier_by_position[unique_pos_identifier_str]

                # DEALING WITH COVERAGE ##################################################################
                # All we know is the gene call id and the order of the codon in that gene, so:
                #
                # learn the gene call
                gene_call = self.genes_in_contigs_dict[corresponding_gene_call]

                # the following dict converts codon orders into nt positions in contig for a given gene call
                codon_order_to_nt_positions_in_contig = utils.get_codon_order_to_nt_positions_dict(gene_call)

                # so the nucleotide positions for this codon in the contig are the following:
                nt_positions_for_codon_in_contig = codon_order_to_nt_positions_in_contig[codon_order_in_gene]

                # but we need those positions in the context of this split, so shift them by the split start
                split_start = self.splits_basic_info[split_name]['start']
                nt_positions_for_codon_in_split = [p - split_start for p in nt_positions_for_codon_in_contig]

                # DEALING WITH AAs ########################################################################
                # every value of codon_to_AA gets a zero frequency in each new entry
                zero_frequencies = dict.fromkeys(set(codon_to_AA.values()), 0)

                for sample_name in missing_samples:
                    # coverages of this sample at the codon's positions, averaged over the codon
                    # NOTE(review): under Python 2 without `from __future__ import division`
                    # the `/ 3` floors integer sums before round() is applied -- confirm intent.
                    coverages = split_coverage_across_samples[sample_name][nt_positions_for_codon_in_split]
                    coverage = int(round(sum(coverages) / 3))

                    entry = {'unique_pos_identifier_str': unique_pos_identifier_str,
                             'unique_pos_identifier': unique_pos_identifier,
                             'sample_id': sample_name,
                             'split_name': split_name,
                             'contig_name': contig_name,
                             'corresponding_gene_call': corresponding_gene_call,
                             'codon_order_in_gene': codon_order_in_gene,
                             'departure_from_consensus': 0,
                             'coverage': coverage,
                             'consensus': consensus_codon}
                    entry.update(zero_frequencies)

                    # the consensus frequency is set to the coverage (WHICH IS VERY BAD,
                    # WE HAVE NO CLUE WHAT IS THE ACTUAL COVERAGE OF TRIPLICATE LINKMERS):
                    entry[consensus_codon] = coverage

                    self.data[next_available_entry_id] = entry
                    next_available_entry_id += 1

        self.progress.end()
Beispiel #6
0
    def recover_base_frequencies_for_all_samples(self):
        """Add placeholder entries to `self.data` for samples missing at known positions.

        Entries of `self.data` are grouped by split, gene call and codon order.
        For every such position, each wanted sample that has no entry yet
        receives a new one: all frequency columns (the distinct values of
        `codon_to_AA`) are zero except the position's reference, which is set
        to the sample's coverage averaged over the codon's three nucleotide
        positions. The reference is copied from an existing entry with the same
        'unique_pos_identifier_str'. Each new entry is then passed to
        `self.insert_additional_fields`. New entries get consecutive ids
        starting after the current largest key of `self.data`.

        Wanted samples are `self.samples_of_interest` if non-empty, otherwise
        `self.sample_ids`; wanted splits are `self.splits_of_interest` if
        non-empty, otherwise every key of `self.splits_basic_info`.

        NOTE(review): an existing entry whose split or sample is not among the
        wanted ones makes the bookkeeping below raise (dict lookup / `remove`);
        presumably `self.data` is filtered before this is called -- confirm
        against callers.
        """
        self.progress.new('Recovering AA frequencies for all')

        samples_wanted = self.samples_of_interest if self.samples_of_interest else self.sample_ids
        splits_wanted = self.splits_of_interest if self.splits_of_interest else set(self.splits_basic_info.keys())

        # max() raises on an empty sequence; with no entries there is nothing to
        # recover, so the starting id is irrelevant.
        next_available_entry_id = max(self.data.keys()) + 1 if self.data else 0

        reference_by_position = {}
        identifier_by_position = {}
        for e in self.data.values():
            upi = e['unique_pos_identifier_str']
            reference_by_position[upi] = e['reference']
            identifier_by_position[upi] = e['unique_pos_identifier']

        self.progress.update('creating a dict to track missing AA frequencies for each sample / split / pos')

        # split name -> {'<gene call>_<codon order>': samples still lacking an entry}
        splits_to_consider = {}
        for split_name in splits_wanted:
            splits_to_consider[split_name] = {}

        self.progress.update('populating the dict to track missing AA frequencies for each sample / split / pos')
        for entry_id in self.data:
            v = self.data[entry_id]
            gene_codon_key = '%d_%d' % (v['corresponding_gene_call'], v['codon_order_in_gene'])
            d = splits_to_consider[v['split_name']]

            # start from every wanted sample the first time a position is seen
            if gene_codon_key not in d:
                d[gene_codon_key] = copy.deepcopy(samples_wanted)

            d[gene_codon_key].remove(v['sample_id'])

        counter = 0
        for split_name in splits_to_consider:
            counter += 1
            self.progress.update('accessing split coverages and updating variable positions dict :: %s' % pp(counter))

            split_coverage_across_samples = self.merged_split_coverage_values.get(split_name)

            split_info = self.splits_basic_info[split_name]
            contig_name = split_info['parent']

            for gene_codon_key, missing_samples in splits_to_consider[split_name].items():
                corresponding_gene_call, codon_order_in_gene = [int(k) for k in gene_codon_key.split('_')]

                # every sample already has an entry here: skip the lookups below
                if not missing_samples:
                    continue

                # Everything up to the sample loop depends only on the position,
                # not on the sample, so it is computed once per position.
                unique_pos_identifier_str = '_'.join([split_name, str(corresponding_gene_call), str(codon_order_in_gene)])
                reference_codon = reference_by_position[unique_pos_identifier_str]
                unique_pos_identifier = identifier_by_position[unique_pos_identifier_str]

                # DEALING WITH COVERAGE ##################################################################
                # All we know is the gene call id and the order of the codon in that gene, so:
                #
                # learn the gene call
                gene_call = self.genes_in_contigs_dict[corresponding_gene_call]

                # the following dict converts codon orders into nt positions in contig for a given gene call
                codon_order_to_nt_positions_in_contig = utils.get_codon_order_to_nt_positions_dict(gene_call)

                # so the nucleotide positions for this codon in the contig are the following:
                nt_positions_for_codon_in_contig = codon_order_to_nt_positions_in_contig[codon_order_in_gene]

                # but we need those positions in the context of this split, so shift them by the split start
                split_start = self.splits_basic_info[split_name]['start']
                nt_positions_for_codon_in_split = [p - split_start for p in nt_positions_for_codon_in_contig]

                # DEALING WITH AAs ########################################################################
                # every value of codon_to_AA gets a zero frequency in each new entry
                zero_frequencies = dict.fromkeys(set(codon_to_AA.values()), 0)

                for sample_name in missing_samples:
                    # coverages of this sample at the codon's positions, averaged over the codon
                    coverages = split_coverage_across_samples[sample_name][nt_positions_for_codon_in_split]
                    coverage = int(round(sum(coverages) / 3))

                    entry = {'unique_pos_identifier_str': unique_pos_identifier_str,
                             'unique_pos_identifier': unique_pos_identifier,
                             'sample_id': sample_name,
                             'split_name': split_name,
                             'contig_name': contig_name,
                             'corresponding_gene_call': corresponding_gene_call,
                             'codon_order_in_gene': codon_order_in_gene,
                             'departure_from_reference': 0,
                             'coverage': coverage,
                             'reference': reference_codon}
                    entry.update(zero_frequencies)

                    # the reference frequency is set to the coverage (WHICH IS VERY BAD,
                    # WE HAVE NO CLUE WHAT IS THE ACTUAL COVERAGE OF TRIPLICATE LINKMERS):
                    entry[reference_codon] = coverage

                    self.data[next_available_entry_id] = entry

                    # insert additional fields for this newly added data point
                    self.insert_additional_fields([next_available_entry_id])

                    next_available_entry_id += 1

        self.progress.end()
Beispiel #7
0
####################################################################################################
#
#     TABLE DESCRIPTIONS FOR THE PROFILE DATABASE
#
####################################################################################################

# Tables with paired structure/type lists are declared as (column name, column
# type) pairs; the pairs are unzipped into the parallel `*_table_structure` /
# `*_table_types` lists.

clusterings_table_name = 'clusterings'
clusterings_table_structure, clusterings_table_types = map(list, zip(*[
    ('clustering', 'str'),
    ('newick', 'str'),
]))

states_table_name = 'states'
states_table_structure, states_table_types = map(list, zip(*[
    ('name', 'text'),
    ('content', 'text'),
    ('last_modified', 'text'),
]))

variable_aas_table_name = 'variable_amino_acid_frequencies'
variable_aas_table_structure, variable_aas_table_types = map(list, zip(*[
    ('entry_id', 'numeric'),
    ('sample_id', 'text'),
    ('corresponding_gene_call', 'numeric'),
    ('codon_order_in_gene', 'numeric'),
    ('reference', 'text'),
    ('departure_from_reference', 'numeric'),
    ('coverage', 'numeric'),
]))
# followed by one numeric column per distinct value of codon_to_AA, in sorted order
variable_aas_table_structure += sorted(set(codon_to_AA.values()))
variable_aas_table_types += ['numeric'] * len(set(codon_to_AA.values()))

variable_nts_table_name = 'variable_nucleotide_positions'
variable_nts_table_structure, variable_nts_table_types = map(list, zip(*[
    ('entry_id', 'numeric'),
    ('sample_id', 'text'),
    ('split_name', 'text'),
    ('pos', 'numeric'),
    ('pos_in_contig', 'numeric'),
    ('corresponding_gene_call', 'numeric'),
    ('in_partial_gene_call', 'numeric'),
    ('in_complete_gene_call', 'numeric'),
    ('base_pos_in_codon', 'numeric'),
    ('codon_order_in_gene', 'numeric'),
    ('coverage', 'numeric'),
    ('cov_outlier_in_split', 'bool'),
    ('cov_outlier_in_contig', 'bool'),
    ('departure_from_reference', 'numeric'),
    ('competing_nts', 'text'),
    ('reference', 'text'),
    ('A', 'numeric'),
    ('T', 'numeric'),
    ('C', 'numeric'),
    ('G', 'numeric'),
    ('N', 'numeric'),
]))

views_table_name = 'views'
views_table_structure, views_table_types = map(list, zip(*[
    ('view_id', 'str'),
    ('target_table', 'str'),
]))

# The atomic data table is the only table without a name, because of the way it
# is used: for single profiles its contents are stored as "atomic data", whereas
# for merged profiles each of its columns becomes a table of its own, in which
# the row names stay the same and the columns are sample names.
atomic_data_table_structure = [
    'contig',
    'std_coverage',
    'mean_coverage',
    'mean_coverage_Q2Q3',
    'max_normalized_ratio',
    'relative_abundance',
    'detection',
    'abundance',
    'variability',
    '__parent__',
]