Beispiel #1
0
    def test_three_letter_amino_acid_residue_pyrrolysine(self):
        """Check the three letter code of a 'TAG' codon with residue 'U'.

        Three mock mappings (one per base pair of 'TAG') are combined into a
        Codon, whose three letter residue is expected to be 'Sec'.

        NOTE(review): the test name mentions pyrrolysine, while residue 'U'
        and the expected 'Sec' look like the selenocysteine codes -- confirm
        which of the two is intended.
        """
        # One mapping per base pair; ids and positions are 1-based, the
        # position within the codon is 0-based
        _mappings = [
            mock_Mapping(id=index + 1,
                         base_pair=base_pair,
                         codon='TAG',
                         codon_base_pair_position=index,
                         amino_acid_residue='U',
                         cDNA_position=index + 1,
                         chromosome_position=index + 1)
            for index, base_pair in enumerate('TAG')
        ]

        # Create the Codon
        codon = Codon.initializeFromMapping(_mappings, 'test_transcript',
                                            'test_protein_ac')

        self.assertTrue(codon.three_letter_amino_acid_residue() == 'Sec')
Beispiel #2
0
    def get_alignment_depth_for_consensus_position(self, consensus_position):
        """Return the number of distinct codons aligned to a consensus position.

        Codons are counted once per unique_str_representation(), so rows of
        self.meta_domain_mapping that yield the same representation are
        counted as a single codon.

        Raises ConsensusPositionOutOfBounds when consensus_position is
        negative or not smaller than self.consensus_length.
        """
        if consensus_position < 0:
            raise ConsensusPositionOutOfBounds(
                "The provided consensus position ('" +
                str(consensus_position) +
                "') is below zero, this position does not exist")
        if consensus_position >= self.consensus_length:
            raise ConsensusPositionOutOfBounds(
                "The provided consensus position ('" +
                str(consensus_position) +
                "') is above the maximum consensus length ('" +
                str(self.consensus_length) +
                "'), this position does not exist")

        # Retrieve all rows aligned to the consensus position
        aligned_to_position = self.meta_domain_mapping[
            self.meta_domain_mapping.consensus_pos ==
            consensus_position].to_dict('records')

        # A set keeps each unique representation exactly once
        unique_keys = {
            Codon.initializeFromDict(codon_dict).unique_str_representation()
            for codon_dict in aligned_to_position
        }
        return len(unique_keys)
Beispiel #3
0
 def retrieve_codon_for_protein_position(self, protein_pos):
     """Build the Codon of this gene region located at protein_pos.

     The cDNA positions registered for protein_pos are looked up in
     self.protein_pos_to_cDNA, their mappings are collected from
     self.mappings_per_cDNA, and the result is handed to
     Codon.initializeFromMapping.
     """
     # gather the mapping of every cDNA position belonging to the residue
     position_mappings = [
         self.mappings_per_cDNA[cDNA_pos]
         for cDNA_pos in self.protein_pos_to_cDNA[protein_pos]
     ]
     # create the codon
     return Codon.initializeFromMapping(position_mappings, self.gencode_transcription_id, self.uniprot_ac)
Beispiel #4
0
    def toDict(self):
        """Return the Codon dictionary extended with the variant entries.

        The base dictionary comes from Codon.toDict; the variant-specific
        attributes of this object are added under their own keys, with
        variant_type stored as its .value.
        """
        # start from the dictionary representation of the codon
        _d = Codon.toDict(self)

        # variant-specific entries, in the order they are stored
        variant_entries = (
            ('ref_nucleotide', self.ref_nucleotide),
            ('alt_nucleotide', self.alt_nucleotide),
            ('var_codon_position', self.var_codon_position),
            ('variant_type', self.variant_type.value),
            ('alt_amino_acid_residue', self.alt_amino_acid_residue),
            ('variant_source', self.variant_source),
        )
        for key, value in variant_entries:
            _d[key] = value

        return _d
Beispiel #5
0
    def get_codon_for_transcript_and_position(self, transcript_id,
                                              protein_position):
        """Return the Codon of a transcript at a protein position.

        Rows of self.meta_domain_mapping are selected on both
        gencode_transcription_id and amino_acid_position; the first matching
        row is converted with Codon.initializeFromDict.

        Raises NotInMetaDomain when no row matches.
        """
        mapping = self.meta_domain_mapping
        is_transcript = mapping.gencode_transcription_id == transcript_id
        is_position = mapping.amino_acid_position == protein_position
        matches = mapping[is_transcript & is_position].to_dict('records')

        if len(matches) > 0:
            # only the first matching row is used
            return Codon.initializeFromDict(matches[0])

        raise NotInMetaDomain(
            "No codons found to be aligned for metadomain '" +
            str(self.domain_id) + "' for transcript '" +
            str(transcript_id) + "' at position '" +
            str(protein_position) + "'")
Beispiel #6
0
    def get_codons_aligned_to_consensus_position(self, consensus_position):
        """Retrieve the codons aligned to a consensus position.

        Returns a dictionary of the form
        {Codon.unique_str_representation(): [Codon, ...]}, in which codons
        sharing the same unique representation are grouped in one list. The
        dictionary is empty when nothing is aligned to the position.

        Raises ConsensusPositionOutOfBounds when consensus_position is
        negative or not smaller than self.consensus_length.
        """
        if consensus_position < 0:
            raise ConsensusPositionOutOfBounds(
                "The provided consensus position ('" +
                str(consensus_position) +
                "') is below zero, this position does not exist")
        if consensus_position >= self.consensus_length:
            raise ConsensusPositionOutOfBounds(
                "The provided consensus position ('" +
                str(consensus_position) +
                "') is above the maximum consensus length ('" +
                str(self.consensus_length) +
                "'), this position does not exist")

        # Retrieve all rows aligned to the consensus position
        aligned_to_position = self.meta_domain_mapping[
            self.meta_domain_mapping.consensus_pos ==
            consensus_position].to_dict('records')

        codons = dict()
        for codon_dict in aligned_to_position:
            # initialize a codon from the dataframe row
            codon = Codon.initializeFromDict(codon_dict)

            # aggregate duplicate chromosomal regions under a single key
            codons.setdefault(codon.unique_str_representation(),
                              []).append(codon)

        # return the codons that correspond to this position
        return codons
Beispiel #7
0
 def initializeFromDict(cls, _d):
     """Create an instance of cls from its dictionary representation.

     The codon part of _d is parsed by Codon.initializeFromDict; the
     variant-specific entries are read directly from _d.

     Raises MalformedCodonException when the codon part cannot be parsed
     and MalformedVariantException when a variant entry is missing.
     """
     try:
         # Double check the codon part is properly formatted
         base = Codon.initializeFromDict(_d)

         # Collect the variant entries; each becomes the constructor
         # argument of the same name prefixed with an underscore
         variant_kwargs = {}
         for key in ('ref_nucleotide', 'alt_nucleotide',
                     'var_codon_position', 'variant_type',
                     'alt_amino_acid_residue', 'variant_source'):
             variant_kwargs['_' + key] = _d[key]

         return cls(
             _gencode_transcription_id=base.gencode_transcription_id,
             _uniprot_ac=base.uniprot_ac,
             _strand=base.strand.value,
             _base_pair_representation=base.base_pair_representation,
             _amino_acid_residue=base.amino_acid_residue,
             _amino_acid_position=base.amino_acid_position,
             _chr=base.chr,
             _chromosome_position_base_pair_one=base.chromosome_position_base_pair_one,
             _chromosome_position_base_pair_two=base.chromosome_position_base_pair_two,
             _chromosome_position_base_pair_three=base.chromosome_position_base_pair_three,
             _cDNA_position_one=base.cDNA_position_one,
             _cDNA_position_two=base.cDNA_position_two,
             _cDNA_position_three=base.cDNA_position_three,
             **variant_kwargs)
     except MalformedCodonException as e:
         raise MalformedCodonException("No SNV could be made: Malformed codon from dict: KeyError with message: "+str(e))
     except KeyError as e:
         raise MalformedVariantException("No SNV could be made: Malformed variant from dict: KeyError with message: "+str(e))
Beispiel #8
0
def generate_pfam_aligned_codons(pfam_id):
    """
    Generates a dictionary (meta_codons_per_consensus_pos) of the form
    {consensus_position: [Codon, ...]} wherein all aligned codons per
    domain consensus position are located.
    Also provides the consensus_length of the domain and the n_instances

    Returns the tuple
    (meta_codons_per_consensus_pos, consensus_length, n_instances); when the
    alignment file yields no output this is ({}, 0, 0).

    Raises MalformedMappingsForAlignedCodonsPosition when a codon cannot be
    created from the mappings of an aligned position.
    """
    _log.info("Started a meta-domain based on the alignment of all '" +
              pfam_id + "' Pfam domains in the human genome")
    # time.clock() was removed in Python 3.8; use perf_counter when it is
    # available and fall back to clock on interpreters that lack it
    _timer = getattr(time, 'perf_counter', None)
    if _timer is None:
        _timer = time.clock
    start_time = _timer()

    # the consensus length
    consensus_length = 0
    # the meta_domain that is to be returned
    meta_codons_per_consensus_pos = {}
    # the mapping of the protein {uniprot_ac: {protein_position: [consensus_position, ...]}}
    consensus_pos_per_protein = {}
    # the amount of domain occurrences found
    n_instances = 0

    # retrieve the alignment
    hmmeralign_output = interpret_hmm_alignment_file(
        METADOMAIN_DIR + pfam_id + '/' + METADOMAIN_ALIGNMENT_FILE_NAME)
    if not len(hmmeralign_output) == 0:
        consensus = hmmeralign_output['consensus']

        # update the consensus length
        consensus_length = len(consensus['sequence'])

        # update the number of instances
        n_instances = len(hmmeralign_output['alignments'])
        _log.debug(
            "Creating the alignment of mappings for '" + str(n_instances) +
            "' '" + pfam_id +
            "' domain occurrences based on the HMM alignment to consensus and original domain sequence"
        )

        # ensure we can map consensus residues back to consensus positions
        consensus[
            'aligned_sequence'] = convert_pfam_fasta_alignment_to_original_aligned_sequence(
                consensus['alignment'])
        consensus[
            'mapping_consensus_alignment_to_positions'] = map_sequence_to_aligned_sequence(
                consensus['sequence'], consensus['aligned_sequence'])

        # create mappings between domain occurrences and the domain consensus sequence
        for _alignment in hmmeralign_output['alignments']:
            # Create a mapping from the aligned domain sequence to the domain sequence
            aligned_sequence = convert_pfam_fasta_alignment_to_original_aligned_sequence(
                _alignment['alignment'])
            original_sequence = convert_pfam_fasta_alignment_to_strict_sequence(
                aligned_sequence)
            mapping_domain_alignment_to_sequence_positions = map_sequence_to_aligned_sequence(
                original_sequence, aligned_sequence)

            # Generate the strict sequence for this domain; leaving only residues that were aligned to the domain consensus
            strict_aligned_sequence = convert_pfam_fasta_alignment_to_strict_fasta(
                _alignment['alignment'])

            # create the mapping between the strict alignments and the original consensus sequence
            mapping_aligned_domain_to_domain_consensus = createAlignedSequenceMapping(
                strict_aligned_sequence, consensus['aligned_sequence'], False)

            # all mapping positions of both mappings, each position once
            mapping_positions = set(
                mapping_domain_alignment_to_sequence_positions.keys()) | set(
                    mapping_aligned_domain_to_domain_consensus.keys())

            # Second add each aligned residue mapping
            for mapping_pos in sorted(mapping_positions):
                # positions where the aligned consensus has a gap ('-') are
                # insertions and carry no consensus position
                if consensus['aligned_sequence'][mapping_pos] == '-':
                    continue

                # retrieve the position in the domain consensus
                domain_consensus_pos = consensus[
                    'mapping_consensus_alignment_to_positions'][mapping_pos]

                # retrieve the position in the domain sequence
                ref_pos = mapping_domain_alignment_to_sequence_positions[
                    mapping_pos]
                # convert the position in the domain sequence to the uniprot position
                uniprot_pos = int(_alignment['start_pos']) + ref_pos - 1

                # Add the consensus pos to the protein
                consensus_pos_per_protein.setdefault(
                    _alignment['uniprot_ac'], {}).setdefault(
                        uniprot_pos, []).append(domain_consensus_pos)

        # now incorporate the alignment data into our domain model in form of mappings
        # First get the protein ids for the uniprot acs
        uniprot_acs_to_ids = ProteinRepository.retrieve_protein_id_for_multiple_protein_acs(
            list(consensus_pos_per_protein.keys()))
        protein_ids = [
            int(y) for y in np.unique(list(uniprot_acs_to_ids.values()))
        ]

        # Second, get all mappings for these proteins
        protein_mappings = MappingRepository.get_mappings_for_multiple_protein_ids(
            protein_ids)

        # retrieve all transcripts mapped to these protein_ids
        gene_ids = GeneRepository.retrieve_transcript_id_for_multiple_protein_ids(
            protein_ids)

        # create all aligned codons
        for uniprot_ac, positions in consensus_pos_per_protein.items():
            for uniprot_pos, consensus_positions in positions.items():
                for domain_consensus_pos in consensus_positions:
                    # Retrieve the mapping for the corresponding uniprot_position
                    mappings_for_uniprot_pos = [
                        x for x in protein_mappings[
                            uniprot_acs_to_ids[uniprot_ac]]
                        if x.uniprot_position == uniprot_pos
                    ]

                    # Separate the mappings per gene_id
                    mapping_per_gene_id = {}
                    for mapping in mappings_for_uniprot_pos:
                        mapping_per_gene_id.setdefault(mapping.gene_id,
                                                       []).append(mapping)

                    for gene_id, mappings in mapping_per_gene_id.items():
                        try:
                            # create a codon
                            codon = Codon.initializeFromMapping(
                                mappings, gene_ids[gene_id], uniprot_ac)

                            # Add the codon to the consensus positions
                            meta_codons_per_consensus_pos.setdefault(
                                domain_consensus_pos, []).append(codon)
                        except MalformedCodonException as e:
                            raise MalformedMappingsForAlignedCodonsPosition(
                                "Encountered a malformed codon mapping for domain '"
                                + str(pfam_id) + "' in gene '" + str(gene_id) +
                                "', at amino_acid_position '" +
                                str(uniprot_pos) + "':" + str(e))

    time_step = _timer()
    _log.info("Finished the alignment of mappings for '" + str(n_instances) +
              "' instances '" + pfam_id + "' domain occurrences in " +
              str(time_step - start_time) + " seconds")
    return meta_codons_per_consensus_pos, consensus_length, n_instances
Beispiel #9
0
 def alt_three_letter_amino_acid_residue(self):
     """Return the three letter representation of the alternative residue.

     An alternative residue of '*' is returned unchanged; any other value
     is converted with Codon.one_to_three_letter_amino_acid_residue.
     """
     residue = self.alt_amino_acid_residue
     if residue == '*':
         return residue
     return Codon.one_to_three_letter_amino_acid_residue(residue)
Beispiel #10
0
    def test_initializations(self):
        """Check that the three ways of creating a Codon agree.

        A Codon is created from mock mappings, from the constructor and from
        the dictionary of the mapping-based codon; their attributes are
        compared with each other and with the values used to build them.
        """
        # init test variables
        _transcript = 'test_transcript'
        _protein_ac = 'test_test_protein_ac'
        _codon_repr = 'CTT'
        _residue = 'L'
        _strand = '+'
        _chromosome_positions = (231, 232, 233)
        _cDNA_positions = (333, 334, 335)

        # init mappings: one per base pair of the codon
        _mappings = [
            mock_Mapping(
                id=index + 1,
                base_pair=_codon_repr[index],
                codon=_codon_repr,
                codon_base_pair_position=index,
                amino_acid_residue=_residue,
                cDNA_position=_cDNA_positions[index],
                chromosome_position=_chromosome_positions[index])
            for index in range(3)
        ]

        # Create the Codon from the mapping and from the constructor
        _codon_from_mapping = Codon.initializeFromMapping(
            _mappings=_mappings,
            _gencode_transcription_id=_transcript,
            _uniprot_ac=_protein_ac)
        _codon_from_init = Codon(
            _gencode_transcription_id=_transcript,
            _uniprot_ac=_protein_ac,
            _strand=_strand,
            _base_pair_representation=_codon_repr,
            _amino_acid_residue=_residue,
            _amino_acid_position=mock_Mapping.amino_acid_position,
            _chr=mock_Mapping.chromosome,
            _chromosome_position_base_pair_one=_chromosome_positions[0],
            _chromosome_position_base_pair_two=_chromosome_positions[1],
            _chromosome_position_base_pair_three=_chromosome_positions[2],
            _cDNA_position_one=_cDNA_positions[0],
            _cDNA_position_two=_cDNA_positions[1],
            _cDNA_position_three=_cDNA_positions[2])

        # attribute name and the value it is expected to hold
        expected_attributes = (
            ('gencode_transcription_id', _transcript),
            ('uniprot_ac', _protein_ac),
            ('strand', mock_Mapping.strand),
            ('base_pair_representation', _codon_repr),
            ('amino_acid_residue', _residue),
            ('amino_acid_position', mock_Mapping.amino_acid_position),
            ('chr', mock_Mapping.chromosome),
            ('chromosome_position_base_pair_one', _chromosome_positions[0]),
            ('chromosome_position_base_pair_two', _chromosome_positions[1]),
            ('chromosome_position_base_pair_three', _chromosome_positions[2]),
            ('cDNA_position_one', _cDNA_positions[0]),
            ('cDNA_position_two', _cDNA_positions[1]),
            ('cDNA_position_three', _cDNA_positions[2]),
        )

        # Check if the conversion went okay and it is the same as init
        self.assertTrue(_codon_from_init.three_letter_amino_acid_residue(
        ) == _codon_from_mapping.three_letter_amino_acid_residue() == 'Leu')
        for attribute, expected in expected_attributes:
            self.assertTrue(
                getattr(_codon_from_init, attribute) ==
                getattr(_codon_from_mapping, attribute) == expected)

        # Create a dictionary from the codon and a codon from that dictionary
        _codon_from_dict = Codon.initializeFromDict(
            _codon_from_mapping.toDict())

        # Check if the round trip through the dictionary kept every value
        self.assertTrue(_codon_from_dict.three_letter_amino_acid_residue() ==
                        _codon_from_mapping.three_letter_amino_acid_residue())
        for attribute, _ in expected_attributes:
            self.assertTrue(
                getattr(_codon_from_dict, attribute) ==
                getattr(_codon_from_mapping, attribute))
Beispiel #11
0
    def test_init_from_dict_fail(self):
        """An empty dictionary must be rejected with MalformedCodonException."""
        self.assertRaises(MalformedCodonException, Codon.initializeFromDict,
                          {})