Example #1
0
    def generate_transcript_change_from_tx(self, tx, variant_type, vc, start_genomic_space, end_genomic_space, ref_allele, alt_allele):
        """Render the transcript change string for a variant on transcript ``tx``.

        An empty string is returned for intronic splice sites and whenever ``vc`` reports no
        usable CDS start in exon space (an empty string or a negative value).

        :param tx: transcript the variant is rendered against
        :param variant_type: variant type, handed through to the renderer
        :param vc: variant classification result; its getters supply the classification and CDS start
        :param start_genomic_space: variant start in genomic space (converted with int())
        :param end_genomic_space: variant end in genomic space (converted with int())
        :param ref_allele: reference allele, passed to ``_get_stranded_alleles``
        :param alt_allele: alternate allele, passed to ``_get_stranded_alleles``
        :return: the rendered transcript change, or "" when none is produced
        """
        is_intronic_splice_site = (vc.get_vc() == VariantClassification.SPLICE_SITE
                                   and vc.get_secondary_vc() == VariantClassification.INTRON)
        if is_intronic_splice_site:
            # No transcript change is rendered for intronic splice sites.
            return ""

        raw_cds_start = vc.get_cds_start_in_exon_space()
        if raw_cds_start == "" or raw_cds_start < 0:
            return ""

        exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            int(start_genomic_space), int(end_genomic_space), tx)
        cds_start = int(raw_cds_start)

        # Positions on "-" strand transcripts are shifted by one relative to the other strand.
        strand_shift = 1 if tx.get_strand() == "-" else 0
        cds_space_start = exon_start - cds_start + strand_shift
        cds_space_end = exon_end - cds_start + strand_shift

        observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
        return TranscriptProviderUtils.render_transcript_change(
            variant_type, vc.get_vc(), cds_space_start, cds_space_end,
            reference_allele_stranded, observed_allele_stranded, vc.get_secondary_vc())
    def annotate_mutation(self, mutation):
        """Add transcript-level annotations to ``mutation`` and return it.

        Transcripts at the mutation's position are looked up.  When none are found the mutation
        is classified as IGR and the nearest genes are written to 'other_transcripts'.  Otherwise
        a transcript is chosen according to the tx mode and the classification, the
        protein/codon/transcript changes and the transcript attributes are rendered from it.
        HGVS annotations are added last, using the chosen transcript (None for IGR).

        :param mutation: mutation to annotate; chr, start, end, ref_allele and alt_allele are read
        :return: the same mutation object, after ``addAnnotations`` has been called on it
        """
        contig = mutation.chr
        start = int(mutation.start)
        end = int(mutation.end)
        txs = self.get_transcripts_by_pos(contig, start, end)
        annotations = self._create_blank_set_of_annotations()
        inferred_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
        annotations['variant_type'] = Annotation(value=inferred_type, datasourceName=self.title)
        variant_type = annotations['variant_type'].value
        basic = self._create_basic_annotation
        chosen_tx = None

        if len(txs) == 0:
            # No transcripts: intergenic region.  Most annotations keep their blank value.
            annotations['variant_classification'] = basic(VariantClassification.IGR)
            nearest_genes = self._get_nearest_genes(contig, start, end)
            upstream, downstream = nearest_genes[0], nearest_genes[1]
            annotations['other_transcripts'] = basic(
                value='%s (%s upstream) : %s (%s downstream)' % (upstream[0], upstream[1], downstream[0], downstream[1]))
            annotations['gene'] = basic('Unknown')
            annotations['gene_id'] = basic('0')
            annotations['genome_change'] = basic(TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end, mutation.ref_allele, mutation.alt_allele, variant_type))
        else:
            # Pick the transcript to annotate against, as dictated by the tx mode.
            chosen_tx = self._choose_transcript(txs, self.get_tx_mode(), variant_type,
                                                mutation.ref_allele, mutation.alt_allele, start, end)
            vcer = VariantClassifier()

            annotations['annotation_transcript'] = basic(chosen_tx.get_transcript_id())
            annotations['genome_change'] = basic(TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end, mutation.ref_allele, mutation.alt_allele, variant_type))
            annotations['strand'] = basic(chosen_tx.get_strand())
            annotations['transcript_position'] = basic(
                TranscriptProviderUtils.render_transcript_position(start, end, chosen_tx))
            annotations['transcript_id'] = basic(chosen_tx.get_transcript_id())

            # Classification uses the mutation's own start/end values, not the int copies.
            classification = vcer.variant_classify(
                tx=chosen_tx, variant_type=variant_type, ref_allele=mutation.ref_allele,
                alt_allele=mutation.alt_allele, start=mutation.start, end=mutation.end)
            # Exon index is reported one-based.
            annotations['transcript_exon'] = basic(str(classification.get_exon_i() + 1))
            annotations['variant_classification'] = basic(classification.get_vc())
            annotations['secondary_variant_classification'] = basic(classification.get_secondary_vc())
            annotations['protein_change'] = basic(vcer.generate_protein_change_from_vc(classification))
            annotations['codon_change'] = basic(
                vcer.generate_codon_change_from_vc(chosen_tx, start, end, classification))
            annotations['transcript_change'] = basic(vcer.generate_transcript_change_from_tx(
                chosen_tx, variant_type, classification, start, end, mutation.ref_allele, mutation.alt_allele))

            annotations['transcript_strand'] = basic(chosen_tx.get_strand())
            annotations['gene'] = basic(chosen_tx.get_gene())
            annotations['gene_type'] = basic(chosen_tx.get_gene_type())
            annotations['gencode_transcript_tags'] = basic(self._retrieve_gencode_tag_value(chosen_tx, 'tag'))
            annotations['gencode_transcript_status'] = basic(self._retrieve_gencode_tag_value(chosen_tx, 'transcript_status'))
            annotations['havana_transcript'] = basic(self._retrieve_gencode_tag_value(chosen_tx, 'havana_transcript'))
            annotations['ccds_id'] = basic(self._retrieve_gencode_tag_value(chosen_tx, 'ccdsid'))
            annotations['gencode_transcript_type'] = basic(self._retrieve_gencode_tag_value(chosen_tx, 'transcript_type'))
            annotations['gencode_transcript_name'] = basic(self._retrieve_gencode_tag_value(chosen_tx, 'transcript_name'))

            chosen_index = txs.index(chosen_tx)
            other_transcripts = self._render_other_transcripts(
                txs, [chosen_index], variant_type, mutation.ref_allele, mutation.alt_allele,
                mutation.start, mutation.end)
            annotations['other_transcripts'] = basic(other_transcripts)

        mutation.addAnnotations(annotations)

        # HGVS annotations are added in a second pass, after the annotations above.
        mutation.addAnnotations(self._create_hgvs_annotation_dict(mutation, chosen_tx))

        return mutation
Example #3
0
 def _add(self, mutation):
     """Queue ``mutation``: xNPs go to the per-sample queue, everything else to the indel queue.

     :param mutation: mutation exposing ref_allele and alt_allele
     """
     inferred_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
     if TranscriptProviderUtils.is_xnp(inferred_type):
         # Only xNPs are grouped by sample.
         sample_name = self.sns.getSampleName(mutation)
         self.queue[sample_name].append(mutation)
     else:
         self.indel_queue.append(mutation)
Example #4
0
    def generate_codon_change_from_vc(self, t, start, end, vc):
        """Render the codon change string described by a variant classification.

        Intronic splice sites get the dedicated intronic rendering.  When ``vc`` has an empty
        reference codon start or end, an empty string is returned.

        :param t: (Transcript)
        :param start: (int)
        :param end:  (int)
        :param vc:  (VariantClassification)

        :return: the rendered codon change, or "" when no reference codon span is available
        """
        exon_index = vc.get_exon_i()
        splice_distance = self._get_splice_site_coordinates(t, start, end, exon_index)

        if vc.get_vc() == VariantClassification.SPLICE_SITE and vc.get_secondary_vc() == VariantClassification.INTRON:
            return TranscriptProviderUtils.render_intronic_splice_site_codon_change(splice_distance, exon_index)

        codon_start_in_exon = vc.get_ref_codon_start_in_exon()
        codon_end_in_exon = vc.get_ref_codon_end_in_exon()
        if codon_start_in_exon == "" or codon_end_in_exon == "":
            return ""

        # Shift the codon span from exon space to CDS space.
        cds_start_in_exon = int(vc.get_cds_start_in_exon_space())
        codon_start_in_cds = int(codon_start_in_exon) - cds_start_in_exon + 1
        codon_end_in_cds = int(codon_end_in_exon) - cds_start_in_exon + 1

        reference_codon = vc.get_ref_codon()
        alternate_codon = vc.get_alt_codon()

        return TranscriptProviderUtils.render_codon_change(
            vc.get_vt(), vc.get_vc(), codon_start_in_cds, codon_end_in_cds,
            reference_codon, alternate_codon, splice_distance, exon_index, vc.get_secondary_vc())
Example #5
0
    def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build):
        """Build a MutationData from raw attributes and attach its preceding-bases annotation.

        For xNPs the preceding-bases annotation is created empty.  For deletions and insertions
        the alleles and coordinates are replaced with the values returned by the matching
        ``MutUtils.retrievePrecedingBasesFor...`` helper (the missing allele becomes "-"), and
        the preceding bases it returns are stored in the annotation.

        :param chr: contig name (converted with str())
        :param start: start position (converted with str())
        :param end: end position (converted with str())
        :param ref_allele: reference allele
        :param alt_allele: alternate allele
        :param build: genome build (converted with str())
        :return: the initialized MutationData
        """
        mut = MutationData(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
        variant_type = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

        if TranscriptProviderUtils.is_xnp(variant_type):  # SNPs and other xNPs
            mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME, annotationValue="")

        if variant_type == VariantClassification.VT_DEL:
            preceding_bases, new_ref, new_start, new_end = MutUtils.retrievePrecedingBasesForDeletions(mut)
            new_alt = "-"
        elif variant_type == VariantClassification.VT_INS:
            preceding_bases, new_alt, new_start, new_end = MutUtils.retrievePrecedingBasesForInsertions(mut)
            new_ref = "-"
        else:
            return mut

        # Each value is written both as an attribute and through item assignment.
        updates = (("ref_allele", new_ref), ("alt_allele", new_alt), ("start", new_start), ("end", new_end))
        for field_name, field_value in updates:
            setattr(mut, field_name, field_value)
            mut[field_name] = field_value
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)
        return mut
    def annotate_mutation(self, mutation):
        """Annotate ``mutation`` in place against the transcripts at its position.

        With no transcripts at the position, the variant classification is set to IGR and the
        nearest genes are rendered into 'other_transcripts'.  With transcripts, one is chosen
        via ``_choose_transcript`` and used to fill the classification, change strings and
        transcript/gencode attributes; the transcripts are also passed to
        ``_render_other_transcripts``.  HGVS annotations are appended afterwards.

        :param mutation: mutation providing chr, start, end, ref_allele and alt_allele
        :return: ``mutation`` itself, with the annotations added
        """
        contig = mutation.chr
        start = int(mutation.start)
        end = int(mutation.end)
        txs = self.get_transcripts_by_pos(contig, start, end)
        result = self._create_blank_set_of_annotations()
        result['variant_type'] = Annotation(
            value=TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele),
            datasourceName=self.title)
        variant_type = result['variant_type'].value
        make = self._create_basic_annotation
        chosen_tx = None

        if len(txs) == 0:
            # Intergenic: nothing overlaps, so report the flanking genes instead.
            result['variant_classification'] = make(VariantClassification.IGR)
            flanking = self._get_nearest_genes(contig, start, end)
            flank_text = '%s (%s upstream) : %s (%s downstream)' % (
                flanking[0][0], flanking[0][1], flanking[1][0], flanking[1][1])
            result['other_transcripts'] = make(value=flank_text)
            result['gene'] = make('Unknown')
            result['gene_id'] = make('0')
            genome_change = TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end, mutation.ref_allele, mutation.alt_allele, variant_type)
            result['genome_change'] = make(genome_change)
        else:
            chosen_tx = self._choose_transcript(
                txs, self.get_tx_mode(), variant_type, mutation.ref_allele, mutation.alt_allele, start, end)
            vcer = VariantClassifier()

            result['annotation_transcript'] = make(chosen_tx.get_transcript_id())
            genome_change = TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end, mutation.ref_allele, mutation.alt_allele, variant_type)
            result['genome_change'] = make(genome_change)
            result['strand'] = make(chosen_tx.get_strand())
            result['transcript_position'] = make(
                TranscriptProviderUtils.render_transcript_position(start, end, chosen_tx))
            result['transcript_id'] = make(chosen_tx.get_transcript_id())

            vc = vcer.variant_classify(tx=chosen_tx, variant_type=variant_type,
                                       ref_allele=mutation.ref_allele, alt_allele=mutation.alt_allele,
                                       start=mutation.start, end=mutation.end)
            result['transcript_exon'] = make(str(vc.get_exon_i() + 1))
            result['variant_classification'] = make(vc.get_vc())
            result['secondary_variant_classification'] = make(vc.get_secondary_vc())
            result['protein_change'] = make(vcer.generate_protein_change_from_vc(vc))
            result['codon_change'] = make(vcer.generate_codon_change_from_vc(chosen_tx, start, end, vc))
            result['transcript_change'] = make(vcer.generate_transcript_change_from_tx(
                chosen_tx, variant_type, vc, start, end, mutation.ref_allele, mutation.alt_allele))

            result['transcript_strand'] = make(chosen_tx.get_strand())
            result['gene'] = make(chosen_tx.get_gene())
            result['gene_type'] = make(chosen_tx.get_gene_type())

            # Annotation name -> gencode tag key, filled in this order.
            gencode_fields = (
                ('gencode_transcript_tags', 'tag'),
                ('gencode_transcript_status', 'transcript_status'),
                ('havana_transcript', 'havana_transcript'),
                ('ccds_id', 'ccdsid'),
                ('gencode_transcript_type', 'transcript_type'),
                ('gencode_transcript_name', 'transcript_name'),
            )
            for annotation_name, tag_key in gencode_fields:
                result[annotation_name] = make(self._retrieve_gencode_tag_value(chosen_tx, tag_key))

            result['other_transcripts'] = make(self._render_other_transcripts(
                txs, [txs.index(chosen_tx)], variant_type, mutation.ref_allele, mutation.alt_allele,
                mutation.start, mutation.end))

        mutation.addAnnotations(result)

        # HGVS annotations are created from the already-annotated mutation.
        hgvs_annotations = self._create_hgvs_annotation_dict(mutation, chosen_tx)
        mutation.addAnnotations(hgvs_annotations)

        return mutation
Example #7
0
    def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build, mutation_data_factory=None):
        """Create a mutation through a factory and attach its preceding-bases annotation.

        xNPs get an empty preceding-bases annotation.  Deletions and insertions have their
        alleles and coordinates overwritten with the values returned by the matching
        ``MutUtils.retrievePrecedingBasesFor...`` helper; the allele that is absent is set to
        "-" and the returned preceding bases are stored in the annotation.

        :param chr: contig name (converted with str())
        :param start: start position (converted with str())
        :param end: end position (converted with str())
        :param ref_allele: reference allele
        :param alt_allele: alternate allele
        :param build: genome build (converted with str())
        :param mutation_data_factory: factory used to create the mutation; a new
            MutationDataFactory is used when None
        :return: the initialized mutation
        """
        if mutation_data_factory is None:
            mutation_data_factory = MutationDataFactory()
        mut = mutation_data_factory.create(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
        var_type = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

        def overwrite(new_ref, new_alt, new_start, new_end, preceding):
            # Write each value as attribute and as item, then record the preceding bases.
            mut.ref_allele = new_ref
            mut["ref_allele"] = new_ref
            mut.alt_allele = new_alt
            mut["alt_allele"] = new_alt
            mut.start = new_start
            mut["start"] = new_start
            mut.end = new_end
            mut["end"] = new_end
            mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                                 annotationValue=preceding)

        if TranscriptProviderUtils.is_xnp(var_type):  # SNPs and other xNPs
            mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME, annotationValue="")

        if var_type == VariantClassification.VT_DEL:
            bases, trimmed_ref, del_start, del_end = MutUtils.retrievePrecedingBasesForDeletions(mut)
            overwrite(trimmed_ref, "-", del_start, del_end, bases)
        elif var_type == VariantClassification.VT_INS:
            bases, trimmed_alt, ins_start, ins_end = MutUtils.retrievePrecedingBasesForInsertions(mut)
            overwrite("-", trimmed_alt, ins_start, ins_end, bases)

        return mut
Example #8
0
 def _determine_codon_overlap(self, s, e, codon_tuple, variant_type):
     if codon_tuple is None:
         return False
     if variant_type == VariantClassification.VT_INS:
         is_codon_overlap = TranscriptProviderUtils.test_overlap(s, s, codon_tuple[0]+1, codon_tuple[1])
     else:
         is_codon_overlap = TranscriptProviderUtils.test_overlap(s, e, codon_tuple[0]+1, codon_tuple[1])
     return is_codon_overlap
Example #9
0
 def _add(self, mutation):
     """Place ``mutation`` on the queue that matches its inferred variant type.

     xNPs are appended to the queue of their sample (so they can be combined); all other
     variants are appended to the indel queue.

     :param mutation: mutation exposing ref_allele and alt_allele
     """
     is_xnp = TranscriptProviderUtils.is_xnp(
         TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele))
     if is_xnp:
         target_queue = self.queue[self.sns.getSampleName(mutation)]
     else:
         target_queue = self.indel_queue
     target_queue.append(mutation)
Example #10
0
 def _determine_if_cds_overlap(self, s, e, tx, variant_type):
     """Tell whether the span ``s``..``e`` overlaps the CDS of ``tx``.

     :param s: span start
     :param e: span end; ignored for insertions, which are tested at ``s`` only
     :param tx: transcript providing ``get_cds()``
     :param variant_type: variant type, compared against VariantClassification.VT_INS
     :return: True unless TranscriptProviderUtils.test_feature_overlap returns -1
     """
     span_end = s if variant_type == VariantClassification.VT_INS else e
     overlap_result = TranscriptProviderUtils.test_feature_overlap(s, span_end, tx.get_cds())
     return overlap_result != -1
Example #11
0
 def _determine_codon_overlap(self, s, e, codon_tuple, variant_type):
     if codon_tuple is None:
         return False
     if variant_type == VariantClassification.VT_INS:
         is_codon_overlap = TranscriptProviderUtils.test_overlap(
             s, s, codon_tuple[0] + 1, codon_tuple[1])
     else:
         is_codon_overlap = TranscriptProviderUtils.test_overlap(
             s, e, codon_tuple[0] + 1, codon_tuple[1])
     return is_codon_overlap
    def test_codon_single_base(self, start, end, ref_base_stranded, gt_codon):
        """Test that we can grab the proper three bases of a codon for an arbitrary single base.

        :param start: genomic start of the base
        :param end: genomic end of the base
        :param ref_base_stranded: not used by the check itself
        :param gt_codon: expected (ground truth) codon sequence
        """
        tx = self.retrieve_test_transcript_MAPK1()

        # genomic space -> exon space -> protein positions -> codon bounds in the transcript
        tx_pos_start, tx_pos_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
        cds_start, cds_stop = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        protein_start, protein_end = TranscriptProviderUtils.get_protein_positions(tx_pos_start, tx_pos_end, cds_start)
        codon_first, codon_last = TranscriptProviderUtils.get_cds_codon_positions(protein_start, protein_end, cds_start)

        observed_codon = tx.get_seq()[codon_first:codon_last+1]
        failure_message = "Did not get correct codon (%s): %s    loc: %s-%s" %(gt_codon, observed_codon, start, end)
        self.assertTrue(observed_codon == gt_codon, failure_message)
Example #13
0
    def _determine_if_splice_site_overlap(self,
                                          start_genomic_space,
                                          end_genomic_space,
                                          tx,
                                          variant_type,
                                          dist=2):
        """Determine whether a variant overlaps a splice site of ``tx``.

        A splice site window spans ``dist`` bases on either side of an exon boundary.  The
        outer boundary of the first and of the last exon (stranded, i.e. where the transcript
        starts and ends) is not tested, so an overlap there is never reported as a splice site.

        start <= end is expected.
        INS events only call a splice site when they start in the splice site window.

        The result is always a 3-tuple.  (The no-overlap case previously returned a 4-tuple,
        which disagreed with the overlap cases and with the documented return value; its first
        three elements are unchanged.)

        :param start_genomic_space: int in genomic space
        :param end_genomic_space: int in genomic space
        :param tx: Transcript
        :param variant_type: variant type, compared against VariantClassification.VT_INS
        :param dist: number of bases on each side of the exon boundary that form the window
        :return is_splice_site_overlap, exon_i, is_right_overlap (Higher genomic position --> True).
            Without overlap this is (False, -1, None).

        """
        exons = tx.get_exons()
        strand = tx.get_strand()

        # If this is an insertion, we only want to count a splice site if it starts in the splice site regions
        if variant_type == VariantClassification.VT_INS:
            end_genomic_space = start_genomic_space

        last_exon_i = len(exons) - 1
        for i, exon in enumerate(exons):
            is_internal_exon = 0 < i < last_exon_i
            # The left (lower genomic) boundary is skipped where it is the transcript's outer edge.
            is_check_left = is_internal_exon or (strand == "-" and i == 0) or (
                strand == "+" and i == last_exon_i)
            # Likewise for the right (higher genomic) boundary.
            is_check_right = is_internal_exon or (strand == "+" and i == 0) or (
                strand == "-" and i == last_exon_i)

            if is_check_left:
                splice_site_left = (exon[0] - dist + 1, exon[0] + (dist - 1) + 1)
                if TranscriptProviderUtils.test_overlap(
                        start_genomic_space, end_genomic_space,
                        splice_site_left[0], splice_site_left[1]):
                    return True, i, False
            if is_check_right:
                splice_site_right = (exon[1] - (dist - 1), exon[1] + dist)
                if TranscriptProviderUtils.test_overlap(
                        start_genomic_space, end_genomic_space,
                        splice_site_right[0], splice_site_right[1]):
                    return True, i, True

        # Same arity as the overlap cases, so callers can unpack either outcome.
        return False, -1, None
Example #14
0
    def _determine_de_novo_old(self, vc, transcript_position_start,
                               transcript_position_end, ref, alt, tx,
                               variant_type):
        """Returns input vc if not de Novo.  Otherwise, returns updated variant classification.

        The input is only re-examined when it is the 5'UTR classification and ``ref`` differs
        from ``alt``.  In that case a window of the transcript sequence around the variant
        (two positions on either side) is mutated, and if the result contains 'ATG' the
        classification becomes 'De_novo_Start_InFrame' or 'De_novo_Start_OutOfFrame',
        depending on the distance from the CDS start being a multiple of three.

        :param vc: Current variant classification.  Note that if this is not 5'UTR, this method will just return this input.
        :param transcript_position_start:
        :param transcript_position_end:
        :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
        :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
        :param tx: transcript
        :param variant_type:
        :return: ``vc`` unchanged, or one of the 'De_novo_Start_*' classifications
        """
        if not (vc == VariantClassification.FIVE_PRIME_UTR and ref != alt):
            return vc

        strand = tx.get_strand()
        observed_allele_stranded = self._determine_stranded_allele(alt, strand)
        # The stranded reference allele is computed as before, although it is not used below.
        reference_allele_stranded = self._determine_stranded_allele(ref, strand)
        transcript_seq = tx.get_seq()

        if variant_type == VariantClassification.VT_INS:
            # Insertions are collapsed to a single position; which one depends on the strand.
            if strand == "-":
                transcript_position_start = transcript_position_end
            else:
                transcript_position_end = transcript_position_start

        window_start = transcript_position_start - 2
        window_end = transcript_position_end + 2
        # TODO: This may not work for "+" strand.  Need unit test.
        window_seq = transcript_seq[window_start:window_end + 1]

        mutated_window = TranscriptProviderUtils.mutate_reference_sequence(
            window_seq, window_start, transcript_position_start,
            transcript_position_end, observed_allele_stranded, variant_type)

        atg_offset = mutated_window.find('ATG')
        if atg_offset < 0:
            return vc

        cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        atg_position = window_start + atg_offset + 1
        is_in_frame = (cds_start_in_exon_space - atg_position) % 3 == 0
        return 'De_novo_Start_' + ('InFrame' if is_in_frame else 'OutOfFrame')
Example #15
0
    def _determine_de_novo(self,
                           vc_str,
                           exon_start,
                           ref,
                           alt,
                           tx,
                           variant_type,
                           buffer=2):
        """Returns input vc if not de Novo.  Otherwise, returns updated variant classification.

        Only a 5'UTR classification with ``ref`` different from ``alt`` is examined: the
        region produced by ``_mutate_exon`` is searched for 'ATG', and when one is found the
        result is 'De_novo_Start_InFrame' or 'De_novo_Start_OutOfFrame', depending on whether
        the distance from the CDS start (in exon space) is a multiple of three.

        :param vc_str: Current variant classification.  Note that if this is not 5'UTR, this method will just return this input.
        :param exon_start: variant start, used to place the ATG in exon space
        :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
        :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
        :param tx: transcript
        :param variant_type:
        :param buffer: passed to ``_mutate_exon`` and subtracted when locating the ATG
        :return: ``vc_str`` unchanged, or one of the 'De_novo_Start_*' classifications
        """
        if not (vc_str == VariantClassification.FIVE_PRIME_UTR and ref != alt):
            return vc_str

        mutated_region = self._mutate_exon(tx, ref, alt, variant_type, exon_start, buffer)
        atg_offset = mutated_region.find('ATG')
        if atg_offset < 0:
            # No start codon in the mutated region.
            return vc_str

        atg_exon_position = exon_start + atg_offset - buffer
        cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        is_in_frame = (cds_start_in_exon_space - atg_exon_position) % 3 == 0
        suffix = 'InFrame' if is_in_frame else 'OutOfFrame'
        return 'De_novo_Start_' + suffix
Example #16
0
    def _determine_de_novo(self, vc_str, exon_start, ref, alt, tx, variant_type, buffer=2 ):
        """Return a de novo start classification when the variant introduces an ATG, else ``vc_str``.

        The check only runs for a 5'UTR classification whose ``ref`` differs from ``alt``.

        :param vc_str: Current variant classification.  Note that if this is not 5'UTR, this method will just return this input.
        :param exon_start: variant start, used to place the ATG in exon space
        :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
        :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
        :param tx: transcript
        :param variant_type:
        :param buffer: passed to ``_mutate_exon`` and subtracted when locating the ATG
        :return: ``vc_str``, 'De_novo_Start_InFrame' or 'De_novo_Start_OutOfFrame'
        """
        classification = vc_str
        is_candidate = vc_str == VariantClassification.FIVE_PRIME_UTR and ref != alt
        if is_candidate:
            utr_seq = self._mutate_exon(tx, ref, alt, variant_type, exon_start, buffer)
            atg_index = utr_seq.find('ATG')
            if atg_index > -1:
                atg_in_exon_space = exon_start + atg_index - buffer
                cds_bounds = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
                cds_start_in_exon_space, cds_end_in_exon_space = cds_bounds
                # In frame when the ATG sits a whole number of codons away from the CDS start.
                remainder = (cds_start_in_exon_space - atg_in_exon_space) % 3
                classification = 'De_novo_Start_' + ('InFrame' if remainder == 0 else 'OutOfFrame')

        return classification
    def _is_matching(self, mut, tsv_record):

        chrom = tsv_record[self.tsv_index["chrom"]]
        startPos = tsv_record[self.tsv_index["start"]]
        endPos = tsv_record[self.tsv_index["end"]]
        build = "hg19"

        if self.match_mode == "exact":
            if "ref" in self.tsv_index and "alt" in self.tsv_index:  # ref and alt information is present
                ref = tsv_record[self.tsv_index["ref"]]
                alt = tsv_record[self.tsv_index["alt"]]
                if ref == "-" or alt == "-":  # addresses Mutation Annotation Format based tsv records

                    # TODO: This looks risky to be calling the MutationData constructor directly
                    ds_mut = MutationData(chrom, startPos, endPos, ref, alt, build)
                else:  # addresses tsv records where the input isn't a Mutation Annotation Format file
                    ds_mut = MutUtils.initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build)

                if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \
                    and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \
                    and int(mut.end) == int(ds_mut.end):
                    return True
            else:  # do not use ref and alt information
                if mut.chr == chrom and int(mut.start) == int(startPos) and int(mut.end) == int(endPos):
                    return True
        else:
           return TranscriptProviderUtils.test_overlap(int(mut.start), int(mut.end), int(startPos), int(endPos))
        return False
    def _is_matching(self, mut, tsv_record):

        chrom = tsv_record[self.tsv_index["chrom"]]
        startPos = tsv_record[self.tsv_index["start"]]
        endPos = tsv_record[self.tsv_index["end"]]
        build = "hg19"

        if self.match_mode == "exact":
            if "ref" in self.tsv_index and "alt" in self.tsv_index:  # ref and alt information is present
                ref = tsv_record[self.tsv_index["ref"]]
                alt = tsv_record[self.tsv_index["alt"]]
                if ref == "-" or alt == "-":  # addresses Mutation Annotation Format based tsv records

                    # TODO: This looks risky to be calling the MutationData constructor directly
                    ds_mut = MutationData(chrom, startPos, endPos, ref, alt,
                                          build)
                else:  # addresses tsv records where the input isn't a Mutation Annotation Format file
                    ds_mut = MutUtils.initializeMutFromAttributes(
                        chrom, startPos, endPos, ref, alt, build)

                if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \
                    and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \
                    and int(mut.end) == int(ds_mut.end):
                    return True
            else:  # do not use ref and alt information
                if mut.chr == chrom and int(
                        mut.start) == int(startPos) and int(
                            mut.end) == int(endPos):
                    return True
        else:
            return TranscriptProviderUtils.test_overlap(
                int(mut.start), int(mut.end), int(startPos), int(endPos))
        return False
    def _choose_best_effect_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
        """Pick the transcript on which the variant has the most detrimental effect.

        Every transcript is scored through the dictionary returned by
        TranscriptProviderUtils.retrieve_effect_dict(); the lowest score wins and
        classifications missing from the dictionary score 25.  On equal scores
        the transcript with the longer sequence replaces the current choice.

        :param list txs: list of Transcript
        :param str variant_type:
        :param str ref_allele:
        :param str alt_allele:
        :param str start:
        :param str end:
        :return Transcript: the chosen transcript (None when txs is empty)
        """
        classifier = VariantClassifier()
        effect_ranking = TranscriptProviderUtils.retrieve_effect_dict()

        # When both alleles are blank ("" or "-") the classifier is bypassed.
        is_blank_variant = (ref_allele == "" or ref_allele == "-") and (alt_allele == "" or alt_allele == "-")

        winner = None
        winner_score = 100000000  # lower score is more likely to get picked
        for candidate in txs:
            if is_blank_variant:
                vc = VariantClassification.SILENT
            else:
                classification = classifier.variant_classify(candidate, ref_allele, alt_allele, start, end,
                                                             variant_type)
                vc = classification.get_vc()
            score = effect_ranking.get(vc, 25)

            is_better = score < winner_score
            if is_better or (score == winner_score and len(winner.get_seq()) < len(candidate.get_seq())):
                winner_score = score
                winner = candidate

        return winner
    def test_convert_genomic_space_to_transcript_space(self):
        """Check the genomic --> transcript space conversion on the saccer ensembl test datasource."""
        config_dir = "testdata/ensembl/saccer/"
        ensembl_ds = DatasourceFactory.createDatasource(config_dir + "ensembl.config", config_dir)
        to_tx_space = TranscriptProviderUtils.convert_genomic_space_to_transcript_space

        # Transcript starts at 335.
        txs_chr_one = ensembl_ds.get_overlapping_transcripts("I", "350", "350")
        start, end = to_tx_space("350", "350", txs_chr_one[0])
        self.assertTrue(start == end)
        self.assertTrue(start == 16)

        # Transcript starts at 764697 (strand is '-').
        txs_chr_two = ensembl_ds.get_overlapping_transcripts("II", "764690", "764690")
        start, end = to_tx_space("764690", "764690", txs_chr_two[0])
        self.assertTrue(start == end)
        self.assertTrue(start == 7)

        # Same transcript, this time with an interval instead of a single position.
        start, end = to_tx_space("764680", "764690", txs_chr_two[0])
        self.assertTrue(start == (end - 10))
        self.assertTrue(start == 7)
    def test_seq(self, start, end, gt):
        """Check the sequence found at an arbitrary genomic interval of the MAPK1 test transcript.

        :param start: start of the interval in genomic space
        :param end: end of the interval in genomic space
        :param gt: sequence expected for that interval
        """
        tx = self.retrieve_test_transcript_MAPK1()

        exon_space_start, exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
        # The end position is inclusive, hence the +1 on the slice.
        seq = tx.get_seq()[exon_space_start:exon_space_end + 1]
        failure_msg = "Incorrect seq found guess,gt (%s, %s)" % (seq, gt)
        self.assertTrue(seq == gt, failure_msg)
 def test_mutate_reference_seqeunce(self, vt, start, end, ref, alt, start_exon_space, end_exon_space, mutated_seq_gt):
     """Check that a mutated sequence can be rendered for SNP, INS, and DEL.

     mutated_seq_gt is stranded and the test transcript is a "-" transcript, so
     the alternate allele is reverse complemented before it is applied.
     """
     tx = self.retrieve_test_transcript_MAPK1()
     observed_allele = Bio.Seq.reverse_complement(alt)

     # Reference window in exon space (end inclusive).
     reference_window = tx.get_seq()[start_exon_space : end_exon_space+1]
     mutated_allele = TranscriptProviderUtils.mutate_reference_sequence(
         reference_window, start_exon_space, start_exon_space, end_exon_space, observed_allele, vt)

     test_case = [vt, start, end, ref, alt, start_exon_space, end_exon_space, mutated_seq_gt]
     failure_msg = "No match (gt/guess)  %s/%s for %s." % (mutated_seq_gt, mutated_allele, str(test_case))
     self.assertTrue(mutated_seq_gt == mutated_allele, failure_msg)
Example #23
0
    def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space, cds_stop_genomic_space, strand):
        """Translate the coding part of a transcript sequence.

        The CDS bounds are converted from genomic space into positions along
        ``seq``; that slice is translated and one trailing '*' is dropped.

        :param exons: exons passed to the genomic --> feature space conversion
        :param seq: transcript sequence that gets sliced
        :param cds_start_genomic_space: CDS start, converted with int()
        :param cds_stop_genomic_space: CDS stop, converted with int()
        :param strand: strand passed to the conversion
        :return: translated sequence without a trailing '*'
        """
        cds_from, cds_to = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
            int(cds_start_genomic_space), int(cds_stop_genomic_space), exons, strand)

        coding_seq = seq[int(cds_from):int(cds_to)]
        protein = MutUtils.translate_sequence(coding_seq)

        ends_with_stop = len(protein) > 0 and protein[-1] == '*'
        return protein[:-1] if ends_with_stop else protein
 def _calculate_effect_score(tx, start, end, alt_allele, ref_allele, variant_type):
     """Look up the effect score of a variant on one transcript.

     The variant is classified (Silent when both alleles are blank, i.e. ""
     or "-") and the classification is looked up in the dictionary returned
     by TranscriptProviderUtils.retrieve_effect_dict(); unknown
     classifications score 25.
     """
     ranking = TranscriptProviderUtils.retrieve_effect_dict()
     classifier = VariantClassifier()

     ref_is_blank = ref_allele == "" or ref_allele == "-"
     alt_is_blank = alt_allele == "" or alt_allele == "-"
     if ref_is_blank and alt_is_blank:
         return ranking.get(VariantClassification.SILENT, 25)

     classification = classifier.variant_classify(tx, ref_allele, alt_allele, start, end, variant_type)
     return ranking.get(classification.get_vc(), 25)
 def _calculate_effect_score(tx, start, end, alt_allele, ref_allele, variant_type):
     """Return the effect score of the variant on transcript ``tx``.

     A variant whose alleles are both blank ("" or "-") is treated as Silent
     without running the classifier.  The score comes from
     TranscriptProviderUtils.retrieve_effect_dict() and falls back to 25
     for classifications that are not in that dictionary.
     """
     score_by_classification = TranscriptProviderUtils.retrieve_effect_dict()
     vcer = VariantClassifier()

     both_alleles_blank = (ref_allele == "" or ref_allele == "-") and (alt_allele == "" or alt_allele == "-")
     if both_alleles_blank:
         vc = VariantClassification.SILENT
     else:
         vc = vcer.variant_classify(tx, ref_allele, alt_allele, start, end, variant_type).get_vc()

     return score_by_classification.get(vc, 25)
Example #26
0
    def _determine_if_splice_site_overlap(self, start_genomic_space, end_genomic_space, tx, variant_type, dist=2):
        """Check whether a genomic interval overlaps a splice site window of the transcript.

        A splice site window reaches ``dist`` positions to either side of an
        exon boundary.  Internal exons are checked on both sides.  For the
        first exon in the list only the right side is checked on the "+" strand
        and only the left side on the "-" strand; the last exon is handled the
        other way around.  The transcript's outer ends (overlap of start and
        stop codon, stranded) are therefore never reported as Splice_Site.

        start <= end is expected.  INS events only count when they start inside
        the window, so for insertions the end is collapsed onto the start.

        :param start_genomic_space: int in genomic space
        :param end_genomic_space: int in genomic space
        :param tx: Transcript
        :param variant_type:
        :param dist: width of the window on each side of an exon boundary
        :return: on overlap (True, exon_i, is_right_overlap), where is_right_overlap
            is True for the boundary with the higher genomic position;
            without overlap (False, -1, None, False)
        """
        exons = tx.get_exons()
        strand = tx.get_strand()

        # If this is an insertion, we only want to count a splice site if it starts in the splice site regions
        if variant_type == VariantClassification.VT_INS:
            end_genomic_space = start_genomic_space

        last_exon_i = len(exons) - 1
        for i, exon in enumerate(exons):
            is_first = (i == 0)
            is_last = (i == last_exon_i)
            is_internal_exon = (i > 0) and (i < last_exon_i)

            check_left = is_internal_exon or (strand == "-" and is_first) or (strand == "+" and is_last)
            check_right = is_internal_exon or (strand == "+" and is_first) or (strand == "-" and is_last)

            if check_left:
                window_start = exon[0]-dist+1
                window_end = exon[0]+(dist-1)+1
                if TranscriptProviderUtils.test_overlap(start_genomic_space, end_genomic_space, window_start, window_end):
                    return True, i, False

            if check_right:
                window_start = exon[1]-(dist-1)
                window_end = exon[1] + dist
                if TranscriptProviderUtils.test_overlap(start_genomic_space, end_genomic_space, window_start, window_end):
                    return True, i, True

        # NOTE(review): this tuple has four members while the overlap returns above have three.
        # Kept as-is because callers are not visible here -- confirm the intended shape.
        return False, -1, None, False
Example #27
0
    def generate_transcript_change_from_tx(self, tx, variant_type, vc,
                                           start_genomic_space,
                                           end_genomic_space, ref_allele,
                                           alt_allele):
        """Render the transcript change string of a variant on a transcript.

        An empty string is returned for intronic splice sites and when the
        classification has no usable CDS start in exon space ("" or negative).

        :param tx: Transcript
        :param variant_type:
        :param vc: VariantClassification of the variant on tx
        :param start_genomic_space: start in genomic space, converted with int()
        :param end_genomic_space: end in genomic space, converted with int()
        :param ref_allele: reference allele, passed to self._get_stranded_alleles
        :param alt_allele: alternate allele, passed to self._get_stranded_alleles
        :return: rendered transcript change, or ""
        """
        is_intronic_splice_site = (vc.get_vc() == VariantClassification.SPLICE_SITE
                                   and vc.get_secondary_vc() == VariantClassification.INTRON)
        if is_intronic_splice_site:
            # Intronic splice sites are not rendered.
            return ""

        cds_start_in_exon_space = vc.get_cds_start_in_exon_space()
        if cds_start_in_exon_space == "" or cds_start_in_exon_space < 0:
            return ""

        exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            int(start_genomic_space), int(end_genomic_space), tx)

        # Positions on the "-" strand are shifted by one.
        strand_shift = 1 if tx.get_strand() == "-" else 0
        cds_offset = int(cds_start_in_exon_space)
        cds_start = exon_start - cds_offset + strand_shift
        cds_end = exon_end - cds_offset + strand_shift

        observed_stranded, reference_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
        return TranscriptProviderUtils.render_transcript_change(
            variant_type, vc.get_vc(), cds_start, cds_end,
            reference_stranded, observed_stranded, vc.get_secondary_vc())
Example #28
0
    def _determine_de_novo_old(self, vc, transcript_position_start, transcript_position_end, ref, alt, tx, variant_type):
        """Upgrade a 5'UTR classification to a de novo start when the variant creates an ATG.

        The input vc is returned untouched unless it is 5'UTR, ref differs from
        alt, and the mutated sequence around the variant contains 'ATG'.

        :param vc: Current variant classification.
        :param transcript_position_start:
        :param transcript_position_end:
        :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
        :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
        :param tx: transcript
        :param variant_type:
        :return: vc, 'De_novo_Start_InFrame' or 'De_novo_Start_OutOfFrame'
        """
        if not (vc == VariantClassification.FIVE_PRIME_UTR and ref != alt):
            return vc

        observed_allele_stranded = self._determine_stranded_allele(alt, tx.get_strand())
        reference_allele_stranded = self._determine_stranded_allele(ref, tx.get_strand())
        tx_seq = tx.get_seq()

        # Insertions are collapsed onto a single position; which one depends on the strand.
        if variant_type == VariantClassification.VT_INS:
            if tx.get_strand() == "-":
                transcript_position_start = transcript_position_end
            else:
                transcript_position_end = transcript_position_start

        # Look at the variant plus two positions on either side.
        window_start = transcript_position_start-2
        window_end = transcript_position_end+2
        # TODO: This may not work for "+" strand.  Need unit test.
        window_seq = tx_seq[window_start:window_end+1]

        mutated_window_seq = TranscriptProviderUtils.mutate_reference_sequence(
            window_seq, window_start, transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)

        # Check for Denovo
        atg_offset = mutated_window_seq.find('ATG')
        if atg_offset <= -1:
            return vc

        cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        atg_position = window_start + atg_offset + 1
        is_in_frame = (cds_start_in_exon_space - atg_position) % 3 == 0
        frameness = 'InFrame' if is_in_frame else 'OutOfFrame'
        return 'De_novo_Start_' + frameness
    def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space,
                                    cds_stop_genomic_space, strand):
        """Produce the protein sequence for the coding region of ``seq``.

        The genomic CDS start/stop are mapped onto positions along ``seq``, the
        slice in between is translated, and a single trailing '*' is removed
        from the translation.

        :param exons: exons passed to the genomic --> feature space conversion
        :param seq: transcript sequence
        :param cds_start_genomic_space: CDS start, converted with int()
        :param cds_stop_genomic_space: CDS stop, converted with int()
        :param strand: strand passed to the conversion
        :return: translation of the coding slice without a trailing '*'
        """
        genomic_start = int(cds_start_genomic_space)
        genomic_stop = int(cds_stop_genomic_space)
        slice_start, slice_stop = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
            genomic_start, genomic_stop, exons, strand)

        translated = MutUtils.translate_sequence(seq[int(slice_start):int(slice_stop)])
        if len(translated) == 0 or translated[-1] != '*':
            return translated
        return translated[:-1]
    def _extract_exon_info(self, position, tx):
        """
        Create basic information about the given position relative to the transcript.

        :param int position: in genomic space
        :param Transcript tx:
        :return tuple: all None when no closest exon is found, otherwise
            [0]: closest exon index of the position
            [1]: True when both the left and the right distance to that exon are positive
            [2]: whether the position overlaps the exon (left distance <= 0 and right distance >= 0)
            [3]: whether the transcript is on the "-" strand
        """
        closest_exon_i = TranscriptProviderUtils.determine_closest_exon(tx, position, position)
        if closest_exon_i is None:
            return None, None, None, None

        distances = TranscriptProviderUtils.determine_closest_distance_from_exon(position, position, closest_exon_i, tx)
        left_distance, right_distance = distances

        overlaps_exon = (left_distance <= 0) and (right_distance >= 0)
        both_distances_positive = (left_distance > 0) and (right_distance > 0)
        on_negative_strand = (tx.get_strand() == "-")
        return closest_exon_i, both_distances_positive, overlaps_exon, on_negative_strand
Example #31
0
    def __get_overlapping_records(self, records, start, end, type):
        if type == 'gene':
            st_key, en_key = 'start', 'end'
        elif type == 'transcript':
            st_key, en_key = 'footprint_start', 'footprint_end'

        out_records = list()
        for r in records:
            if TranscriptProviderUtils.test_overlap(start, end, r[st_key], r[en_key]):
                out_records.append(r)

        return out_records
Example #32
0
    def __get_overlapping_records(self, records, start, end, type):
        if type == "gene":
            st_key, en_key = "start", "end"
        elif type == "transcript":
            st_key, en_key = "footprint_start", "footprint_end"

        out_records = list()
        for r in records:
            if TranscriptProviderUtils.test_overlap(start, end, r[st_key], r[en_key]):
                out_records.append(r)

        return out_records
    def _extract_exon_info(self, position, tx):
        """
        Describe where a genomic position lies relative to the closest exon of a transcript.

        :param int position: in genomic space
        :param Transcript tx:
        :return tuple: four None values when no closest exon is found, otherwise
            [0]: closest exon index of the position
            [1]: True when the left and the right distance to that exon are both positive
            [2]: whether the position overlaps the exon (left distance <= 0 and right distance >= 0)
            [3]: whether the transcript is on the "-" strand
        """
        exon_index = TranscriptProviderUtils.determine_closest_exon(tx, position, position)
        if exon_index is None:
            return (None,) * 4

        left_distance, right_distance = TranscriptProviderUtils.determine_closest_distance_from_exon(
            position, position, exon_index, tx)

        is_in_exon = left_distance <= 0 and right_distance >= 0
        is_diff_is_positive = left_distance > 0 and right_distance > 0
        return exon_index, is_diff_is_positive, is_in_exon, tx.get_strand() == "-"
Example #34
0
    def __get_overlapping_records(self, records, start, end, type):
        if type == 'gene':
            st_key, en_key = 'start', 'end'
        elif type == 'transcript':
            st_key, en_key = 'footprint_start', 'footprint_end'

        out_records = list()
        for r in records:
            if TranscriptProviderUtils.test_overlap(start, end, r[st_key],
                                                    r[en_key]):
                out_records.append(r)

        return out_records
Example #35
0
    def annotate_mutation(self, mutation, upstream_padding=3000, downstream_padding=0):
        """Annotate a single mutation by running it through the gaf_annotation steps.

        The 'variant_type' annotation is created first; the mutation is then
        passed through the gaf_annotation pipeline and
        self._annotateMutationFromTranscripts, and the first item of the
        resulting iterator is returned.

        :param mutation: mutation exposing ref_allele, alt_allele and createAnnotation
        :param upstream_padding: not used in this method
        :param downstream_padding: not used in this method
        :return: the annotated mutation
        """
        variant_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
        mutation.createAnnotation('variant_type', variant_type, self.title)

        data = [mutation]
        data = gaf_annotation.find_mut_in_gaf(data, self)
        data = gaf_annotation.identify_best_effect_transcript(data, self)
        data = gaf_annotation.identify_best_canonical_transcript(data, self)
        data = gaf_annotation.correct_transcript_coordinates(data, self)
        data = gaf_annotation.infer_output_fields(data, self)

        data = self._annotateMutationFromTranscripts(data)

        # The builtin next() works for Python 2 and Python 3 iterators alike,
        # unlike the Python-2-only data.next().
        annotated_mutation = next(data)
        return annotated_mutation
Example #36
0
    def generate_codon_change_from_vc(self, t, start, end, vc):
        """Render the codon change string for a classified variant.

        Intronic splice sites are rendered from the distance to the exon and the
        exon index.  For all other variants an empty string is returned when the
        classification has no reference codon position.

        :param t: (Transcript)
        :param start: (int)
        :param end:  (int)
        :param vc:  (VariantClassification)

        :return: rendered codon change, or ""
        """
        exon_i = vc.get_exon_i()
        dist_from_exon = self._get_splice_site_coordinates(t, start, end, exon_i)

        is_intronic_splice_site = (vc.get_vc() == VariantClassification.SPLICE_SITE
                                   and vc.get_secondary_vc() == VariantClassification.INTRON)
        if is_intronic_splice_site:
            return TranscriptProviderUtils.render_intronic_splice_site_codon_change(dist_from_exon, exon_i)

        ref_codon_start = vc.get_ref_codon_start_in_exon()
        ref_codon_end = vc.get_ref_codon_end_in_exon()
        if ref_codon_start == "" or ref_codon_end == "":
            return ""

        # Move the codon bounds from exon space into CDS space.
        cds_offset = int(vc.get_cds_start_in_exon_space())
        codon_start_cds_space = int(ref_codon_start) - cds_offset + 1
        codon_end_cds_space = int(ref_codon_end) - cds_offset + 1

        return TranscriptProviderUtils.render_codon_change(
            vc.get_vt(), vc.get_vc(), int(codon_start_cds_space), int(codon_end_cds_space),
            vc.get_ref_codon(), vc.get_alt_codon(), dist_from_exon, exon_i, vc.get_secondary_vc())
Example #37
0
    def generate_protein_change_from_vc(self, vc):
        """Render the protein change string for a classified variant.

        :param vc: VariantClassification
        :return: rendered protein change, or "" when the classification lacks a
            reference protein start or end
        """
        first_aa_position = vc.get_ref_protein_start()
        last_aa_position = vc.get_ref_protein_end()
        has_position = not (first_aa_position == "" or last_aa_position == "")
        if not has_position:
            return ""

        ref_aa = vc.get_ref_aa()
        alt_aa = vc.get_alt_aa()
        return TranscriptProviderUtils.render_protein_change(
            vc.get_vt(), vc.get_vc(), int(first_aa_position), int(last_aa_position),
            ref_aa, alt_aa, vc.get_secondary_vc())
Example #38
0
    def _get_splice_site_coordinates(self, t, start, end, exon_i):
        """Returns the signed distance from the exon with index exon_i.

        The closer exon side decides the sign: at most -1 when the left side is
        closer, at least 1 when the right side is closer, and 0 when both sides
        are equally far away.  The sign is flipped for "-" strand transcripts.
        """
        left_diff, right_diff = TranscriptProviderUtils.determine_closest_distance_from_exon(start, end, exon_i,  t)

        left_gap = abs(left_diff)
        right_gap = abs(right_diff)
        if left_gap < right_gap:
            # Closer to the left side: never report anything above -1.
            dist_from_exon = min(left_diff * -1, -1)
        elif right_gap < left_gap:
            # Closer to the right side: never report anything below 1.
            dist_from_exon = max(right_diff * -1, 1)
        else:
            dist_from_exon = 0

        if t.get_strand() == "-":
            dist_from_exon *= -1
        return dist_from_exon
Example #39
0
    def generate_protein_change_from_vc(self, vc):
        """Build the protein change string from a variant classification.

        :param vc: VariantClassification
        :return: rendered protein change; "" when the reference protein start
            or end of the classification is blank
        """
        start_in_protein = vc.get_ref_protein_start()
        end_in_protein = vc.get_ref_protein_end()
        if start_in_protein == "" or end_in_protein == "":
            return ""

        reference_aa, alternate_aa = vc.get_ref_aa(), vc.get_alt_aa()
        rendered = TranscriptProviderUtils.render_protein_change(
            vc.get_vt(),
            vc.get_vc(),
            int(start_in_protein),
            int(end_in_protein),
            reference_aa,
            alternate_aa,
            vc.get_secondary_vc())
        return rendered
Example #40
0
    def _get_splice_site_coordinates(self, t, start, end, exon_i):
        """Returns distance from exon.

        The result is negative (at most -1) when the left side of the exon is
        closer, positive (at least 1) when the right side is closer and 0 on a
        tie; for a "-" strand transcript the sign is inverted.
        """
        diffs = TranscriptProviderUtils.determine_closest_distance_from_exon(
            start, end, exon_i, t)
        left_diff, right_diff = diffs

        dist_from_exon = 0
        if abs(left_diff) < abs(right_diff):
            dist_from_exon = left_diff * -1
            if dist_from_exon > -1:
                dist_from_exon = -1
        elif abs(right_diff) < abs(left_diff):
            dist_from_exon = right_diff * -1
            if dist_from_exon < 1:
                dist_from_exon = 1

        is_negative_strand = t.get_strand() == "-"
        return dist_from_exon * -1 if is_negative_strand else dist_from_exon
    def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
        """Test genomic --> exon transform on real data.

        :param loc: (start, end) in genomic space
        :param gt_d: expected start position in exon space
        """
        gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
        gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
        base_output_filename = "out/test_variant_classification"

        # Remove indices of earlier runs so they are rebuilt from scratch.
        shutil.rmtree(base_output_filename + ".transcript.idx", ignore_errors=True)
        shutil.rmtree(base_output_filename + ".transcript_by_gene.idx", ignore_errors=True)
        shutil.rmtree(base_output_filename + ".transcript_by_gp_bin.idx", ignore_errors=True)

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
        ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")
        tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")

        start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])
        loc_length = (int(loc[1]) - int(loc[0]))
        self.assertTrue((end - start) == loc_length, str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))
        # Report the expected value (gt_d) in the message; it used to print ``end`` in its place.
        self.assertTrue(start == gt_d, "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")" + "   exons: " + str(tx[0].get_exons()))
Example #42
0
    def test_querying_transcripts_by_region(self):
        """Test web api backend call /transcripts/.... """
        datasources = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)
        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
        # None of these values are validated.
        for tx in txs:
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            contig = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            gene_id = tx.get_gene_id()

            genomic_coords = []
            for exon in tx.get_exons():
                genomic_coords.append([exon[0], exon[1]])

            # One single-element list per exon, holding its bounds in exon space.
            transcript_coords = []
            for exon in tx.get_exons():
                exon_space_bounds = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                    exon[0] + 1, exon[1], tx)
                transcript_coords.append([exon_space_bounds])
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
Example #43
0
    def test_querying_transcripts_by_region(self):
        """Test web api backend call /transcripts/.... """
        db_dir = self._determine_db_dir()
        annotator = Annotator()
        for ds in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
            annotator.addDatasource(ds)
        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
        # None of these values are validated.
        for tx in txs:
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            contig_name = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            ensembl_gene_id = tx.get_gene_id()
            genomic_coords = [list(exon[0:2]) if False else [exon[0], exon[1]] for exon in tx.get_exons()]

            # Exon bounds in exon space, each wrapped in its own single-element list.
            transcript_coords = []
            for exon in tx.get_exons():
                bounds = TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0] + 1, exon[1], tx)
                transcript_coords.append([bounds])
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
Example #44
0
    def annotate_mutation(self,
                          mutation,
                          upstream_padding=3000,
                          downstream_padding=0):
        """Annotate one mutation through the gaf_annotation pipeline.

        After the 'variant_type' annotation has been created, the mutation is
        passed through the gaf_annotation steps and
        self._annotateMutationFromTranscripts; the first item of the resulting
        iterator is returned.

        :param mutation: mutation exposing ref_allele, alt_allele and createAnnotation
        :param upstream_padding: not used in this method
        :param downstream_padding: not used in this method
        :return: the annotated mutation
        """
        mutation.createAnnotation(
            'variant_type',
            TranscriptProviderUtils.infer_variant_type(mutation.ref_allele,
                                                       mutation.alt_allele),
            self.title)
        data = [mutation]
        data = gaf_annotation.find_mut_in_gaf(data, self)
        data = gaf_annotation.identify_best_effect_transcript(data, self)
        data = gaf_annotation.identify_best_canonical_transcript(data, self)
        data = gaf_annotation.correct_transcript_coordinates(data, self)
        data = gaf_annotation.infer_output_fields(data, self)

        data = self._annotateMutationFromTranscripts(data)

        # data.next() only exists on Python 2 iterators; the builtin next()
        # works on Python 2.6+ and Python 3.
        annotated_mutation = next(data)
        return annotated_mutation
 def _get_overlapping_transcript_records(self, records, start, end):
     return [
         r for r in records if TranscriptProviderUtils.test_overlap(
             int(start), int(end), r.get_start(), r.get_end())
     ]
Example #46
0
    def variant_classify(self, tx, ref_allele, alt_allele, start, end, variant_type, dist=2):
        """Perform classifications.

        Everything handled in genomic space

        *RNA*
        x'UTR
        Splice_Site (Intron)
        Intron
        Splice_Site (Exon)
        {Missense, Silent}
        {Nonsense, Silent}
        {Nonstop, Silent}
        IGR
        x'Flank
        De_novo_Start

        :param tx: Transcript the variant is classified against
        :param ref_allele: reference allele ("-" is treated as "")
        :param alt_allele: alternate allele ("-" is treated as "")
        :param start: start in genomic space (converted with int())
        :param end: end in genomic space (converted with int())
        :param variant_type:
        :param dist: splice site window size passed to _determine_if_splice_site_overlap
        :return: VariantClassification
        :raises ValueError: when none of the classification branches applies
        """
        gene_type = tx.get_gene_type()
        if gene_type != "protein_coding":
            # Non-coding transcripts are only split into lincRNA and generic RNA.
            if gene_type == VariantClassification.LINCRNA:
                return VariantClassification(VariantClassification.LINCRNA, variant_type, tx.get_transcript_id())
            else:
                return VariantClassification(VariantClassification.RNA, variant_type, tx.get_transcript_id())

        if ref_allele == "-":
            ref_allele = ""
        if alt_allele == "-":
            alt_allele = ""

        s = int(start)
        e = int(end)
        is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(s, e, tx, variant_type)

        is_splice_site_tuple = self._determine_if_splice_site_overlap(s, e, tx, variant_type, dist)
        is_splice_site = is_splice_site_tuple[0]

        is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(start, end, tx, variant_type)

        if not is_exon_overlap and not is_beyond_exons:
            exon_i = TranscriptProviderUtils.determine_closest_exon(tx, int(start), int(end))
            if is_splice_site:
                # Intron Splice Site
                return VariantClassification(VariantClassification.SPLICE_SITE, variant_type, tx.get_transcript_id(), vc_secondary=VariantClassification.INTRON, exon_i=exon_i)
            else:
                return VariantClassification(VariantClassification.INTRON, variant_type, tx.get_transcript_id(), exon_i=exon_i)

        if not is_exon_overlap and is_beyond_exons:
            if is_flank:
                # Flanks
                if side.startswith("3"):
                    return VariantClassification(VariantClassification.THREE_PRIME_PRIME_FLANK, variant_type, transcript_id=tx.get_transcript_id())
                else:
                    return VariantClassification(VariantClassification.FIVE_PRIME_PRIME_FLANK, variant_type, transcript_id=tx.get_transcript_id())

            else:
                # IGR
                return VariantClassification(VariantClassification.IGR, variant_type)

        is_start_codon_overlap = self._determine_codon_overlap(s, e, tx.get_start_codon(), variant_type)
        is_stop_codon_overlap = self._determine_codon_overlap(s, e, tx.get_stop_codon(), variant_type)

        # Variant types ending in "NP" are not given a Start_Codon_/Stop_Codon_ classification here.
        if is_start_codon_overlap and not variant_type.endswith("NP"):
            return VariantClassification('Start_Codon_' + variant_type.capitalize(), variant_type, transcript_id=tx.get_transcript_id())
        if is_stop_codon_overlap and not variant_type.endswith("NP"):
            return VariantClassification('Stop_Codon_' + variant_type.capitalize(), variant_type, transcript_id=tx.get_transcript_id())

        is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
        if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
            # UTR
            if side.startswith("3"):
                vc_tmp = VariantClassification.THREE_PRIME_UTR
            else:
                vc_tmp = VariantClassification.FIVE_PRIME_UTR
            transcript_position_exon_space_start, transcript_position_exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
            vc = self._determine_de_novo(vc_tmp, transcript_position_exon_space_start, ref_allele, alt_allele, tx, variant_type)
            return VariantClassification(vc, variant_type, transcript_id=tx.get_transcript_id())

        # We have a clean overlap in the CDS.  Includes start codon or stop codon.
        if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
            is_frameshift_indel = self.is_frameshift_indel(variant_type, int(start), int(end), alt_allele)
            return self._determine_vc_for_cds_overlap(start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon_overlap)

        # The accessor is get_transcript_id(); the misspelled name used here before
        # would have raised AttributeError instead of this ValueError.
        raise ValueError("Could not determine variant classification:  " + tx.get_transcript_id() + " " + str([ref_allele, alt_allele, start, end]))
    def test_determine_closest_distance_from_exon_in_exon(self):
        """Check the signs of the distances reported for an interval lying inside exon 1.

        For a position inside the exon, the left distance is expected to be
        negative and the right distance positive.
        """
        mapk1_tx = self.retrieve_test_transcript_MAPK1()

        # Interval located right in exon 1 of the test transcript.
        distances = TranscriptProviderUtils.determine_closest_distance_from_exon(
            22162000, 22162005, 1, mapk1_tx)
        left_diff, right_diff = distances

        signs_as_expected = left_diff < 0 and right_diff > 0
        self.assertTrue(
            signs_as_expected,
            "left distance should be negative while right distance should be positive.")
Example #48
0
    def variant_classify(self,
                         tx,
                         ref_allele,
                         alt_allele,
                         start,
                         end,
                         variant_type,
                         dist=2):
        """Determine the variant classification of a variant against one transcript.

        Everything is handled in genomic space.  The possible outcomes are:

        *RNA*
        x'UTR
        Splice_Site (Intron)
        Intron
        Splice_Site (Exon)
        {Missense, Silent}
        {Nonsense, Silent}
        {Nonstop, Silent}
        IGR
        x'Flank
        De_novo_Start

        :param tx: transcript to classify against.  Must provide get_gene_type(),
            get_transcript_id(), get_start_codon() and get_stop_codon().
        :param ref_allele: reference allele; "-" is treated as the empty string.
        :param alt_allele: alternate allele; "-" is treated as the empty string.
        :param start: genomic start position (anything accepted by int()).
        :param end: genomic end position (anything accepted by int()).
        :param variant_type: variant type string.  Types ending in "NP" are not
            given the dedicated Start_Codon_*/Stop_Codon_* classifications.
        :param dist: forwarded to _determine_if_splice_site_overlap; presumably the
            distance from an exon edge that still counts as a splice site
            (TODO confirm against that helper).
        :return: a VariantClassification instance.
        :raises ValueError: if no classification rule matched.
        """
        transcript_id = tx.get_transcript_id()

        # Transcripts that are not protein coding are never examined in detail.
        gene_type = tx.get_gene_type()
        if gene_type != "protein_coding":
            if gene_type == VariantClassification.LINCRNA:
                return VariantClassification(VariantClassification.LINCRNA,
                                             variant_type,
                                             transcript_id)
            return VariantClassification(VariantClassification.RNA,
                                         variant_type,
                                         transcript_id)

        # "-" denotes an empty allele.
        if ref_allele == "-":
            ref_allele = ""
        if alt_allele == "-":
            alt_allele = ""

        s = int(start)
        e = int(end)
        is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(
            s, e, tx, variant_type)

        # Only the first element of the returned tuple (the flag) is needed here.
        is_splice_site = self._determine_if_splice_site_overlap(
            s, e, tx, variant_type, dist)[0]

        is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(
            start, end, tx, variant_type)

        if not is_exon_overlap and not is_beyond_exons:
            # Not in an exon, yet not beyond the exons either: intronic.
            exon_i = TranscriptProviderUtils.determine_closest_exon(tx, s, e)
            if is_splice_site:
                # Intron Splice Site
                return VariantClassification(
                    VariantClassification.SPLICE_SITE,
                    variant_type,
                    transcript_id,
                    vc_secondary=VariantClassification.INTRON,
                    exon_i=exon_i)
            return VariantClassification(VariantClassification.INTRON,
                                         variant_type,
                                         transcript_id,
                                         exon_i=exon_i)

        if not is_exon_overlap and is_beyond_exons:
            if not is_flank:
                # IGR (reported without a transcript id)
                return VariantClassification(VariantClassification.IGR,
                                             variant_type)
            # Flanks
            if side.startswith("3"):
                flank_vc = VariantClassification.THREE_PRIME_PRIME_FLANK
            else:
                flank_vc = VariantClassification.FIVE_PRIME_PRIME_FLANK
            return VariantClassification(flank_vc,
                                         variant_type,
                                         transcript_id=transcript_id)

        is_start_codon_overlap = self._determine_codon_overlap(
            s, e, tx.get_start_codon(), variant_type)
        is_stop_codon_overlap = self._determine_codon_overlap(
            s, e, tx.get_stop_codon(), variant_type)

        # Variant types not ending in "NP" that touch the start/stop codon get a
        # dedicated classification; xNPs fall through to the CDS logic below.
        is_xnp = variant_type.endswith("NP")
        if is_start_codon_overlap and not is_xnp:
            return VariantClassification(
                'Start_Codon_' + variant_type.capitalize(),
                variant_type,
                transcript_id=transcript_id)
        if is_stop_codon_overlap and not is_xnp:
            return VariantClassification(
                'Stop_Codon_' + variant_type.capitalize(),
                variant_type,
                transcript_id=transcript_id)

        is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
        if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
            # UTR
            if side.startswith("3"):
                vc_tmp = VariantClassification.THREE_PRIME_UTR
            else:
                vc_tmp = VariantClassification.FIVE_PRIME_UTR
            transcript_position_exon_space_start, _ = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                start, end, tx)
            # The helper may replace the UTR classification with a de novo start call.
            vc = self._determine_de_novo(vc_tmp,
                                         transcript_position_exon_space_start,
                                         ref_allele, alt_allele, tx,
                                         variant_type)
            return VariantClassification(vc,
                                         variant_type,
                                         transcript_id=transcript_id)

        # We have a clean overlap in the CDS.  Includes start codon or stop codon.
        if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
            is_frameshift_indel = self.is_frameshift_indel(
                variant_type, s, e, alt_allele)
            return self._determine_vc_for_cds_overlap(
                start, end, ref_allele, alt_allele, is_frameshift_indel,
                is_splice_site, tx, variant_type, is_start_codon_overlap)

        raise ValueError("Could not determine variant classification:  " +
                         str(transcript_id) + " " +
                         str([ref_allele, alt_allele, start, end]))
 def test_render_protein_change(self, variant_type, variant_classification, secondary_vc, prot_position_start, prot_position_end, ref_prot_allele, alt_prot_allele, strand, gt):
     """Render a protein change from already-prepared parameters and compare it with the ground truth ``gt``."""
     rendered = TranscriptProviderUtils.render_protein_change(
         variant_type,
         variant_classification,
         prot_position_start,
         prot_position_end,
         ref_prot_allele,
         alt_prot_allele,
         secondary_vc)
     is_match = rendered == gt
     failure_msg = "Incorrect guess gt <> guess: %s <> %s" % (gt, rendered)
     self.assertTrue(is_match, failure_msg)
 def test_render_transcript_change(self, variant_type, vc, exon_position_start, exon_position_end, ref_allele_stranded, alt_allele_stranded, gt, secondary_vc):
     """Render a transcript change from already-prepared parameters and compare it with the ground truth ``gt``."""
     rendered = TranscriptProviderUtils.render_transcript_change(
         variant_type,
         vc,
         exon_position_start,
         exon_position_end,
         ref_allele_stranded,
         alt_allele_stranded,
         secondary_vc)
     is_match = rendered == gt
     failure_msg = "Incorrect guess gt <> guess: %s <> %s" % (gt, rendered)
     self.assertTrue(is_match, failure_msg)
Example #51
0
    def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele,
                                      is_frameshift_indel, is_splice_site, tx,
                                      variant_type, is_start_codon):
        """Build the VariantClassification for a variant that overlaps the CDS.

        Note: This method can also handle start and stop codons.

        The affected codon(s) are located in exon (transcript) space, the
        reference and mutated codon sequences are built and translated, and the
        resulting amino acids are handed to infer_variant_classification.

        :param start: genomic start position of the variant.
        :param end: genomic end position of the variant.
        :param ref_allele: reference allele as given for the variant.
        :param alt_allele: alternate allele as given for the variant.
        :param is_frameshift_indel: forwarded to infer_variant_classification.
        :param is_splice_site: forwarded to infer_variant_classification.
        :param tx: transcript; must provide get_strand(), get_seq(),
            get_protein_seq(), get_contig() and get_transcript_id().
        :param variant_type: variant type string (e.g. "DEL", "INS").
        :param is_start_codon: forwarded to infer_variant_classification.
        :return: a VariantClassification carrying codon, amino acid and position
            details for the variant.
        """
        logger = logging.getLogger(__name__)

        observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(
            ref_allele, alt_allele, tx)
        transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            start, end, tx)

        # Positive-strand, non-insertion positions are shifted down by one
        # (presumably to obtain 0-based indices into the transcript sequence
        # -- TODO confirm).
        if tx.get_strand() == "+" and variant_type != VariantClassification.VT_INS:
            transcript_position_start -= 1
            transcript_position_end -= 1

        transcript_seq = tx.get_seq()
        protein_seq = tx.get_protein_seq()
        # The CDS start is reused below for the returned classification.
        cds_start, _cds_stop = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
            transcript_position_start, transcript_position_end, cds_start)

        # If the transcript sequence disagrees with the stated reference allele
        # (insertions excepted), patch the reference allele into a copy of the
        # transcript sequence and remember that we did so.
        transcript_ref_slice = transcript_seq[
            transcript_position_start:transcript_position_end + 1]
        ref_tx_seq_has_been_changed = (
            transcript_ref_slice != reference_allele_stranded
            and variant_type != VariantClassification.VT_INS)
        new_ref_transcript_seq = transcript_seq
        if ref_tx_seq_has_been_changed:
            patched_seq = list(transcript_seq)
            patched_seq[transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
            new_ref_transcript_seq = ''.join(patched_seq)

        cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
            protein_position_start, protein_position_end, cds_start)

        raw_reference_codons = new_ref_transcript_seq[
            cds_codon_start:cds_codon_end + 1].lower()
        if variant_type == "DEL":
            reference_codon_seq = raw_reference_codons
        else:
            reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
                raw_reference_codons, cds_codon_start,
                transcript_position_start, transcript_position_end,
                reference_allele_stranded, variant_type)

        # Insertions on the negative strand use a codon start shifted by one.
        if variant_type == "INS" and tx.get_strand() == "-":
            codon_start_for_mutation = cds_codon_start - 1
        else:
            codon_start_for_mutation = cds_codon_start
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), codon_start_for_mutation,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)

        observed_aa = MutUtils.translate_sequence(mutated_codon_seq)
        if ref_tx_seq_has_been_changed:
            # The stored protein sequence no longer matches the patched
            # reference, so translate the reference codons instead.
            reference_aa = MutUtils.translate_sequence(reference_codon_seq)
        else:
            reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

        if variant_type != VariantClassification.VT_SNP:
            try:
                reference_aa, observed_aa, protein_position_start, protein_position_end = \
                    self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                        protein_position_end, reference_aa, observed_aa)
            except InvalidVariantException as ive:
                # Best effort: report the problem and continue with the
                # unadjusted values rather than failing the annotation.
                logger.error(
                    "Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s",
                    tx.get_contig(), start, end, ref_allele, alt_allele,
                    variant_type)
                logger.error(str(ive))
                logger.warning(
                    "Above error may not have exact start and end positions if this is a VCF input."
                )
                logger.warning(
                    "Variant type is likely incorrect.  This can happen with some GATK VCFs"
                )
                # Logs the result of the xNP validity check for this variant.
                logger.warning(
                    TranscriptProviderUtils.is_valid_xNP(
                        variant_type, ref_allele, alt_allele))
                logger.warning(
                    "The protein_change annotation may not be properly rendered."
                )

        vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
            variant_type,
            reference_aa,
            observed_aa,
            ref_allele,
            alt_allele,
            is_frameshift_indel=is_frameshift_indel,
            is_splice_site=is_splice_site,
            is_start_codon=is_start_codon)

        exon_i = TranscriptProviderUtils.determine_exon_index(
            int(start), int(end), tx, variant_type)
        final_vc = VariantClassification(
            vc_tmp,
            variant_type,
            transcript_id=tx.get_transcript_id(),
            alt_codon=mutated_codon_seq,
            ref_codon=reference_codon_seq,
            ref_aa=reference_aa,
            ref_protein_start=protein_position_start,
            ref_protein_end=protein_position_end,
            alt_aa=observed_aa,
            alt_codon_start_in_exon=cds_codon_start,
            alt_codon_end_in_exon=cds_codon_end,
            ref_codon_start_in_exon=cds_codon_start,
            ref_codon_end_in_exon=cds_codon_end,
            cds_start_in_exon_space=cds_start,
            ref_allele_stranded=reference_allele_stranded,
            alt_allele_stranded=observed_allele_stranded,
            exon_i=exon_i,
            vc_secondary=vc_tmp_secondary)
        return final_vc
    def test_transform_to_feature_space(self, exons, s, gt, strand):
        """Transform genomic position ``s`` into exon coordinates for the given strand and compare with ``gt``."""
        transformed = TranscriptProviderUtils._transform_to_feature_space(exons, s, strand)
        is_match = transformed == gt

        failure_msg = "Did not transform genomic to exon space properly: " + str(exons)
        failure_msg = failure_msg + "   pos: " + str(s)
        failure_msg = failure_msg + "  strand: " + strand
        failure_msg = failure_msg + "  guess/gt: " + str(transformed) + "/" + str(gt)
        self.assertTrue(is_match, failure_msg)