Exemple #1
0
    def test_heterozygous_variants(self):
        """
        Create all transcript variants for a variant set that contains
        heterozygous variants.

        The fixtures ``var_10``, ``var_11`` and ``var_12`` are defined outside
        this method. According to the original notes they are a heterozygous
        deletion, a homozygous insertion and a second heterozygous deletion,
        so the insertion shows up in every result while each deletion is
        optional: four combinations per reference transcript, eight
        transcripts overall.

        Expected sequences (exactly the ones asserted below):

        AAAAACCCCCGGGGG        (reference 1)
        AAATTTCCGGGG           (DEL,INS,DEL)
        AAATTTCCCCCGGGGG       (DEL,INS)
        AAAAATTTCCGGGG         (INS,DEL)
        AAAAATTTCCCCCGGGGG     (INS)

        GGGGGCCCCCAAAAA        (reference 2)
        GGGTTTCCAAAA           (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA       (DEL,INS)
        GGGGGTTTCCAAAA         (INS,DEL)
        GGGGGTTTCCCCCAAAAA     (INS)
        """
        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]

        generated = generate_transcripts_from_variants(dummy_vars, dummy_db)
        sequences = [str(transcript) for transcript in generated]

        self.assertEqual(len(sequences), 8)

        expected_sequences = (
            # variants applied to the first reference transcript
            "AAATTTCCGGGG",
            "AAATTTCCCCCGGGGG",
            "AAAAATTTCCGGGG",
            "AAAAATTTCCCCCGGGGG",
            # variants applied to the second reference transcript
            "GGGTTTCCAAAA",
            "GGGTTTCCCCCAAAAA",
            # The original asserted "GGGTTTCCAAAA" twice and never checked
            # the (INS,DEL) combination of the second transcript.
            "GGGGGTTTCCAAAA",
            "GGGGGTTTCCCCCAAAAA",
        )
        for expected in expected_sequences:
            # assertIn reports the missing sequence and the actual list.
            self.assertIn(expected, sequences)
Exemple #2
0
    def test3_protein_from_variants(self):
        """
        Check the transcripts and proteins generated from three variants.

        Steps:
        1. Generate transcripts from ``var_10``, ``var_11`` and ``var_12``
           and check that the gene id, transcript id, variants and sequence
           of each transcript are filled in.
        2. Translate every transcript into a protein (eight are expected)
           and check the same fields, plus the link back to the originating
           transcript.
        3. Call the peptide generator on the proteins. The returned peptides
           are not inspected by this test.
        """
        def check_identifiers(entity):
            # Shared by transcripts and proteins: the gene id is fixed and
            # the transcript id must look like "<tsc_1|tsc_2>:FRED2_<digit>".
            self.assertEqual(entity.gene_id, "gene_1")
            parts = entity.transcript_id.split(":FRED2_")
            self.assertEqual(len(parts), 2)
            self.assertIn(parts[0], ("tsc_1", "tsc_2"))
            self.assertEqual(len(parts[1]), 1)
            # isdigit must be *called*; the bare bound method is always truthy.
            self.assertTrue(parts[1].isdigit())

        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]

        transcripts = list(
            generate_transcripts_from_variants(dummy_vars, dummy_db))

        proteins = []
        for trans in transcripts:
            check_identifiers(trans)

            # check var:
            self.assertIsNotNone(trans.vars)
            self.assertGreater(len(trans.vars), 0)

            # check sequence: compare the length, not the string itself
            # (str > int is always True in Python 2).
            self.assertGreater(len(str(trans)), 5)

            # Translate; transcripts rejected with ValueError are skipped
            # (the original comment attributes this to invalid sequence
            # lengths).
            try:
                protein = next(generate_proteins_from_transcripts(trans))
            except ValueError:
                continue
            proteins.append(protein)

        self.assertEqual(len(proteins), 8)

        for prot in proteins:
            check_identifiers(prot)

            # The protein must point back to its transcript and carry the
            # same number of distinct variants.
            orig = prot.orig_transcript
            self.assertEqual(prot.transcript_id, orig.transcript_id)
            protein_vars = set(
                var for var_list in prot.vars.itervalues() for var in var_list)
            self.assertEqual(len(protein_vars), len(orig.vars))

            # check sequence:
            self.assertGreater(len(str(prot)), 2)

        # Only checks that the call itself does not raise; the result is
        # not inspected here.
        generate_peptides_from_protein(proteins, 2)
Exemple #3
0
    def test4_peptides_from_variants(self):
        """
        Check the 4-mer peptides generated from two heterozygous variants.

        Ref transcript: AAAAACCCCCGGGGG
        ref protein:    KNPRG
        ref peps(4):    KNPR, NPRG

        variant1: heterozygous, fs+1 in first aa
        variant2: heterozygous, insertion +2 in last aa

        transcript with both variants: TKPPGA
            peps(4): TKPP, KPPG, PPGA

        transcript without variants: KNPRG
            peps(4): KNPR, NPRG

        Expected peptide/variant mapping (from the original notes):
        -----------------------------------------------------------
        PEPTIDE: PPGA
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(15CC)
                 Variant(1C)
        PEPTIDE: KPPG
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)
        PEPTIDE: TKPP
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)

        PEPTIDE: KNPR
            TRANSCRIPT: tsc_1:FRED2_0
        PEPTIDE: NPRG
            TRANSCRIPT: tsc_1:FRED2_0

        The test asserts the exact set of peptide sequences, and that every
        variant a peptide reports for a protein is also a variant of that
        protein. It does not assert which specific variants each peptide
        carries.
        """
        peps_reference = ["KNPR", "NPRG"]
        peps_variant = ["PPGA", "KPPG", "TKPP"]
        expected = peps_reference + peps_variant

        dummy_db = DummyAdapter()
        dummy_vars = [var_13, var_14]

        transcripts = list(
            generate_transcripts_from_variants(dummy_vars, dummy_db))

        proteins = []
        for trans in transcripts:
            # Transcripts rejected with ValueError are skipped (the original
            # comment attributes this to invalid sequence lengths).
            try:
                protein = next(generate_proteins_from_transcripts(trans))
            except ValueError:
                continue
            proteins.append(protein)

        # Materialise once: the peptides are iterated several times below.
        peptides = list(generate_peptides_from_protein(proteins, 4))
        sequences = [str(pep) for pep in peptides]

        # Exactly the expected peptides: none missing, none extra, no
        # duplicates.
        self.assertEqual(sorted(sequences), sorted(expected))

        for prot in proteins:
            protein_vars = [str(var)
                            for var_list in prot.vars.itervalues()
                            for var in var_list]
            for pep in peptides:
                # Lookups that fail with ValueError are skipped for this
                # peptide/protein pair. Only the lookup is guarded, so
                # assertion failures are never masked.
                try:
                    peptide_vars = [
                        str(var) for var in
                        pep.get_variants_by_protein(prot.transcript_id)]
                except ValueError:
                    continue

                unexpected = [var for var in peptide_vars
                              if var not in protein_vars]
                self.assertEqual(
                    unexpected, [],
                    "peptide %r reports variants %s that protein %r "
                    "does not have (protein variants: %s)"
                    % (pep, unexpected, prot, protein_vars))
Exemple #4
0
    def test3_protein_from_variants(self):
        """
        Check the transcripts and proteins generated from three variants.

        Steps:
        1. Generate transcripts from ``var_10``, ``var_11`` and ``var_12``
           (using ``EIdentifierTypes.REFSEQ``) and check that the gene id,
           transcript id, variants and sequence of each transcript are
           filled in.
        2. Translate every transcript into a protein (eight are expected)
           and check the same fields, plus the link back to the originating
           transcript.
        3. Call the peptide generator on the proteins. The returned peptides
           are not inspected by this test.
        """
        def check_identifiers(entity):
            # Shared by transcripts and proteins: the gene id is fixed and
            # the transcript id must look like "<tsc_1|tsc_2>:FRED2_<digit>".
            self.assertEqual(entity.gene_id, "gene_1")
            parts = entity.transcript_id.split(":FRED2_")
            self.assertEqual(len(parts), 2)
            self.assertIn(parts[0], ("tsc_1", "tsc_2"))
            self.assertEqual(len(parts[1]), 1)
            # isdigit must be *called*; the bare bound method is always truthy.
            self.assertTrue(parts[1].isdigit())

        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]

        transcripts = list(generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))

        proteins = []
        for trans in transcripts:
            check_identifiers(trans)

            # check var:
            self.assertIsNotNone(trans.vars)
            self.assertGreater(len(trans.vars), 0)

            # check sequence: compare the length, not the string itself
            # (str > int is always True in Python 2).
            self.assertGreater(len(str(trans)), 5)

            # Translate; transcripts rejected with ValueError are skipped
            # (the original comment attributes this to invalid sequence
            # lengths).
            try:
                protein = next(generate_proteins_from_transcripts(trans))
            except ValueError:
                continue
            proteins.append(protein)

        self.assertEqual(len(proteins), 8)

        for prot in proteins:
            check_identifiers(prot)

            # The protein must point back to its transcript and carry the
            # same number of distinct variants.
            orig = prot.orig_transcript
            self.assertEqual(prot.transcript_id, orig.transcript_id)
            protein_vars = set(
                var for var_list in prot.vars.itervalues() for var in var_list)
            self.assertEqual(len(protein_vars), len(orig.vars))

            # check sequence:
            self.assertGreater(len(str(prot)), 2)

        # Only checks that the call itself does not raise; the result is
        # not inspected here.
        # NOTE(review): if this returns a lazy generator, nothing is computed
        # until it is consumed -- confirm whether the peptides should be
        # materialised and checked.
        generate_peptides_from_proteins(proteins, 2)
Exemple #5
0
    def test4_peptides_from_variants(self):
        """
        Check the 4-mer peptides generated from two heterozygous variants.

        Ref transcript: AAAAACCCCCGGGGG
        ref protein:    KNPRG
        ref peps(4):    KNPR, NPRG

        variant1: heterozygous, fs+1 in first aa
        variant2: heterozygous, insertion +2 in last aa

        transcript with both variants: TKPPGA
            peps(4): TKPP, KPPG, PPGA

        transcript without variants: KNPRG
            peps(4): KNPR, NPRG

        Expected peptide/variant mapping (from the original notes):
        -----------------------------------------------------------
        PEPTIDE: PPGA
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(15CC)
                 Variant(1C)
        PEPTIDE: KPPG
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)
        PEPTIDE: TKPP
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)

        PEPTIDE: KNPR
            TRANSCRIPT: tsc_1:FRED2_0
        PEPTIDE: NPRG
            TRANSCRIPT: tsc_1:FRED2_0

        The test asserts the exact set of peptide sequences, and that every
        variant a peptide reports for a protein is also a variant of that
        protein. It does not assert which specific variants each peptide
        carries.
        """
        peps_reference = ["KNPR", "NPRG"]
        peps_variant = ["PPGA", "KPPG", "TKPP"]
        expected = peps_reference + peps_variant

        dummy_db = DummyAdapter()
        dummy_vars = [var_13, var_14]

        transcripts = list(generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))

        proteins = []
        for trans in transcripts:
            # Transcripts rejected with ValueError are skipped (the original
            # comment attributes this to invalid sequence lengths).
            try:
                protein = next(generate_proteins_from_transcripts(trans))
            except ValueError:
                continue
            proteins.append(protein)

        # Materialise once: the peptides are iterated several times below.
        peptides = list(generate_peptides_from_proteins(proteins, 4))
        sequences = [str(pep) for pep in peptides]

        # Exactly the expected peptides: none missing, none extra, no
        # duplicates.
        self.assertEqual(sorted(sequences), sorted(expected))

        for prot in proteins:
            protein_vars = [str(var)
                            for var_list in prot.vars.itervalues()
                            for var in var_list]
            for pep in peptides:
                # Lookups that fail with KeyError are skipped for this
                # peptide/protein pair. Only the lookup is guarded, so
                # assertion failures are never masked.
                try:
                    peptide_vars = [
                        str(var) for var in
                        pep.get_variants_by_protein(prot.transcript_id)]
                except KeyError:
                    continue

                unexpected = [var for var in peptide_vars
                              if var not in protein_vars]
                self.assertEqual(
                    unexpected, [],
                    "peptide %r reports variants %s that protein %r "
                    "does not have (protein variants: %s)"
                    % (pep, unexpected, prot, protein_vars))