コード例 #1
0
def test_random_sequences(aligner, strand1="+", strand2="+"):
    """Compare ``Alignment.map`` with ``map_check`` on unrelated random DNA.

    Three independent random strings over "ACGT" are drawn (1000, 300 and
    100 letters), wrapped in ``Seq`` objects and aligned with ``aligner``:
    chromosome vs. transcript on ``strand1`` and transcript vs. sequence on
    ``strand2``.  Only the first alignment returned for each pair is used.

    The PSL line of the mapped alignment must equal the output of
    ``map_check`` from whitespace-separated field 8 onwards; the first
    eight fields are not compared.  On success a summary line is printed
    that reports field 17 of each input alignment's PSL line as its number
    of blocks.
    """
    letters = "ACGT"
    layout = (("chromosome", 1000), ("transcript", 300), ("sequence", 100))
    # Draw every random string before doing anything else, in this order,
    # so the stream of random.randint calls is unchanged.
    drawn = [
        (label, "".join(letters[random.randint(0, 3)] for _ in range(size)))
        for label, size in layout
    ]
    records = []
    for label, text in drawn:
        record = Seq(text)
        record.id = label
        records.append(record)
    chromosome, transcript, sequence = records

    alignment1 = aligner.align(chromosome, transcript, strand=strand1)[0]
    alignment2 = aligner.align(transcript, sequence, strand=strand2)[0]

    # Reference result first, then the method under test.
    reference = map_check(alignment1, alignment2)
    mapped = alignment1.map(alignment2)
    expected_fields = reference.split()
    actual_fields = format(mapped, "psl").split()
    assert actual_fields[8:] == expected_fields[8:]

    block_counts = tuple(
        int(format(item, "psl").split()[17])
        for item in (alignment1, alignment2)
    )
    print("Randomized sequence test %d, %d, %s, %s OK" %
          (block_counts + (strand1, strand2)))
コード例 #2
0
    def test_internal(self):
        """Map a sequence lying inside the transcript onto the chromosome.

        The transcript is aligned to the chromosome and the sequence to the
        transcript; each search must yield exactly one alignment.  The
        coordinates and printed form of both inputs are checked, followed
        by the coordinates, printed form and PSL line of
        ``alignment1.map(alignment2)``.
        """
        aligner = self.aligner
        chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
        transcript = Seq("GGGGGGGCCCCCGGGGGGA")
        sequence = Seq("GGCCCCCGGG")
        chromosome.id = "chromosome"
        transcript.id = "transcript"
        sequence.id = "sequence"

        # Transcript placed on the chromosome.
        hits = aligner.align(chromosome, transcript)
        self.assertEqual(len(hits), 1)
        alignment1 = hits[0]
        wanted = numpy.array([[12, 31], [0, 19]])
        self.assertTrue(numpy.array_equal(alignment1.coordinates, wanted))
        self.assertEqual(
            str(alignment1),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA\n"
            "            |||||||||||||||||||\n"
            "            GGGGGGGCCCCCGGGGGGA\n",
        )

        # Sequence placed on the transcript.
        hits = aligner.align(transcript, sequence)
        self.assertEqual(len(hits), 1)
        alignment2 = hits[0]
        wanted = numpy.array([[5, 15], [0, 10]])
        self.assertTrue(numpy.array_equal(alignment2.coordinates, wanted))
        self.assertEqual(
            str(alignment2),
            "GGGGGGGCCCCCGGGGGGA\n"
            "     ||||||||||\n"
            "     GGCCCCCGGG\n",
        )

        # Sequence carried through the transcript onto the chromosome.
        alignment = alignment1.map(alignment2)
        wanted = numpy.array([[17, 27], [0, 10]])
        self.assertTrue(numpy.array_equal(alignment.coordinates, wanted))
        self.assertEqual(
            str(alignment),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA\n"
            "                 ||||||||||\n"
            "                 GGCCCCCGGG\n",
        )
        psl_fields = [
            "10", "0", "0", "0", "0", "0", "0", "0", "+",
            "sequence", "10", "0", "10",
            "chromosome", "40", "17", "27",
            "1", "10,", "0,", "17,",
        ]
        self.assertEqual(
            format(alignment, "psl"), "\t".join(psl_fields) + "\n"
        )
コード例 #3
0
    def test1(self):
        """Check ``Alignment.map`` on two heavily gapped alignments.

        ``transcript`` is aligned to ``chromosome`` and ``sequence`` to
        ``transcript`` using ``self.aligner``; the first alignment of each
        result is used.  For both inputs and for
        ``alignment1.map(alignment2)`` the test checks the length of
        ``path`` and the printed alignment, and finally the PSL line of the
        mapped alignment.
        """
        aligner = self.aligner
        # Fixed input sequences; every expected value below was recorded
        # for exactly these strings.
        chromosome = Seq(
            "GCCTACCGTATAACAATGGTTATAATACAAGGCGGTCATAATTAAAGGGAGTGCAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACTACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGGACAGCGGGAAGATGTCGTGGTTTCCTTGTCATTAACCAACTCCATCTTAAAAGCTCCTCTAGCCATGGCATGGTACGTTGCGCGCACCCTTTTATCGGTAAGGCGCGGTGACTCTCTCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCTAAGGCACTTACGGCCAATTAATGCGCAAGCGAGCGGAAGGTCTAACAGGGCACCGAATTCGATTA"
        )
        chromosome.id = "chromosome"
        transcript = Seq(
            "GGAATTTTAGCAGCCAAAGGACGGATCCTCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGAACTAAGCGGGAGCGCATGTGGGACAGTTGATCCCATCCGCCTCAAAATTTCTCGCAATATCGGTTGGGGCACAGGTCCACTTTACGAATTCATACCGTGGTAGAGACCTTTATTAGATAGATATGACTGTTTGATTGCGGCATAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGACCGGGTTGGGTCGAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTC"
        )
        transcript.id = "transcript"
        sequence = Seq(
            "TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGGGGACAGTTGATCCCATCCGCCTTTTACGAATTCATACCGTGGTAGGCGGCATAGTACGACGAAGCGGTTGGGTCGAAAAACAGGTTGCCGTCATATCGGTGGGTTC"
        )
        sequence.id = "sequence"
        # First input: transcript aligned to the chromosome.
        alignments1 = aligner.align(chromosome, transcript)
        alignment1 = alignments1[0]
        self.assertEqual(len(alignment1.path), 164)
        # The expected text ends in significant blanks on the pattern and
        # query lines, hence the trailing-whitespace (W291) suppression.
        self.assertEqual(
            str(alignment1),
            """\
GCCTACCGTATAACAATGGTTATA------ATACAAGG-CGG----TCATAATTAAAGGGAGTG---CAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGC-----CGTCATATCGGTGG----GTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACT-ACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGG-ACAGCGGGAAGATGTCGTGGTTTC-CTT---G---TCATTAACC-------A-ACTCCATCTTA--AAAGCTCCTCTAGCCATGGCATG---GT---ACGTTGCGCGCACCCTTTTA-T----CG--GTAAGG-------CG---CGGT-------GACTCTC--------TCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCT-------AAG-GCACTT-ACGGCCAATTAATGCGCAAGCGAGCGGAAGGTC-TAACAG-GGCACCGAATTCGATTA
              |||--||-||------|---||||-|||----||------.|||||---|---|||||-----|.||-----------|||--||||-|------||.|.|----||||----||||-----||-|||----||||----||--||--|-|--||--|||.||-|||----||||-|---|||-||-.||||------------|-|---------||||-|-------||||-||||---------|||-------|-|||---|---||||--|||-------|-|--||-|-|||--|.|-------|||--||---|||---||---|--|||||-|||------||-|----||--|.||||-------||---||||-------|||---|--------||..||||||----------|||----||--||--|-------|||-|||-||-||.|----|||------||||---|-----|||-||.|.|-||----|--|||     
            GGAAT--TT-TAGCAGCCA---AAGGACGGATCCTC------CAAGGG---GCCCCAGCA-----CAGC-----------ACA--TTTT-T------AACGCG----AACT----AAGCGGGAGCG-CAT----GTGGGACAGT--TG--A-T--CC--CATCCG-CCT----CAAA-A---TTT-CTCGCAAT------------A-T---------CGGT-T-------GGGGCACAG---------GTC-------CACTTTACGAATTCAT--ACCGTGGTAGAGA--CC-T-TTATTAGA-------TAG--AT---ATGACTGTTTGA--TTGCG-GCA------TAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGAC---CGGGTTGGGTCGAAAAACA----------GGT----TT--TA--TGAAAAGAAAGTGCA-TTAACTG----TTA------AAGC---C-----GTCATATCGGTGG----G--TTC     
""",  # noqa: W291
        )
        # Second input: sequence aligned to the transcript.
        alignments2 = aligner.align(transcript, sequence)
        alignment2 = alignments2[0]
        self.assertEqual(len(alignment2.path), 12)
        self.assertEqual(
            str(alignment2),
            """\
GGAATTTTAGCAGCCAAAGGACGGATCCTCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGAACTAAGCGGGAGCGCATGTGGGACAGTTGATCCCATCCGCCTCAAAATTTCTCGCAATATCGGTTGGGGCACAGGTCCACTTTACGAATTCATACCGTGGTAGAGACCTTTATTAGATAGATATGACTGTTTGATTGCGGCATAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGACCGGGTTGGGTCGAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTC
                            |||||||||||||||||||||||||||||||||||--------------------|||||||||||||||||||||||--------------------------------------|||||||||||||||||||||||---------------------------------||||||||||||||||||||--------------------------------|||||||||||||||||||||------------------------------||||||||||||||||||||
                            TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCG--------------------GGGACAGTTGATCCCATCCGCCT--------------------------------------TTTACGAATTCATACCGTGGTAG---------------------------------GCGGCATAGTACGACGAAGC--------------------------------GGTTGGGTCGAAAAACAGGTT------------------------------GCCGTCATATCGGTGGGTTC
""",
        )
        # Method under test: carry alignment2 through alignment1 so that
        # the sequence is expressed against the chromosome.
        alignment = alignment1.map(alignment2)
        self.assertEqual(len(alignment.path), 76)
        self.assertEqual(
            str(alignment),
            """\
GCCTACCGTATAACAATGGTTATAATACAAGGCGGTCATAATTAAAGGGAGTG---CAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGG----GTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACTACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGGACAGCGGGAAGATGTCGTGGTTTCCTT---G---TCATTAACCAACTCCATCTTAAAAGCTCCTCTAGCCATGGCATGGTACGTT-------GCGCGCACCCTTTTA-T----CG--GTAAGGCGCGGTGACTCTC-------TCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCTAAGGCACTTACGGCCAATTAATGCGCAAGCGAGCGGAAGGTC-TAACAG-GGCACCGAATTCGATTA
                                   ||------.|||||---|---|||||-----|.||-----------|||--||||-|------||.|.|----------------------------||----||--||--|-|--||--|||.||-|||------------------------------------------------------------------------------------------||---|---||||--|||-------------------------------------------------|||-|||------||-|----||--|.------------------------||..||||||----------|||----|------------------------------------||---|-----|||-||.|.|-||----|--|||     
                                   TC------CAAGGG---GCCCCAGCA-----CAGC-----------ACA--TTTT-T------AACGCG----------------------------GGGACAGT--TG--A-T--CC--CATCCG-CCT------------------------------------------------------------------------------------------TTTACGAATTCAT--ACC------------------------------------------GTGGTAGGCG-GCA------TAGTACGACGAAGC-----------------GGTTGGGTCGAAAAACA----------GGT----T------------------------------------GC---C-----GTCATATCGGTGG----G--TTC     
""",  # noqa: W291
        )
        # The PSL rendering is a single tab-separated line ending in a
        # newline.
        psl = format(alignment, "psl")
        self.assertEqual(
            psl,
            """\
96	10	0	0	11	36	27	294	+	sequence	142	0	142	chromosome	440	35	435	37	2,6,1,5,4,3,4,1,6,2,2,2,1,1,2,6,3,2,1,4,3,3,3,2,1,2,2,10,3,1,2,1,3,6,2,1,3,	0,2,8,12,17,21,24,28,29,35,41,43,45,46,47,49,55,58,63,67,71,81,84,87,90,95,99,108,118,121,122,124,125,129,136,138,139,	35,43,52,53,63,78,83,88,95,129,131,135,139,141,144,148,155,248,250,251,257,302,306,315,317,318,320,339,359,366,403,408,414,417,423,429,432,
""",
        )
コード例 #4
0
    def test2(self):
        """Check ``Alignment.map`` when both input alignments are gapped.

        ``transcript`` is aligned to ``chromosome`` and ``sequence`` to
        ``transcript`` using ``self.aligner``; the first alignment of each
        result is used.  For both inputs and for
        ``alignment1.map(alignment2)`` the test checks the length of
        ``path`` and the printed alignment, and finally the PSL line of the
        mapped alignment.
        """
        aligner = self.aligner
        # Fixed input sequences; every expected value below was recorded
        # for exactly these strings.
        chromosome = Seq(
            "CTAATGCGCCTTGGTTTTGGCTTAACTAGAAGCAACCTGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGACTCTTAGCGGATATGCGGCTGAGAAGTACGACATGTGTACATTCATACCTGCGTGACGGTCAGCCTCCCCCGGGACCTCATTGGGCGAATCTAGGTGTGATAATTGACACACTCTTGGTAAGAAGCACTCTTTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTGCGATCTAAAGCTGCCGATCGTAGATCCAAGTCCTCAGCAAGCTCGCACGAATACGCAGTTCGAAGGCTGGGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA"
        )
        chromosome.id = "chromosome"
        transcript = Seq(
            "CACCGGCGTCGGTACCAGAGGGCGTGAGTACCTTGTACTAGTACTCATTGGAATAATGCTCTTAGAAGTCATCTAAAAGTGACAACGCCTGTTTGGTTATGACGTTCACGACGCGTCTTAACAGACTAGCATTAGACCGACGGGTTGAGGCGTCTGGGTTGATACAGCCGTTTGCATCAGTGTATCTAACACTCTGAGGGATAATTGATGAACCGTGTTTTCCGATAGGTATGTACAGTACCACCACGCACGACTAAGGACCATTTTCTGCGTGCGACGGTTAAAATAACCTCAATCACT"
        )
        transcript.id = "transcript"
        sequence = Seq(
            "TCCCCTTCTAATGGAATCCCCCTCCGAAGGTCGCAGAAGCGGCCACGCCGGAGATACCAGTTCCACGCCTCAGGTTGGACTTGTCACACTTGTACGCGAT"
        )
        sequence.id = "sequence"
        # First input: transcript aligned to the chromosome.
        alignments1 = aligner.align(chromosome, transcript)
        alignment1 = alignments1[0]
        self.assertEqual(len(alignment1.path), 126)
        # The expected text ends in significant blanks on the pattern and
        # query lines, hence the trailing-whitespace (W291) suppression.
        self.assertEqual(
            str(alignment1),
            """\
CTAATGCGCCTTGGTTTTGGCTTAACTAGA-------AGCAACC-TGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGA------CTCTTAG----CGGATATGCGGCTGAGAAGTACGACA-----TGT---GT----ACATTCATAC--CTGCGT-------GACGGTCAGCCT----CCCCCGGGACCTCATTG-GGCGAATCTAGGTGTGATA-A-----TTGACA-CA----CTCTTGGTAAGAAGCACTCT---------TTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTG-----CGATCTAAAGCTGCCGATCGTAGATCCAAGTCCTCAGCAAGCTCGCACGAATACGCAG-------TTCGAAGGCTGGGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA
     |||--|.|||---------||.|||-------||-.|||-||||------------||--------||||---|-|||-----|||||------|||||||----|----||----||-|.||||--||||-----|||---||----||.|||--||--|-||||-------|||--|-|||.|----||..||||------|||-||||--|||.|||-|||||-|-----|||-||-||----.|||-------||-||||||---------||----|||------|.||||-----------------||-----||||----||.|----||-|||----|-|||.|-||.||----|||||||.||---||-------||------|||.|||-----|||||||--------||-|--|----------------||----------|---|||--|||--------|||-----|||                         
CACCGGCG--TCGGT---------ACCAGAGGGCGTGAG-TACCTTGTA------------CT--------AGTA---C-TCA-----TTGGAATAATGCTCTTAGAAGTC----AT----CT-AAAAGT--GACAACGCCTGTTTGGTTATGACGTTC--ACGAC-GCGTCTTAACAGAC--T-AGCATTAGACCGACGGG------TTGAGGCG--TCTGGGT-TGATACAGCCGTTTG-CATCAGTGTATCT-------AA-CACTCTGAGGGATAATT----GAT------GAACCG-----------------TGTTTTCCGAT----AGGT----AT-GTA----C-AGTAC-CACCA----CGCACGACTA---AGGACCATTTT------CTGCGTG-----CGACGGT--------TA-A--A----------------AT----------A---ACC--TCA--------ATC-----ACT                         
""",  # noqa: W291
        )
        # Second input: sequence aligned to the transcript.
        alignments2 = aligner.align(transcript, sequence)
        alignment2 = alignments2[0]
        self.assertEqual(len(alignment2.path), 66)
        self.assertEqual(
            str(alignment2),
            """\
CACCGGCGTCGGTACCAGAGGGCGTGAGTACCTTGTACTAGTACTCATTGGAATAATGCTCTTAGAAGTCATCTAAAAGTGACAACGCCTGTTTGGTTATGACGTTCACGACGCGTCTTAACAGACTAGCATTAGACCGACG--GGTTGAGGCGTCTGGGTTGATACAGCCGTTTGCATCAGTGTATCTAACA---CTCTGAGGGATAATTGATGAACCGTGTTTTCCGATAGGTATGTACAGTACCACCACGCACGACTAAGGACCATTTTCTG--CGTGCGACGGTTAAAATAACCTCAATCACT
        ||------------|-------||||---|||------|-||||||-------------------------------|.||-------------|--||-|||.|-|||---.||||--|||----|.||-|||--|---||------------|||||------------||||---||---||---|||--|||-----|||--||--|-----||----------------||---------|||-|||-------------||--|--||||                       
        TC------------C-------CCTT---CTA------A-TGGAAT-------------------------------CCCC-------------C--TC-CGAAG-GTC---GCAGA--AGC----GGCC-ACGCCG---GA------------GATAC------------CAGT---TC---CACGCCTC--AGG-----TTG--GA--C-----TT----------------GT---------CAC-ACT-------------TGTAC--GCGAT                      
""",  # noqa: W291
        )
        # Method under test: carry alignment2 through alignment1 so that
        # the sequence is expressed against the chromosome.
        alignment = alignment1.map(alignment2)
        self.assertEqual(len(alignment.path), 78)
        self.assertEqual(
            str(alignment),
            """\
CTAATGCGCCTTGGTTTTGGCTTAACTAGAAGCAA-CC-TGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGACTCTTAGCGGATATGCGGCTGAGAAGTACGACATGTGTA------CATTCATAC--CTGCGT----GACGGTCAGCCT--CCCCCG--GGACCTCATTGGGCGAATCTAGGTGT-GATAATTGACA-CAC--TCTTGGTAAGAAGCA---CTCT---TTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTGCGATCTAAAGCTGCCGATCGTAGATCCAA--GTCCTCAGCAAGCTCGCACGAATACGCAGTTCGAAGGCTG--GGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA
          |.------------------------||-|---------------||--------|----------|------||||---------------------------------------------|--||---|--.-|-||----||-----|||----||-.||--|---------|----------------||||--------||---||-----------||---|||----||----|--------|.--|---------------------------------------------------||--------------|||-|.|---------------||--.--|-----|||                                                                                                       
          TC-----------------------CCCTT---------------CT--------A----------A------TGGA---------------------------------------ATCCCCC--TC---CGAA-G-GTCGCAGA-----AGC--GGCC-ACGCCG---------G---------------AGATA-------CCA-GTTC-----------CACGCCTC-AGGTT----G--------GA--C-------------------------------------------------TTGT--------------CAC-ACT---------------TGTAC--G-----CGAT                                                                                                      
""",  # noqa: W291
        )
        # The PSL rendering is a single tab-separated line ending in a
        # newline.
        psl = format(alignment, "psl")
        self.assertEqual(
            psl,
            """\
61	6	0	0	14	32	28	260	+	sequence	100	0	99	chromosome	440	10	337	35	2,2,1,2,1,1,4,1,2,1,1,1,2,2,3,2,3,1,1,4,2,2,2,3,2,1,2,1,2,3,3,2,1,1,3,	0,3,6,7,9,10,11,21,22,24,27,28,29,35,37,42,44,49,50,52,57,61,63,68,74,76,77,79,82,84,87,90,94,95,96,	10,35,37,53,63,74,81,124,127,132,133,135,137,139,146,151,154,157,167,183,194,197,210,212,216,222,231,235,285,301,305,323,325,328,334,
""",
        )
コード例 #5
0
def load_csv_file(file, delimiter=";"):
    """
    Load a "Primer" CSV file.

    The first row must be a header naming each of the columns ``id``,
    ``forwardPrimer``, ``reversePrimer``, ``fPDNA``, ``rPDNA``,
    ``ampliconMinLength`` and ``ampliconMaxLength`` exactly once, in any
    order.  Every following row with the same number of fields is turned
    into a PrimerPair; the reverse primer is reverse-complemented before it
    is stored.  Rows with a different number of fields, and pairs rejected
    by check_primer_pair_integrity, are skipped with a logged warning.

    @param file: path of the CSV file to read
    @param delimiter: field separator used in the file
    @returns: dict mapping each primer pair id to its PrimerPair instance;
        a later row with an id seen before replaces the earlier entry
    @raises ValueError: if the file is empty, or the header has the wrong
        number of columns, an unknown column, or a repeated column; also
        raised by int() when an amplicon length is not an integer
    """
    expected_columns = (
        "id",
        "forwardPrimer",
        "reversePrimer",
        "fPDNA",
        "rPDNA",
        "ampliconMinLength",
        "ampliconMaxLength",
    )
    header_len = len(expected_columns)
    primer_dict = {}
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter)
        # An empty file has no header row at all; report it like any other
        # malformed header instead of leaking StopIteration.
        headers = next(csvreader, None)
        if headers is None or len(headers) != header_len:
            raise ValueError("Wrong header")
        pos = {}
        for index, name in enumerate(headers):
            if name not in expected_columns:
                raise ValueError("Unknown header " + name)
            # With the column count already verified, a repeated name
            # means another required column is missing.
            if name in pos:
                raise ValueError("Duplicate header " + name)
            pos[name] = index

        # The header is line 1, so the first data row is reported as 2.
        for line_number, row in enumerate(csvreader, start=2):
            if len(row) != header_len:
                logging.warning("Wrong primer pair in line " +
                                str(line_number))
                continue

            fprimer = Seq(row[pos["fPDNA"]], IUPAC.IUPACAmbiguousDNA())
            fprimer = SeqRecord(fprimer)
            fprimer.id = row[pos["forwardPrimer"]]

            rprimer = Seq(row[pos["rPDNA"]], IUPAC.IUPACAmbiguousDNA())
            rprimer = SeqRecord(rprimer)
            # TODO: the reverse primer is always reverse-complemented; the
            # original code left this step marked as provisional.
            rprimer = rprimer.reverse_complement()
            rprimer.id = row[pos["reversePrimer"]]

            primer_pair = PrimerPair((row[pos["id"]]), fprimer, rprimer,
                                     int(row[pos["ampliconMinLength"]]),
                                     int(row[pos["ampliconMaxLength"]]))
            if check_primer_pair_integrity(primer_pair):
                primer_dict[row[pos["id"]]] = primer_pair
            else:
                logging.warning("Skipping primer pair " + primer_pair.id +
                                ", bad sequence")

    return primer_dict
コード例 #6
0
    def test_internal(self):
        """Map a sequence lying inside the transcript onto the chromosome.

        The transcript is aligned to the chromosome and the sequence to the
        transcript; each search must yield exactly one alignment.  The path
        and printed form of both inputs are checked, followed by the path,
        printed form and PSL line of ``alignment1.map(alignment2)``.
        """
        aligner = self.aligner
        chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
        transcript = Seq("GGGGGGGCCCCCGGGGGGA")
        sequence = Seq("GGCCCCCGGG")
        chromosome.id = "chromosome"
        transcript.id = "transcript"
        sequence.id = "sequence"

        # Transcript placed on the chromosome.
        hits = aligner.align(chromosome, transcript)
        self.assertEqual(len(hits), 1)
        alignment1 = hits[0]
        self.assertEqual(alignment1.path, ((12, 0), (31, 19)))
        self.assertEqual(
            str(alignment1),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA\n"
            "            |||||||||||||||||||\n"
            "            GGGGGGGCCCCCGGGGGGA\n",
        )

        # Sequence placed on the transcript.
        hits = aligner.align(transcript, sequence)
        self.assertEqual(len(hits), 1)
        alignment2 = hits[0]
        self.assertEqual(alignment2.path, ((5, 0), (15, 10)))
        self.assertEqual(
            str(alignment2),
            "GGGGGGGCCCCCGGGGGGA\n"
            "     ||||||||||\n"
            "     GGCCCCCGGG\n",
        )

        # Sequence carried through the transcript onto the chromosome.
        alignment = alignment1.map(alignment2)
        self.assertEqual(len(alignment.path), 2)
        for index, step in enumerate(([17, 0], [27, 10])):
            self.assertSequenceEqual(alignment.path[index], step)
        self.assertEqual(
            str(alignment),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA\n"
            "                 ||||||||||\n"
            "                 GGCCCCCGGG\n",
        )
        psl_fields = [
            "10", "0", "0", "0", "0", "0", "0", "0", "+",
            "sequence", "10", "0", "10",
            "chromosome", "40", "17", "27",
            "1", "10,", "0,", "17,",
        ]
        self.assertEqual(
            format(alignment, "psl"), "\t".join(psl_fields) + "\n"
        )
コード例 #7
0
    def test_reverse_transcript_sequence(self):
        """Map through a transcript that is aligned on the reverse strand.

        The transcript is aligned to the chromosome with strand "-" and the
        sequence to the transcript with the default strand; each search
        must yield exactly one alignment.  The path and printed form of
        both inputs are checked, followed by the path, printed form and PSL
        line of ``alignment1.map(alignment2)``; the PSL line is expected to
        carry "-" in its strand column.
        """
        aligner = self.aligner
        chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
        transcript = Seq("TCCCCCCGGGGGCCCCCCC")
        sequence = Seq("CCCGGGGGCC")
        chromosome.id = "chromosome"
        transcript.id = "transcript"
        sequence.id = "sequence"

        # Transcript placed on the chromosome, reverse strand.  The lines
        # of the expected text are padded with blanks to equal width.
        hits = aligner.align(chromosome, transcript, "-")
        self.assertEqual(len(hits), 1)
        alignment1 = hits[0]
        self.assertEqual(alignment1.path, ((12, 19), (31, 0)))
        self.assertEqual(
            str(alignment1),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA\n"
            "            |||||||||||||||||||         \n"
            "            GGGGGGGCCCCCGGGGGGA         \n",
        )

        # Sequence placed on the transcript.
        hits = aligner.align(transcript, sequence)
        self.assertEqual(len(hits), 1)
        alignment2 = hits[0]
        self.assertEqual(alignment2.path, ((4, 0), (14, 10)))
        self.assertEqual(
            str(alignment2),
            "TCCCCCCGGGGGCCCCCCC\n"
            "    ||||||||||     \n"
            "    CCCGGGGGCC     \n",
        )

        # Sequence carried through the transcript onto the chromosome.
        alignment = alignment1.map(alignment2)
        self.assertEqual(len(alignment.path), 2)
        for index, step in enumerate(([17, 10], [27, 0])):
            self.assertSequenceEqual(alignment.path[index], step)
        self.assertEqual(
            str(alignment),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA\n"
            "                 ||||||||||             \n"
            "                 GGCCCCCGGG             \n",
        )
        psl_fields = [
            "10", "0", "0", "0", "0", "0", "0", "0", "-",
            "sequence", "10", "0", "10",
            "chromosome", "40", "17", "27",
            "1", "10,", "0,", "17,",
        ]
        self.assertEqual(
            format(alignment, "psl"), "\t".join(psl_fields) + "\n"
        )
コード例 #8
0
    def test_left_overhang(self):
        """Map a sequence whose start falls before the chromosome's start.

        In the expected output the transcript extends three letters to the
        left of the chromosome, so the first two letters of the sequence
        have no chromosome counterpart: the mapped alignment is expected to
        cover chromosome 0-11 and sequence 2-13.  The printed form of both
        inputs, and the coordinates, printed form and PSL line of
        ``alignment1.map(alignment2)``, are checked.
        """
        aligner = self.aligner
        chromosome = Seq("GGGCCCCCGGGGGGAAAAAAAAAA")
        transcript = Seq("AGGGGGCCCCCGGGGGGA")
        sequence = Seq("GGGGGCCCCCGGG")
        chromosome.id = "chromosome"
        transcript.id = "transcript"
        sequence.id = "sequence"

        # Transcript placed on the chromosome.
        hits = aligner.align(chromosome, transcript)
        self.assertEqual(len(hits), 1)
        alignment1 = hits[0]
        self.assertEqual(
            str(alignment1),
            "   GGGCCCCCGGGGGGAAAAAAAAAA\n"
            "   |||||||||||||||\n"
            "AGGGGGCCCCCGGGGGGA\n",
        )

        # Sequence placed on the transcript.
        hits = aligner.align(transcript, sequence)
        self.assertEqual(len(hits), 1)
        alignment2 = hits[0]
        self.assertEqual(
            str(alignment2),
            "AGGGGGCCCCCGGGGGGA\n"
            " |||||||||||||\n"
            " GGGGGCCCCCGGG\n",
        )

        # Sequence carried through the transcript onto the chromosome.
        alignment = alignment1.map(alignment2)
        wanted = numpy.array([[0, 11], [2, 13]])
        self.assertTrue(numpy.array_equal(alignment.coordinates, wanted))
        self.assertEqual(
            str(alignment),
            "  GGGCCCCCGGGGGGAAAAAAAAAA\n"
            "  |||||||||||\n"
            "GGGGGCCCCCGGG\n",
        )
        psl_fields = [
            "11", "0", "0", "0", "0", "0", "0", "0", "+",
            "sequence", "13", "2", "13",
            "chromosome", "24", "0", "11",
            "1", "11,", "2,", "0,",
        ]
        self.assertEqual(
            format(alignment, "psl"), "\t".join(psl_fields) + "\n"
        )
コード例 #9
0
    def test_left_overhang(self):
        """Map a sequence whose start falls before the chromosome's start.

        In the expected output the transcript extends three letters to the
        left of the chromosome, so the first two letters of the sequence
        have no chromosome counterpart: the mapped path is expected to run
        from (0, 2) to (11, 13).  The printed form of both inputs, and the
        path, printed form and PSL line of ``alignment1.map(alignment2)``,
        are checked.
        """
        aligner = self.aligner
        chromosome = Seq("GGGCCCCCGGGGGGAAAAAAAAAA")
        transcript = Seq("AGGGGGCCCCCGGGGGGA")
        sequence = Seq("GGGGGCCCCCGGG")
        chromosome.id = "chromosome"
        transcript.id = "transcript"
        sequence.id = "sequence"

        # Transcript placed on the chromosome.  The lines of the expected
        # text are padded with blanks to equal width.
        hits = aligner.align(chromosome, transcript)
        self.assertEqual(len(hits), 1)
        alignment1 = hits[0]
        self.assertEqual(
            str(alignment1),
            "   GGGCCCCCGGGGGGAAAAAAAAAA\n"
            "   |||||||||||||||         \n"
            "AGGGGGCCCCCGGGGGGA         \n",
        )

        # Sequence placed on the transcript.
        hits = aligner.align(transcript, sequence)
        self.assertEqual(len(hits), 1)
        alignment2 = hits[0]
        self.assertEqual(
            str(alignment2),
            "AGGGGGCCCCCGGGGGGA\n"
            " |||||||||||||    \n"
            " GGGGGCCCCCGGG    \n",
        )

        # Sequence carried through the transcript onto the chromosome.
        alignment = alignment1.map(alignment2)
        self.assertEqual(len(alignment.path), 2)
        for index, step in enumerate(([0, 2], [11, 13])):
            self.assertSequenceEqual(alignment.path[index], step)
        self.assertEqual(
            str(alignment),
            "  GGGCCCCCGGGGGGAAAAAAAAAA\n"
            "  |||||||||||             \n"
            "GGGGGCCCCCGGG             \n",
        )
        psl_fields = [
            "11", "0", "0", "0", "0", "0", "0", "0", "+",
            "sequence", "13", "2", "13",
            "chromosome", "24", "0", "11",
            "1", "11,", "2,", "0,",
        ]
        self.assertEqual(
            format(alignment, "psl"), "\t".join(psl_fields) + "\n"
        )
コード例 #10
0
    def test_right_overhang(self):
        """Map a sequence whose end falls past the chromosome's end.

        In the expected output the transcript extends four letters beyond
        the right end of the chromosome, so the last two letters of the
        sequence have no chromosome counterpart: the mapped path is
        expected to run from (17, 0) to (27, 10).  The printed form of both
        inputs, and the path, printed form and PSL line of
        ``alignment1.map(alignment2)``, are checked.
        """
        aligner = self.aligner
        chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGG")
        transcript = Seq("GGGGGGGCCCCCGGGGGGA")
        sequence = Seq("GGCCCCCGGGGG")
        chromosome.id = "chromosome"
        transcript.id = "transcript"
        sequence.id = "sequence"

        # Transcript placed on the chromosome.  The lines of the expected
        # text are padded with blanks to equal width.
        hits = aligner.align(chromosome, transcript)
        self.assertEqual(len(hits), 1)
        alignment1 = hits[0]
        self.assertEqual(
            str(alignment1),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGG    \n"
            "            |||||||||||||||    \n"
            "            GGGGGGGCCCCCGGGGGGA\n",
        )

        # Sequence placed on the transcript.
        hits = aligner.align(transcript, sequence)
        self.assertEqual(len(hits), 1)
        alignment2 = hits[0]
        self.assertEqual(
            str(alignment2),
            "GGGGGGGCCCCCGGGGGGA\n"
            "     ||||||||||||  \n"
            "     GGCCCCCGGGGG  \n",
        )

        # Sequence carried through the transcript onto the chromosome.
        alignment = alignment1.map(alignment2)
        self.assertEqual(len(alignment.path), 2)
        for index, step in enumerate(([17, 0], [27, 10])):
            self.assertSequenceEqual(alignment.path[index], step)
        self.assertEqual(
            str(alignment),
            "AAAAAAAAAAAAGGGGGGGCCCCCGGG  \n"
            "                 ||||||||||  \n"
            "                 GGCCCCCGGGGG\n",
        )
        psl_fields = [
            "10", "0", "0", "0", "0", "0", "0", "0", "+",
            "sequence", "12", "0", "10",
            "chromosome", "27", "17", "27",
            "1", "10,", "0,", "17,",
        ]
        self.assertEqual(
            format(alignment, "psl"), "\t".join(psl_fields) + "\n"
        )
コード例 #11
0
def makeFasta(fa_path, path):
	"""Write per-chromosome FASTA files with SNP alleles substituted in.

	Reads ``path + "SNPAnnotation.txt"`` (tab-separated) and appends one
	``(position, alleles)`` entry per line to the module-level ``snp_dict``
	under the key ``fields[1]``.  ``alleles`` lists the members of
	``fields[3]``/``fields[4]`` that differ from ``fields[5]`` first and
	``fields[5]`` last.

	For every ``*.fa`` file in ``fa_path`` the file name without ".fa" is
	used as the ``snp_dict`` key.  Each SNP replaces the letter at its
	1-based position: ``alleles[0]`` goes into the reference copy and
	``alleles[1]`` into the alternative copy.  The results are written to
	``path + <name> + "_ref.fa"`` and ``path + <name> + "_alt.fa"``.  If a
	FASTA file holds several records only the last one is written, as in
	the original code.

	fa_path -- directory searched for ``*.fa`` input files
	path    -- prefix (normally a directory ending in a separator) of the
	           SNP annotation file and of the output files

	Raises KeyError when a FASTA file's name has no entry in ``snp_dict``.
	"""
	with open(path + "SNPAnnotation.txt") as snps:
		print("generating SNP dictionary")
		for snp in snps:
			fields = snp.strip().split("\t")
			alleles = list(set([fields[3], fields[4]]).difference([fields[5]]))
			alleles.append(fields[5])
			snp_dict.setdefault(fields[1], []).append((fields[2], alleles))

	print("generating new fasta files")
	for in_f in glob.glob(os.path.join(fa_path, '*.fa')):
		chr_num = in_f.split("/")[-1].replace(".fa", "")
		print("processing " + in_f)
		print("processing " + chr_num)
		# Reset per file so a FASTA file without records cannot silently
		# reuse the sequence of the previously processed chromosome.
		seq_ref = None
		seq_alt = None
		for seq_record in SeqIO.parse(in_f, "fasta"):
			seq_ref = seq_record.seq
			seq_alt = seq_record.seq
			for pos, al in snp_dict[chr_num]:
				pos = int(pos)
				# pos is 1-based: keep everything before it, insert the
				# allele, keep everything after it.
				seq_ref = Seq(str(seq_ref[:pos - 1]) + al[0] + str(seq_ref[pos:]))
				seq_alt = Seq(str(seq_alt[:pos - 1]) + al[1] + str(seq_alt[pos:]))
		if seq_ref is None:
			print("no sequence found in " + in_f)
			continue
		# Open each output only for the duration of its write so every
		# handle is closed, not just the pair belonging to the last file.
		with open(path + chr_num + "_ref.fa", "w") as ref_fa:
			SeqIO.write([SeqRecord(seq_ref, id=chr_num, description='ref')], ref_fa, "fasta")
		with open(path + chr_num + "_alt.fa", "w") as alt_fa:
			SeqIO.write([SeqRecord(seq_alt, id=chr_num, description='alt')], alt_fa, "fasta")
	print("finished generating fasta files")
コード例 #12
0
ファイル: HaploFunct.py プロジェクト: fennell-lab/HaploSync
def make_fasta_from_list(querylist, queryfasta, gaplen, seqoutname,
                         outfilename):
    """Join the components listed in querylist and print them as FASTA.

    Each querylist element is a 5-item list in one of two layouts:

      gap:    [length, (T_start:T_stop), "gap", length, 0]
      object: [ID|strand, (T_start:T_stop), Q_start:Q_stop,
               -(alignment length), matches]
              e.g. [b40-14.HS_iter4_seq3733|+, (61252:6463804),
                    93612:7595148, -6402552, 4526208]

    Gap elements are skipped.  For every other element the last character
    of the first item is the orientation and everything but the last two
    characters is the key looked up in queryfasta; entries with
    orientation "-" are reverse-complemented.  Consecutive components are
    separated by int(gaplen) "N" characters.  The result gets seqoutname
    as id and an empty description and is printed in FASTA format to
    outfilename (an open file object) using the Python 2 ``print >>``
    form.
    """
    gaplen = int(gaplen)
    spacer = "N" * gaplen

    seq = Seq("")
    seq.id = seqoutname

    for entry in querylist:
        # Unpacking enforces the 5-item layout; only two items are used.
        component, _t_range, query_range, _aln_length, _matches = entry

        if query_range == "gap":
            continue

        # Separate this component from whatever has been collected so far.
        if str(seq) != "":
            seq = seq + spacer

        strand = component[-1]
        piece = queryfasta[component[:-2]]
        if strand == "-":
            piece = piece.reverse_complement()

        seq = seq + piece

    # Print the entire sequence
    seq.id = seqoutname
    seq.description = ""
    print >> outfilename, seq.format('fasta')
コード例 #13
0
def test_random(aligner, nBlocks1=1, nBlocks2=1, strand1="+", strand2="+"):
    """Compare ``Alignment.map`` with ``map_check`` on nested random blocks.

    A random 1000-letter chromosome is generated.  The transcript is made
    of ``nBlocks1`` blocks copied from the chromosome (block length and
    the skip before each block both drawn from 60-80) and the sequence of
    ``nBlocks2`` blocks copied from the transcript (both drawn from
    20-40).  For strand "-" the chromosome (``strand1``) or the sequence
    (``strand2``) is reverse-complemented before aligning.

    The first alignment of chromosome vs. transcript and of transcript vs.
    sequence are combined with ``map``; the PSL line of the result must be
    identical to the output of ``map_check``.  A summary line is printed
    on success.
    """
    def copy_blocks(source, count, low, high):
        # Walk along source: skip a random stretch, copy a random-length
        # block, and repeat count times.
        parts = []
        cursor = 0
        for _ in range(count):
            cursor += random.randint(low, high)
            width = random.randint(low, high)
            parts.append(source[cursor:cursor + width])
            cursor += width
        return "".join(parts)

    chromosome = "".join("ACGT"[random.randint(0, 3)] for _ in range(1000))
    transcript = copy_blocks(chromosome, nBlocks1, 60, 80)
    sequence = copy_blocks(transcript, nBlocks2, 20, 40)

    chromosome = Seq(chromosome)
    transcript = Seq(transcript)
    sequence = Seq(sequence)
    if strand1 == "-":
        chromosome = chromosome.reverse_complement()
    if strand2 == "-":
        sequence = sequence.reverse_complement()
    # Ids are assigned last because reverse_complement returns new objects.
    for record, label in ((chromosome, "chromosome"),
                          (transcript, "transcript"),
                          (sequence, "sequence")):
        record.id = label

    alignment1 = aligner.align(chromosome, transcript, strand=strand1)[0]
    alignment2 = aligner.align(transcript, sequence, strand=strand2)[0]

    mapped = alignment1.map(alignment2)
    expected = map_check(alignment1, alignment2)
    assert format(mapped, "psl") == expected
    print("Randomized test %d, %d, %s, %s OK" %
          (nBlocks1, nBlocks2, strand1, strand2))
コード例 #14
0
                      record.id)
            continue
        elif record.id in trims:
            trimloc = trims[record.id]
            print(record.id, "before is ", len(record), "long")
            if len(trimloc) > 1:
                print("more than one trim location ... maybe a chimera?",
                      record.id)
            else:
                locs = trimloc[0].split("..")
                left = int(locs[0]) - 1
                right = int(locs[0])
                if left == 0:
                    record = record[right - 1:]
                elif right == len(record):
                    record = record[:left]
                else:
                    # internal slicing
                    temprecord = Seq(record[:left] + record[right - 1:],
                                     DNAAlphabet())
                    temprecord.id = record.id
                    print(record.id, len(record))
                    record = temprecord
                    print(len(record))
            print(record.id, "after is ", len(record), "long")
        elif record.id in duplicates:
            log.write("Skipping %s as is considered a duplicate\n" % record.id)
            continue

        SeqIO.write(record, output_handle, "fasta")
コード例 #15
0
ファイル: plot_bppm.py プロジェクト: Godzilla-Q/godzilla
#!/usr/bin/env python

"""calculate and plot the base pair probability matrix"""

# from __future__ import print_function

import RNA
import matplotlib
import Bio
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from ss_dotplot import versions_used

# Define the sample sequences. Each is a Seq object that is given an ad-hoc
# ``id`` attribute holding a display name.
godzilla = Seq('GAGACCCGTAAAAGGGTCTCGAAAGTGTGTAAAAAACACAC')
godzilla.id = 'Godzilla Queen of Monsters'
#foldgod = RNA.fold_compound(str(godzilla))
# NOTE(review): this sequence contains 'U' while godzilla uses 'T' --
# confirm that mixing the two alphabets is intended.
Hirsch = Seq('CCGCACAGCGGGCAGUGCCC')
Hirsch.id = 'Papa Hirsch protects us all'
#foldHirsch = RNA.fold_compound(str(Hirsch))

# All sample sequences, collected in one tuple.
monsters = (godzilla, Hirsch)
# Name of the matplotlib colormap read by plot_bppm;
# use either 'BuPu' or 'Greys'
colormap = 'BuPu'

def plot_bppm(bppm, name):
    """Plot a base pair probability matrix and save it as PostScript.

    ``bppm`` is drawn with ``plt.matshow`` using the module-level
    ``colormap``.  ``name`` is passed as the figure identifier and is
    also the stem of the output file ``<name>.ps``.  ``plt.close()`` is
    called once the file has been saved.  Returns None.
    """
    cmap = plt.get_cmap(colormap)
    plt.matshow(bppm, fignum=name, cmap=cmap)
    outfile = '{:s}.ps'.format(name)
    plt.savefig(outfile, format='ps')
    plt.close()
コード例 #16
0
    def test_sort(self):
        """Exercise Alignment.sort() with and without ids, reverse and key.

        The alignment is printed after every call and compared with the
        expected row order: default sort, ``reverse=True`` and ``key=GC``,
        first while the sequences carry no ids and then after ids have
        been assigned.
        """
        target = Seq("ACTT")
        query = Seq("ACCT")
        alignment = Align.Alignment(
            (target, query), numpy.array([[0, 4], [0, 4]])
        )
        # The two possible printed layouts of this two-row alignment.
        target_on_top = "ACTT\n||.|\nACCT\n"
        query_on_top = "ACCT\n||.|\nACTT\n"

        # Freshly built: rows appear in construction order.
        self.assertEqual(str(alignment), target_on_top)

        # Sorting before any id has been assigned.
        for sort_options, expected in (
            ({}, query_on_top),
            ({"reverse": True}, target_on_top),
        ):
            alignment.sort(**sort_options)
            self.assertEqual(str(alignment), expected)

        # Sorting once the sequences carry ids, then with an explicit key.
        target.id = "seq1"
        query.id = "seq2"
        for sort_options, expected in (
            ({}, target_on_top),
            ({"reverse": True}, query_on_top),
            ({"key": GC}, target_on_top),
            ({"key": GC, "reverse": True}, query_on_top),
        ):
            alignment.sort(**sort_options)
            self.assertEqual(str(alignment), expected)
コード例 #17
0
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output, genome, chromosomes):
    """Account for one unmapped scaffold and either write it out or discard it.

    ``outblocks`` is scanned once to collect database rows (when
    ``genome.revised_db`` is set), the concatenated sequence (when
    ``genome.revised_fasta`` is set) and the chromosome number of every
    block whose chromosome is not ``'0'``.  ``stats`` is updated in place.

    The scaffold is kept when the assembled sequence has features, or when
    at least one chromosome was seen and more than one database row was
    collected.  A kept scaffold is registered under a new name, written to
    the enabled outputs and, if a chromosome was seen, appended to that
    chromosome's ``agp`` lines.  Otherwise it is counted as discarded and
    the matching original parts are marked ``'removed'``.

    Returns None; all results are side effects on the arguments.
    """
    scaffold_seq = Seq('')
    db_rows = []
    chromosome_numbers = []
    for block in outblocks:
        if genome.revised_db:
            db_rows.append([block.chromosome, block.cm, scaffold, block.start, block.end, block.length])
        if genome.revised_fasta:
            # Block coordinates are used as a 1-based inclusive range.
            scaffold_seq += genome.sequences[scaffold][block.start - 1:block.end]
        if block.chromosome != '0':
            chromosome_numbers.append(int(block.chromosome))

    stats['scaffolds'] += 1

    first_block, last_block = outblocks[0], outblocks[-1]
    scaffold_start = first_block.start
    scaffold_end = last_block.end
    scaffold_length = scaffold_end - scaffold_start + 1
    stats['scaffold_length'] += scaffold_length
    chromosome_set = set(chromosome_numbers)

    # NOTE(review): ``features`` is read even when nothing was appended and
    # the value is still the initial Seq('') -- confirm that object exposes
    # a ``features`` attribute in that case.
    keep = scaffold_seq.features or (chromosome_set and len(db_rows) > 1)

    if not keep:
        stats['discard_scaffolds'] += 1
        stats['discard_length'] += scaffold_length
        for row in db_rows:
            row_scaffold, row_start, row_end, row_length = row[2], row[3], row[4], row[5]
            parts_length = 0
            for newpart in genome.newparts[scaffold]:
                for origpart in genome.origparts[newpart.oldname]:
                    # A part matches when it has the row's scaffold name and
                    # starts or ends inside the row's range.
                    matches_row = row_scaffold == origpart.newname and (
                        row_start <= origpart.newstart <= row_end
                        or row_start <= origpart.newend <= row_end)
                    if not matches_row:
                        continue
                    if origpart.parttype in ['active', 'retained']:
                        parts_length += origpart.newend - origpart.newstart + 1
                        origpart.parttype = 'removed'
            # Report rows whose length is not fully covered by removed parts.
            if row_length != parts_length:
                print(scaffold, row_length, parts_length, row)
        return

    unmapped_output.append([gd.Block(scaffold, scaffold_start, scaffold_end)])

    # Allocate the next revised name and remember it under the scaffold span.
    scaffold_name = genome.revised + "{:05d}".format(genome.revised_count)
    genome.revised_count += 1
    span_key = "{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)
    genome.revised_names[span_key] = scaffold_name

    stats['written_scaffolds'] += 1
    stats['written_length'] += scaffold_length
    if genome.revised_db:
        for row in db_rows:
            genome.revised_db.execute("insert into scaffold_map values (?,?,?,?,?,?)", row)
    if genome.revised_fasta:
        scaffold_seq.description = "length={}".format(len(scaffold_seq))
        scaffold_seq.id = scaffold_name
        SeqIO.write(scaffold_seq, genome.revised_fasta, "fasta")

    if not chromosome_set:
        return

    # One chromosome of the set (whichever iteration yields first) receives
    # the scaffold line followed by a 100-long 'N' fragment line.
    chrom = next(iter(chromosome_set))
    target = chromosomes[chrom]
    agp_object = "chr{}_unmapped".format(chrom)
    chr_unmapped_end = target.unmapped_start + scaffold_length - 1
    target.agp.append("{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format(agp_object, target.unmapped_start, chr_unmapped_end, target.unmapped_part, scaffold_name, scaffold_length))
    target.unmapped_part += 1
    target.agp.append("{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format(agp_object, chr_unmapped_end + 1, chr_unmapped_end + 100, target.unmapped_part))
    target.unmapped_part += 1
    target.unmapped_start = chr_unmapped_end + 101
コード例 #18
0
            # With several trim locations, merge them (merge_intervals is
            # defined outside this fragment) and order them by their first
            # coordinate, largest first -- presumably so that cutting a
            # later region does not shift the coordinates of earlier ones.
            if len(trimloc) > 1:
                trimloc = sorted(merge_intervals(trimloc),
                                 reverse=True,
                                 key=lambda locitem: locitem[0])

            # Length before trimming; not used in the visible part of this
            # fragment.
            seqlen = len(record)
            for loc in trimloc:
                # loc[0] is shifted down by one, loc[1] is used as given.
                left = int(loc[0]) - 1
                right = int(loc[1])
                # Placeholder only: every branch below rebinds newrecord.
                newrecord = Seq("", generic_dna)
                log.write("trimming %d to %d in %s len=%d" %
                          (left, right, record.id, len(record)))
                if left == 0:
                    # Region starts at the first position: keep the tail.
                    # NOTE(review): the tail starts at right - 1, so it
                    # includes the last position of the region -- confirm
                    # this is the intended coordinate convention.
                    newrecord = record[right - 1:]
                elif right == len(record):
                    # Region runs to the current end: keep the head.
                    newrecord = record[:left]
                else:
                    # Internal region: join the parts on either side of it.
                    log.write("-->internal slicing :%d .. %d:" %
                              (left, right - 1))
                    log.write('  left string is %s' % record[0:left])
                    log.write('  right string is %s' % record[right - 1:])
                    newrecord = record[0:left] + record[right - 1:]
                    # The id is copied explicitly only in this branch.
                    newrecord.id = record.id
                    log.write(" -- new len for %s is %d: %s" %
                              (newrecord.id, len(newrecord), newrecord))
                # The next location is applied to the already-trimmed record.
                record = newrecord

        # Only records that are at least 200 long are written out.
        if (len(record) >= 200):
            SeqIO.write(record, output_handle, "fasta")
コード例 #19
0
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output,
                            genome, chromosomes):
    """Account for one unmapped scaffold and either write it or discard it.

    ``outblocks`` is scanned once to collect database rows (when
    ``genome.revised_db`` is set), the concatenated sequence (when
    ``genome.revised_fasta`` is set) and the chromosome number of every
    block whose chromosome is not ``'0'``.  ``stats`` is updated in place.

    The scaffold is kept when the assembled sequence has features, or
    when at least one chromosome was seen and more than one database row
    was collected.  A kept scaffold is registered under a new name,
    written to the enabled outputs and, if a chromosome was seen,
    appended to that chromosome's ``agp`` lines.  Otherwise it is counted
    as discarded and the matching original parts are marked
    ``'removed'``.

    Returns None; all results are side effects on the arguments.
    """
    scaffold_seq = Seq('')
    db_rows = []
    chromosome_numbers = []
    for block in outblocks:
        if genome.revised_db:
            db_rows.append([
                block.chromosome, block.cm, scaffold, block.start, block.end,
                block.length
            ])
        if genome.revised_fasta:
            # Block coordinates are used as a 1-based inclusive range.
            whole_scaffold = genome.sequences[scaffold]
            scaffold_seq += whole_scaffold[block.start - 1:block.end]
        if block.chromosome != '0':
            chromosome_numbers.append(int(block.chromosome))

    stats['scaffolds'] += 1

    first_block, last_block = outblocks[0], outblocks[-1]
    scaffold_start = first_block.start
    scaffold_end = last_block.end
    scaffold_length = scaffold_end - scaffold_start + 1
    stats['scaffold_length'] += scaffold_length
    chromosome_set = set(chromosome_numbers)

    # NOTE(review): ``features`` is read even when nothing was appended
    # and the value is still the initial Seq('') -- confirm that object
    # exposes a ``features`` attribute in that case.
    keep = scaffold_seq.features or (chromosome_set and len(db_rows) > 1)

    if not keep:
        stats['discard_scaffolds'] += 1
        stats['discard_length'] += scaffold_length
        for row in db_rows:
            row_scaffold, row_start, row_end = row[2], row[3], row[4]
            row_length = row[5]
            parts_length = 0
            for newpart in genome.newparts[scaffold]:
                for origpart in genome.origparts[newpart.oldname]:
                    # A part matches when it has the row's scaffold name
                    # and starts or ends inside the row's range.
                    matches_row = row_scaffold == origpart.newname and (
                        row_start <= origpart.newstart <= row_end
                        or row_start <= origpart.newend <= row_end)
                    if not matches_row:
                        continue
                    if origpart.parttype in ['active', 'retained']:
                        parts_length += (origpart.newend -
                                         origpart.newstart + 1)
                        origpart.parttype = 'removed'
            # Report rows whose length is not fully covered by removed
            # parts.
            if row_length != parts_length:
                print(scaffold, row_length, parts_length, row)
        return

    unmapped_output.append(
        [gd.Block(scaffold, scaffold_start, scaffold_end)])

    # Allocate the next revised name and remember it under the span.
    scaffold_name = genome.revised + "{:05d}".format(genome.revised_count)
    genome.revised_count += 1
    span_key = "{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)
    genome.revised_names[span_key] = scaffold_name

    stats['written_scaffolds'] += 1
    stats['written_length'] += scaffold_length
    if genome.revised_db:
        for row in db_rows:
            genome.revised_db.execute(
                "insert into scaffold_map values (?,?,?,?,?,?)", row)
    if genome.revised_fasta:
        scaffold_seq.description = "length={}".format(len(scaffold_seq))
        scaffold_seq.id = scaffold_name
        SeqIO.write(scaffold_seq, genome.revised_fasta, "fasta")

    if not chromosome_set:
        return

    # One chromosome of the set (whichever iteration yields first)
    # receives the scaffold line followed by a 100-long 'N' fragment line.
    chrom = next(iter(chromosome_set))
    target = chromosomes[chrom]
    agp_object = "chr{}_unmapped".format(chrom)
    chr_unmapped_end = target.unmapped_start + scaffold_length - 1
    target.agp.append(
        "{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format(
            agp_object, target.unmapped_start, chr_unmapped_end,
            target.unmapped_part, scaffold_name, scaffold_length))
    target.unmapped_part += 1
    target.agp.append(
        "{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format(
            agp_object, chr_unmapped_end + 1, chr_unmapped_end + 100,
            target.unmapped_part))
    target.unmapped_part += 1
    target.unmapped_start = chr_unmapped_end + 101