Beispiel #1
0
    def testSummaryX4(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTATGAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACGAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_3,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTATGAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_3,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACGAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,2,0.454349263704,2.6,X4,CMRPNNNTRKSIHIGPGRAFYATGEIIGDIRRAHC,CMRPN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RRAHC,,
2,1,0.0677537070158,42.3,R5,CTRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CTRPN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,
"""
        expected_summary_csv = """\
mapped,valid,X4calls,X4pct,final
3,3,2,66.67,X4
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv,
                self.g2p_summary_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
        self.assertEqual(expected_summary_csv, self.g2p_summary_csv.getvalue())
Beispiel #2
0
    def testPartialCodon(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,8M,=,877,8,TGTACAGG,AAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,8M,=,877,-8,TGTACAGG,AAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,notdiv3,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #3
0
    def testLowQuality(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TNTNNNGGN,A#A###AA#
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TNTNNNGGN,A#A###AA#
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,low quality,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #4
0
    def testOverlap(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,12M,=,886,12,TGTACAAGACCC,AAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,886,44,9M,=,877,-9,CCCAACAAC,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPNN,,cysteines,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #5
0
    def testDeletionAtStart(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,874,44,3M3D6M,=,874,9,TGTGGGTGT,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,874,44,3M3D6M,=,874,-9,TGTGGGTGT,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,-GC,,cysteines,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #6
0
    def testOverlap(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,12M,=,886,12,TGTACAAGACCC,AAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,886,44,9M,=,877,-9,CCCAACAAC,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPNN,,cysteines,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #7
0
    def testLowQuality(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TNTNNNGGN,A#A###AA#
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TNTNNNGGN,A#A###AA#
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,low quality,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #8
0
    def testPartialCodon(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,8M,=,877,8,TGTACAGG,AAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,8M,=,877,-8,TGTACAGG,AAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,notdiv3,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #9
0
    def testLengthMinimum(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,51M,=,925,51,TGTGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,925,44,48M,=,877,-48,AAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.806326707173,1.5,X4,CGGGGGGGGGGGGGGGKGGGGGGGGGGGGGGC,---CG-GGG--GGGGGG---GGGG---GKGGG----GGGGGGG--GGGGC,,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #10
0
    def testDeletionAtStart(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,874,44,3M3D6M,=,874,9,TGTGGGTGT,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,874,44,3M3D6M,=,874,-9,TGTGGGTGT,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,-GC,,cysteines,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #11
0
    def testLengthTooShort(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,51M,=,925,51,TGTGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,925,44,45M,=,877,-45,AAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CGGGGGGGGGGGGGGGKGGGGGGGGGGGGGC,,length,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #12
0
    def testLengthTooShort(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,51M,=,925,51,TGTGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,925,44,45M,=,877,-45,AAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CGGGGGGGGGGGGGGGKGGGGGGGGGGGGGC,,length,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #13
0
    def testStopCodon(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTTAGTGT,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTTAGTGT,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,C*C,,stop codons,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #14
0
    def testLengthMinimum(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,51M,=,925,51,TGTGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,925,44,48M,=,877,-48,AAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.806326707173,1.5,X4,CGGGGGGGGGGGGGGGKGGGGGGGGGGGGGGC,---CG-GGG--GGGGGG---GGGG---GKGGG----GGGGGGG--GGGGC,,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #15
0
    def testStopCodon(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTTAGTGT,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTTAGTGT,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,C*C,,stop codons,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #16
0
    def testAmbiguousAtTwoPositions(self):
        """ Same thing with codons 9 and 18 - rejected. """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAA#AAAAAAAAAAAAAAAAAAAAAAAAAA#AA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPNNNTXKSIHIGPGXAFYATGEIIGDIRQAHC,,> 2 ambiguous,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #17
0
    def testAllClipped(self):
        """ In this scenario, the reads map outside the clipping region. """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,868,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,868,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,zerolength,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #18
0
    def testAllClipped(self):
        """ In this scenario, the reads map outside the clipping region. """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,868,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,868,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,zerolength,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #19
0
    def testAmbiguousAtTwoPositions(self):
        """ Same thing with codons 9 and 18 - rejected. """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAA#AAAAAAAAAAAAAAAAAAAAAAAAAA#AA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPNNNTXKSIHIGPGXAFYATGEIIGDIRQAHC,,> 2 ambiguous,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #20
0
    def testAmbiguousMixture(self):
        """ Marking position 9 as low quality means codon 3 could be S or R.
        """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.0663051848427,43.0,R5,CT[RS]PNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CT[RS]PN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,ambiguous
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #21
0
    def testAmbiguousMixture(self):
        """ Marking position 9 as low quality means codon 3 could be S or R.
        """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.0663051848427,43.0,R5,CT[RS]PNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CT[RS]PN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,ambiguous
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #22
0
    def testAmbiguousMixtureThreeChoices(self):
        """ Marking position 14 as low quality means codon 5 could be L, S, or *.
        """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,21M,=,877,56,TGTACAAGACCCTTAAACTGT,AAAAAAAAAAAAA#AAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,21M,=,877,56,TGTACAAGACCCTTAAACTGT,AAAAAAAAAAAAA#AAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPXNC,,> 2 ambiguous,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #23
0
    def testAmbiguousMixtureThreeChoices(self):
        """ Marking position 14 as low quality means codon 5 could be L, S, or *.
        """
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,21M,=,877,56,TGTACAAGACCCTTAAACTGT,AAAAAAAAAAAAA#AAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,21M,=,877,56,TGTACAAGACCCTTAAACTGT,AAAAAAAAAAAAA#AAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPXNC,,> 2 ambiguous,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #24
0
def test(remap_lines, temp_prefix, pssm, ruby_script, delete_results=True):
    """ Score the same reads with ruby_script and with Python, then diff.

    The remap lines are written to a temporary CSV file, both scoring
    implementations are run on it, and their output files are compared
    with the diff command.

    @param remap_lines: lines of text to write to the temporary remap file
    @param temp_prefix: prefix for the temporary remap file's name
    @param pssm: passed through to sam_g2p
    @param ruby_script: path of the script to execute; it is run with its
        own folder as the working directory
    @param delete_results: True if both output files should be removed
        before returning
    @return: 'PASS' if diff exits with a zero status, 'FAIL' otherwise
    """
    with NamedTemporaryFile(suffix=".csv",
                            prefix=temp_prefix,
                            delete=True) as remap_file:
        for remap_line in remap_lines:
            remap_file.write(remap_line)
        remap_file.flush()
        # Rewind, so that sam_g2p can read the same file handle from the top.
        remap_file.seek(0)

        # Strip up to two extensions from the temporary file's name.
        single_stripped = os.path.splitext(remap_file.name)[0]
        base_name = os.path.splitext(single_stripped)[0]
        nuc_path = base_name + ".nuc.csv"
        ruby_out_path = base_name + "_rbg2p.csv"
        python_out_path = base_name + "_pyg2p.csv"
        script_folder = os.path.dirname(ruby_script)

        try:
            ruby_command = [ruby_script,
                            remap_file.name,
                            nuc_path,
                            ruby_out_path]
            check_call(ruby_command, cwd=script_folder)
            with open(nuc_path, 'rU') as nuc_csv, \
                 open(python_out_path, 'wb') as g2p_csv:
                sam_g2p(pssm, remap_file, nuc_csv, g2p_csv)

            diff_command = ['diff', '-q', ruby_out_path, python_out_path]
            with open(os.devnull, 'w') as devnull:
                is_diff = call(diff_command, stdout=devnull)
            if is_diff:
                result = 'FAIL'
            else:
                result = 'PASS'
            logger.info('{} lines: {}'.format(len(remap_lines), result))
            return result
        finally:
            if delete_results:
                for out_path in (ruby_out_path, python_out_path):
                    if os.path.exists(out_path):
                        os.remove(out_path)
Beispiel #25
0
    def testVariants(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
Example_read_2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
Example_read_3,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
Example_read_3,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,2,,,,CTR,,cysteines,
2,1,,,,CTG,,cysteines,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #26
0
    def testVariants(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
Example_read_2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
Example_read_3,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
Example_read_3,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,2,,,,CTR,,cysteines,
2,1,,,,CTG,,cysteines,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
Beispiel #27
0
    def testSummaryFailed(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTR,,cysteines,
"""
        expected_summary_csv = """\
mapped,valid,X4calls,X4pct,final
1,0,0,,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv, self.g2p_summary_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
        self.assertEqual(expected_summary_csv, self.g2p_summary_csv.getvalue())
Beispiel #28
0
    def testSummarySuccess(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.0677537070158,42.3,R5,CTRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CTRPN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,
"""
        expected_summary_csv = """\
mapped,valid,X4calls,X4pct,final
1,1,0,0.00,R5
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv, self.g2p_summary_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
        self.assertEqual(expected_summary_csv, self.g2p_summary_csv.getvalue())
Beispiel #29
0
    def testSummaryFailed(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTR,,cysteines,
"""
        expected_summary_csv = """\
mapped,valid,X4calls,X4pct,final
1,0,0,,
"""

        sam_g2p(self.pssm, remap_csv, self.nuc_csv, self.g2p_csv,
                self.g2p_summary_csv)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
        self.assertEqual(expected_summary_csv, self.g2p_summary_csv.getvalue())
Beispiel #30
0
def test(remap_lines, temp_prefix, pssm, ruby_script, delete_results=True):
    """ Score the same reads with ruby_script and with Python, then diff.

    The remap lines are written to a temporary CSV file, both scoring
    implementations are run on it, and their output files are compared
    with the diff command.

    @param remap_lines: lines of text to write to the temporary remap file
    @param temp_prefix: prefix for the temporary remap file's name
    @param pssm: passed through to sam_g2p
    @param ruby_script: path of the script to execute; it is run with its
        own folder as the working directory
    @param delete_results: True if both output files should be removed
        before returning
    @return: 'PASS' if diff exits with a zero status, 'FAIL' otherwise
    """
    with NamedTemporaryFile(suffix=".csv",
                            prefix=temp_prefix,
                            delete=True) as remap_file:
        for remap_line in remap_lines:
            remap_file.write(remap_line)
        remap_file.flush()
        # Rewind, so that sam_g2p can read the same file handle from the top.
        remap_file.seek(0)

        # Strip up to two extensions from the temporary file's name.
        single_stripped = os.path.splitext(remap_file.name)[0]
        base_name = os.path.splitext(single_stripped)[0]
        nuc_path = base_name + ".nuc.csv"
        ruby_out_path = base_name + "_rbg2p.csv"
        python_out_path = base_name + "_pyg2p.csv"
        script_folder = os.path.dirname(ruby_script)

        try:
            ruby_command = [ruby_script,
                            remap_file.name,
                            nuc_path,
                            ruby_out_path]
            check_call(ruby_command, cwd=script_folder)
            with open(nuc_path, 'rU') as nuc_csv, \
                 open(python_out_path, 'wb') as g2p_csv:
                sam_g2p(pssm, remap_file, nuc_csv, g2p_csv)

            diff_command = ['diff', '-q', ruby_out_path, python_out_path]
            with open(os.devnull, 'w') as devnull:
                is_diff = call(diff_command, stdout=devnull)
            if is_diff:
                result = 'FAIL'
            else:
                result = 'PASS'
            logger.info('{} lines: {}'.format(len(remap_lines), result))
            return result
        finally:
            if delete_results:
                for out_path in (ruby_out_path, python_out_path):
                    if os.path.exists(out_path):
                        os.remove(out_path)
Beispiel #31
0
    def testMinCount(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
variant1_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant1_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant1_read3,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read3,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant2_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
variant2_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
variant2_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
variant2_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
variant3_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGAA,AAAAAAAAA
variant3_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGAA,AAAAAAAAA
variant3_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGAA,AAAAAAAAA
variant3_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGAA,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,3,,,,CTR,,cysteines,
2,4,,,,,,count < 3,
"""
        expected_summary_csv = """\
mapped,valid,X4calls,X4pct,final
7,0,0,,
"""

        sam_g2p(self.pssm,
                remap_csv,
                self.nuc_csv,
                self.g2p_csv,
                self.g2p_summary_csv,
                min_count=3)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
        self.assertEqual(expected_summary_csv, self.g2p_summary_csv.getvalue())
Beispiel #32
0
    def testMinCount(self):
        remap_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
variant1_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant1_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant1_read3,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read3,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant2_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
variant2_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
variant2_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
variant2_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
variant3_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGAA,AAAAAAAAA
variant3_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGAA,AAAAAAAAA
variant3_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGAA,AAAAAAAAA
variant3_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGAA,AAAAAAAAA
""")
        expected_g2p_csv = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,3,,,,CTR,,cysteines,
2,4,,,,,,count < 3,
"""
        expected_summary_csv = """\
mapped,valid,X4calls,X4pct,final
7,0,0,,
"""

        sam_g2p(self.pssm,
                remap_csv,
                self.nuc_csv,
                self.g2p_csv,
                self.g2p_summary_csv,
                min_count=3)

        self.assertEqual(expected_g2p_csv, self.g2p_csv.getvalue())
        self.assertEqual(expected_summary_csv, self.g2p_summary_csv.getvalue())
Beispiel #33
0
 def process_sample(self, fastq1, progress, prefixes, image_paths, error_log):
     """ Run one sample through each pipeline step, from mapping to G2P.

     @param fastq1: path of the forward reads file; the reverse reads path
         is derived from it by text replacement
     @param progress: text shown after the sample name in the first message
     @param prefixes: the sample's prefix gets appended to this
     @param image_paths: the result of coverage_plot is added to this
         with +=
     @param error_log: passed as stderr to prelim_map and remap
     @raise IOError: when the reverse reads file does not exist
     """
     fastq2 = fastq1.replace('_R1_001', '_R2_001')
     fastq2 = fastq2.replace('censored1', 'censored2')
     if not os.path.exists(fastq2):
         raise IOError('ERROR: Missing R2 file for {}'.format(fastq1))

     prefix = os.path.basename(fastq1)
     prefix = prefix.replace('_L001_R1_001.fastq', '')
     prefix = prefix.replace('.censored1.fastq', '')
     prefixes.append(prefix)

     def work_path(suffix):
         """ Build the path of one of this sample's files in the work folder. """
         return os.path.join(self.workdir, prefix + suffix)

     # The preliminary map's path is not joined to the work folder.
     prelim_path = prefix + '.prelim.csv'
     self.write('Processing sample {} ({})\n'.format(prefix, progress))
     with open(prelim_path, 'wb') as prelim_csv:
         prelim_map(fastq1,
                    fastq2,
                    prelim_csv,
                    nthreads=self.nthreads,
                    callback=self.callback,
                    stderr=error_log)

     # remap stage: reads the preliminary map, writes five output files
     with open(prelim_path, 'rU') as prelim_csv, \
          open(work_path('.remap.csv'), 'wb') as remap_csv, \
          open(work_path('.remap_counts.csv'), 'wb') as counts_csv, \
          open(work_path('.remap_conseq.csv'), 'wb') as conseq_csv, \
          open(work_path('.unmapped1.fastq'), 'w') as unmapped1, \
          open(work_path('.unmapped2.fastq'), 'w') as unmapped2:

         self.write('... remapping\n')
         self.parent.update()
         self.progress_bar['value'] = 0
         remap(fastq1,
               fastq2,
               prelim_csv,
               remap_csv,
               counts_csv,
               conseq_csv,
               unmapped1,
               unmapped2,
               self.workdir,
               nthreads=self.nthreads,
               callback=self.callback,
               stderr=error_log)

     # conversion from SAM format to alignment
     with open(work_path('.remap.csv'), 'rU') as remap_csv, \
          open(work_path('.aligned.csv'), 'wb') as aligned_csv, \
          open(work_path('.insert.csv'), 'wb') as insert_csv, \
          open(work_path('.failed.csv'), 'wb') as failed_csv:

         self.write('... converting into alignment\n')
         self.parent.update()
         sam2aln(remap_csv,
                 aligned_csv,
                 insert_csv,
                 failed_csv,
                 nthreads=self.nthreads)

     # counting stage: reads the alignment, writes six output files
     with open(work_path('.aligned.csv'), 'rU') as aligned_csv, \
          open(work_path('.nuc.csv'), 'wb') as nuc_csv, \
          open(work_path('.amino.csv'), 'wb') as amino_csv, \
          open(work_path('.coord_ins.csv'), 'wb') as coord_ins_csv, \
          open(work_path('.conseq.csv'), 'wb') as conseq_csv, \
          open(work_path('.failed_align.csv'), 'wb') as failed_align_csv, \
          open(work_path('.nuc_variants.csv'), 'wb') as nuc_variants_csv:

         self.parent.update()
         aln2counts(aligned_csv,
                    nuc_csv,
                    amino_csv,
                    coord_ins_csv,
                    conseq_csv,
                    failed_align_csv,
                    nuc_variants_csv,
                    callback=self.callback)

     self.write('... generating coverage plots\n')
     self.parent.update()
     with open(work_path('.amino.csv'), 'rU') as amino_csv:
         image_paths += coverage_plot(amino_csv)

     self.write('... performing g2p scoring on samples covering HIV-1 V3\n')
     self.parent.update()
     with open(work_path('.remap.csv'), 'rU') as remap_csv, \
          open(work_path('.nuc.csv'), 'rU') as nuc_csv, \
          open(work_path('.g2p.csv'), 'wb') as g2p_csv:

         sam_g2p(pssm=self.pssm,
                 remap_csv=remap_csv,
                 nuc_csv=nuc_csv,
                 g2p_csv=g2p_csv)
Beispiel #34
0
    def process_sample(self, fastq1, progress, prefixes, image_paths,
                       error_log):
        """Run one paired-end sample through each pipeline stage in turn.

        Stages, in order: prelim_map, remap, sam2aln, aln2counts,
        coverage_plot and sam_g2p. Each stage reads a file written by an
        earlier stage and writes its own outputs.

        :param fastq1: path to the read 1 FASTQ file; the read 2 path is
            derived from it by swapping '_R1_001' for '_R2_001' and
            'censored1' for 'censored2'
        :param progress: text shown after the sample name in the status line
        :param prefixes: collection that receives this sample's prefix via
            append()
        :param image_paths: collection that receives the result of
            coverage_plot() via +=
        :param error_log: handed to prelim_map() and remap() as stderr
        :raises IOError: if the derived read 2 file does not exist
        """
        mate_path = fastq1.replace('_R1_001', '_R2_001')
        mate_path = mate_path.replace('censored1', 'censored2')
        if not os.path.exists(mate_path):
            raise IOError('ERROR: Missing R2 file for {}'.format(fastq1))

        # The sample prefix is the file name with either known tail removed.
        prefix = os.path.basename(fastq1)
        prefix = prefix.replace('_L001_R1_001.fastq', '')
        prefix = prefix.replace('.censored1.fastq', '')
        prefixes.append(prefix)

        def work_file(suffix):
            # Path of one of this sample's files inside the working folder.
            return os.path.join(self.workdir, prefix + suffix)

        # NOTE(review): the prelim file is opened relative to the current
        # directory, not self.workdir like every other file - confirm that
        # this is intended.
        prelim_path = prefix + '.prelim.csv'
        self.write('Processing sample {} ({})\n'.format(prefix, progress))
        with open(prelim_path, 'wb') as prelim_out:
            prelim_map(fastq1,
                       mate_path,
                       prelim_out,
                       nthreads=self.nthreads,
                       callback=self.callback,
                       stderr=error_log)

        # remap stage: reads the preliminary mapping, writes the remap files
        with open(prelim_path, 'rU') as prelim_csv, \
                open(work_file('.remap.csv'), 'wb') as remap_csv, \
                open(work_file('.remap_counts.csv'), 'wb') as counts_csv, \
                open(work_file('.remap_conseq.csv'), 'wb') as conseq_csv, \
                open(work_file('.unmapped1.fastq'), 'w') as unmapped1, \
                open(work_file('.unmapped2.fastq'), 'w') as unmapped2:

            self.write('... remapping\n')
            self.parent.update()
            self.progress_bar['value'] = 0
            remap(fastq1,
                  mate_path,
                  prelim_csv,
                  remap_csv,
                  counts_csv,
                  conseq_csv,
                  unmapped1,
                  unmapped2,
                  self.workdir,
                  nthreads=self.nthreads,
                  callback=self.callback,
                  stderr=error_log)

        # sam2aln stage: converts the SAM-format remap output to alignments
        with open(work_file('.remap.csv'), 'rU') as remap_csv, \
                open(work_file('.aligned.csv'), 'wb') as aligned_csv, \
                open(work_file('.insert.csv'), 'wb') as insert_csv, \
                open(work_file('.failed.csv'), 'wb') as failed_csv:

            self.write('... converting into alignment\n')
            self.parent.update()
            sam2aln(remap_csv,
                    aligned_csv,
                    insert_csv,
                    failed_csv,
                    nthreads=self.nthreads)

        # aln2counts stage: reads the alignments, writes the count files
        with open(work_file('.aligned.csv'), 'rU') as aligned_csv, \
                open(work_file('.nuc.csv'), 'wb') as nuc_csv, \
                open(work_file('.amino.csv'), 'wb') as amino_csv, \
                open(work_file('.coord_ins.csv'), 'wb') as coord_ins_csv, \
                open(work_file('.conseq.csv'), 'wb') as conseq_csv, \
                open(work_file('.failed_align.csv'), 'wb') as failed_align_csv, \
                open(work_file('.nuc_variants.csv'), 'wb') as nuc_variants_csv:

            self.parent.update()
            aln2counts(aligned_csv,
                       nuc_csv,
                       amino_csv,
                       coord_ins_csv,
                       conseq_csv,
                       failed_align_csv,
                       nuc_variants_csv,
                       callback=self.callback)

        self.write('... generating coverage plots\n')
        self.parent.update()
        with open(work_file('.amino.csv'), 'rU') as amino_csv:
            image_paths += coverage_plot(amino_csv)

        self.write('... performing g2p scoring on samples covering HIV-1 V3\n')
        self.parent.update()
        with open(work_file('.remap.csv'), 'rU') as remap_csv, \
                open(work_file('.nuc.csv'), 'rU') as nuc_csv, \
                open(work_file('.g2p.csv'), 'wb') as g2p_csv:

            sam_g2p(pssm=self.pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv)
Beispiel #35
0
def process_sample(sample_index, run_info, data_path, pssm):
    """ Process a single sample.

    Finds the sample's R1/R2 FASTQ pair, censors both reads, then runs
    prelim_map, remap, sam2aln, aln2counts and coverage_plot. sam_g2p only
    runs when the V3LOOP row of coverage_scores.csv has an on.score of '4'.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param str data_path: the root folder for all BaseSpace data
    :param pssm: the pssm library for running G2P analysis
    :raises RuntimeError: if no R1 file is found under the sample folder,
        or the matching R2 file does not exist
    """
    scratch_path = os.path.join(data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    sample_dir = os.path.join(data_path,
                              'input',
                              'samples',
                              sample_id,
                              'Data',
                              'Intensities',
                              'BaseCalls')
    if not os.path.exists(sample_dir):
        # Fall back to searching the whole sample folder.
        sample_dir = os.path.join(data_path,
                                  'input',
                                  'samples',
                                  sample_id)

    # Take the first R1 file found anywhere below the sample folder.
    read1_dir = None
    read1_name = None
    for root, _dirs, files in os.walk(sample_dir):
        read1_names = fnmatch.filter(files, '*_R1_*')
        if read1_names:
            read1_dir = root
            read1_name = read1_names[0]
            break
    if read1_name is None:
        raise RuntimeError('No R1 file found for sample id {}.'.format(sample_id))
    sample_path = os.path.join(read1_dir, read1_name)
    # Swap the read tag in the file name only. Replacing it across the whole
    # path would also rewrite any folder name that happens to contain '_R1_'
    # and then report a missing R2 file that is actually present.
    sample_path2 = os.path.join(read1_dir,
                                read1_name.replace('_R1_', '_R2_'))
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id,
            sample_path2))

    sample_number = sample_index + 1
    sample_count = len(run_info.samples)
    logger.info('Processing sample %s (%d of %d): %s (%s).',
                sample_id,
                sample_number,
                sample_count,
                sample_name,
                sample_path)

    sample_out_path = create_app_result(data_path,
                                        run_info,
                                        sample_info,
                                        description='Mapping results',
                                        suffix='_QC')

    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    # Both reads are censored against the same bad cycles file.
    bad_cycles_path = os.path.join(scratch_path, 'bad_cycles.csv')
    censored_path1 = os.path.join(sample_scratch_path, 'censored1.fastq')
    read_summary_path1 = os.path.join(sample_scratch_path, 'read1_summary.csv')
    censor_sample(sample_path,
                  bad_cycles_path,
                  censored_path1,
                  read_summary_path1)
    censored_path2 = os.path.join(sample_scratch_path, 'censored2.fastq')
    read_summary_path2 = os.path.join(sample_scratch_path, 'read2_summary.csv')
    censor_sample(sample_path2,
                  bad_cycles_path,
                  censored_path2,
                  read_summary_path2)

    logger.info('Running prelim_map (%d of %d).', sample_number, sample_count)
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'wb') as prelim_csv:
        prelim_map(censored_path1,
                   censored_path2,
                   prelim_csv)

    logger.info('Running remap (%d of %d).', sample_number, sample_count)
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'rU') as prelim_csv, \
            open(os.path.join(sample_scratch_path, 'remap.csv'), 'wb') as remap_csv, \
            open(os.path.join(sample_out_path, 'remap_counts.csv'), 'wb') as counts_csv, \
            open(os.path.join(sample_out_path, 'remap_conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(sample_out_path, 'unmapped2.fastq'), 'w') as unmapped2:

        remap(censored_path1,
              censored_path2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              nthreads=1)

    logger.info('Running sam2aln (%d of %d).', sample_number, sample_count)
    with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(sample_scratch_path, 'aligned.csv'), 'wb') as aligned_csv, \
            open(os.path.join(sample_out_path, 'conseq_ins.csv'), 'wb') as insert_csv, \
            open(os.path.join(sample_out_path, 'failed_read.csv'), 'wb') as failed_csv:

        sam2aln(remap_csv, aligned_csv, insert_csv, failed_csv)

    logger.info('Running aln2counts (%d of %d).', sample_number, sample_count)
    with open(os.path.join(sample_scratch_path, 'aligned.csv'), 'rU') as aligned_csv, \
            open(os.path.join(sample_out_path, 'nuc.csv'), 'wb') as nuc_csv, \
            open(os.path.join(sample_out_path, 'amino.csv'), 'wb') as amino_csv, \
            open(os.path.join(sample_out_path, 'coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(os.path.join(sample_out_path, 'conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'failed_align.csv'), 'wb') as failed_align_csv, \
            open(os.path.join(sample_out_path, 'nuc_variants.csv'), 'wb') as nuc_variants_csv, \
            open(os.path.join(sample_scratch_path, 'coverage_summary.csv'), 'wb') as coverage_summary_csv:

        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   coverage_summary_csv=coverage_summary_csv)

    logger.info('Running coverage_plots (%d of %d).', sample_number, sample_count)
    coverage_path = os.path.join(sample_out_path, 'coverage')
    with open(os.path.join(sample_out_path, 'amino.csv'), 'rU') as amino_csv, \
            open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv, coverage_scores_csv, path_prefix=coverage_path)

    # Only the first V3LOOP row decides whether G2P scoring is run.
    with open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'rU') as coverage_scores_csv:
        reader = csv.DictReader(coverage_scores_csv)
        is_v3loop_good = False
        for row in reader:
            if row['region'] == 'V3LOOP':
                is_v3loop_good = row['on.score'] == '4'
                break

    if is_v3loop_good:
        logger.info('Running sam_g2p (%d of %d).', sample_number, sample_count)
        g2p_path = create_app_result(data_path,
                                     run_info,
                                     sample_info,
                                     description='Geno To Pheno results',
                                     suffix='_G2P')
        with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
                open(os.path.join(sample_out_path, 'nuc.csv'), 'rU') as nuc_csv, \
                open(os.path.join(g2p_path, 'g2p.csv'), 'wb') as g2p_csv, \
                open(os.path.join(g2p_path, 'g2p_summary.csv'), 'wb') as g2p_summary_csv:

            sam_g2p(pssm=pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv,
                    g2p_summary_csv=g2p_summary_csv,
                    min_count=DEFAULT_MIN_COUNT)
Beispiel #36
0
def process_sample(sample_index, run_info, data_path, pssm):
    """ Process a single sample.

    Locates the R1/R2 FASTQ pair for the sample, censors both reads, and
    then runs prelim_map, remap, sam2aln, aln2counts and coverage_plot.
    sam_g2p is run afterwards only if the V3LOOP row in coverage_scores.csv
    has an on.score of '4'.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param str data_path: the root folder for all BaseSpace data
    :param pssm: the pssm library for running G2P analysis
    :raises RuntimeError: if no R1 file is found under the sample folder,
        or the matching R2 file does not exist
    """
    scratch_path = os.path.join(data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    sample_dir = os.path.join(data_path, 'input', 'samples', sample_id, 'Data',
                              'Intensities', 'BaseCalls')
    if not os.path.exists(sample_dir):
        # Fall back to searching the whole sample folder.
        sample_dir = os.path.join(data_path, 'input', 'samples', sample_id)

    # Use the first R1 file found anywhere below the sample folder.
    r1_folder = None
    r1_file_name = None
    for root, _dirs, files in os.walk(sample_dir):
        r1_matches = fnmatch.filter(files, '*_R1_*')
        if r1_matches:
            r1_folder, r1_file_name = root, r1_matches[0]
            break
    if r1_file_name is None:
        raise RuntimeError(
            'No R1 file found for sample id {}.'.format(sample_id))
    sample_path = os.path.join(r1_folder, r1_file_name)
    # Derive the R2 name from the file name alone: replacing '_R1_' across
    # the full path would also alter folder names that contain it, and a
    # present R2 file would then be reported as missing.
    sample_path2 = os.path.join(r1_folder,
                                r1_file_name.replace('_R1_', '_R2_'))
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id, sample_path2))

    position = sample_index + 1
    total = len(run_info.samples)
    logger.info('Processing sample %s (%d of %d): %s (%s).', sample_id,
                position, total, sample_name, sample_path)

    sample_out_path = create_app_result(data_path,
                                        run_info,
                                        sample_info,
                                        description='Mapping results',
                                        suffix='_QC')

    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    # The same bad cycles file is used to censor both reads.
    bad_cycles_path = os.path.join(scratch_path, 'bad_cycles.csv')
    censored_path1 = os.path.join(sample_scratch_path, 'censored1.fastq')
    read_summary_path1 = os.path.join(sample_scratch_path, 'read1_summary.csv')
    censor_sample(sample_path, bad_cycles_path, censored_path1,
                  read_summary_path1)
    censored_path2 = os.path.join(sample_scratch_path, 'censored2.fastq')
    read_summary_path2 = os.path.join(sample_scratch_path, 'read2_summary.csv')
    censor_sample(sample_path2, bad_cycles_path, censored_path2,
                  read_summary_path2)

    logger.info('Running prelim_map (%d of %d).', position, total)
    with open(os.path.join(sample_scratch_path, 'prelim.csv'),
              'wb') as prelim_csv:
        prelim_map(censored_path1, censored_path2, prelim_csv)

    logger.info('Running remap (%d of %d).', position, total)
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'rU') as prelim_csv, \
            open(os.path.join(sample_scratch_path, 'remap.csv'), 'wb') as remap_csv, \
            open(os.path.join(sample_out_path, 'remap_counts.csv'), 'wb') as counts_csv, \
            open(os.path.join(sample_out_path, 'remap_conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(sample_out_path, 'unmapped2.fastq'), 'w') as unmapped2:

        remap(censored_path1,
              censored_path2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              nthreads=1)

    logger.info('Running sam2aln (%d of %d).', position, total)
    with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(sample_scratch_path, 'aligned.csv'), 'wb') as aligned_csv, \
            open(os.path.join(sample_out_path, 'conseq_ins.csv'), 'wb') as insert_csv, \
            open(os.path.join(sample_out_path, 'failed_read.csv'), 'wb') as failed_csv:

        sam2aln(remap_csv, aligned_csv, insert_csv, failed_csv)

    logger.info('Running aln2counts (%d of %d).', position, total)
    with open(os.path.join(sample_scratch_path, 'aligned.csv'), 'rU') as aligned_csv, \
            open(os.path.join(sample_out_path, 'nuc.csv'), 'wb') as nuc_csv, \
            open(os.path.join(sample_out_path, 'amino.csv'), 'wb') as amino_csv, \
            open(os.path.join(sample_out_path, 'coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(os.path.join(sample_out_path, 'conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'failed_align.csv'), 'wb') as failed_align_csv, \
            open(os.path.join(sample_out_path, 'nuc_variants.csv'), 'wb') as nuc_variants_csv, \
            open(os.path.join(sample_scratch_path, 'coverage_summary.csv'), 'wb') as coverage_summary_csv:

        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   coverage_summary_csv=coverage_summary_csv)

    logger.info('Running coverage_plots (%d of %d).', position, total)
    coverage_path = os.path.join(sample_out_path, 'coverage')
    with open(os.path.join(sample_out_path, 'amino.csv'), 'rU') as amino_csv, \
            open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv,
                      coverage_scores_csv,
                      path_prefix=coverage_path)

    # The first V3LOOP row alone decides whether G2P scoring is run.
    with open(os.path.join(sample_out_path, 'coverage_scores.csv'),
              'rU') as coverage_scores_csv:
        reader = csv.DictReader(coverage_scores_csv)
        is_v3loop_good = False
        for row in reader:
            if row['region'] == 'V3LOOP':
                is_v3loop_good = row['on.score'] == '4'
                break

    if is_v3loop_good:
        logger.info('Running sam_g2p (%d of %d).', position, total)
        g2p_path = create_app_result(data_path,
                                     run_info,
                                     sample_info,
                                     description='Geno To Pheno results',
                                     suffix='_G2P')
        with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
                open(os.path.join(sample_out_path, 'nuc.csv'), 'rU') as nuc_csv, \
                open(os.path.join(g2p_path, 'g2p.csv'), 'wb') as g2p_csv, \
                open(os.path.join(g2p_path, 'g2p_summary.csv'), 'wb') as g2p_summary_csv:

            sam_g2p(pssm=pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv,
                    g2p_summary_csv=g2p_summary_csv,
                    min_count=DEFAULT_MIN_COUNT)