def test_rmdup_bug(self):
    """
    Two pairs that cover the same coordinates must be emitted under the
    same two keys.

    Both pairs have one mate at 6181919 (CIGAR 101M).  The other mates carry
    different soft clips (5S96M at 6181935 and 8S93M at 6181938), yet both
    are expected under the single key "0020:000006182030:R".
    """
    sam_lines = [
        "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t83\t20\t6181935\t60\t5S96M\t=\t6181919\t-112\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t?<?AADADBFBF<EHIGHGGGEAF3AF<CHGGDG9?GHFFACDHH)?@AHEHHIIIIE>A=A:?);B27@;@?>,;;C(5::>>>@5:()4>@@@######\tXC:i:96\tXT:A:U\tNM:i:1\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:1\tXO:i:0\tXG:i:0\tMD:Z:13G82\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:######@@@>4)(:5@>>>::5(C;;,>?@;@72B;)?:A=A>EIIIIHHEHA@?)HHDCAFFHG?9GDGGHC<FA3FAEGGGHGIHE<FBFBDADAA?<?",
        "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181935\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?",
        "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t83\t20\t6181938\t60\t8S93M\t=\t6181919\t-112\tAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACCCCTCTACTGTTACCAAATCCACTCAGATTCCCAAATCTTCCACTT\t@@@DDFDFHHHHHJJJEHGGHIHHAEGHJJIJJFGGHGIDIGIJJ?BBGGGIIIJJIJGFHGIJEC(=3?C;?B9?@C>CECECAA(;;3>C#########\tXC:i:93\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:10G36C45\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:#########C>3;;(AACECEC>C@?9B?;C?3=(CEJIGHFGJIJJIIIGGGBB?JJIGIDIGHGGFJJIJJHGEAHHIHGGHEJJJHHHHHFDFDD@@@",
        "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181938\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA",
    ]
    mappings = [SAMMapping(sam_line) for sam_line in sam_lines]
    first_pair = mappings[0:2]
    second_pair = mappings[2:]

    forward_key = "0020:000006181919:F"
    reverse_key = "0020:000006182030:R"

    # The first pair alone must already produce exactly the two keys.
    self.link.process(first_pair)
    self.assertEqual(2, len(self.ctx.emitted.keys()))
    sorted_keys = sorted(self.ctx.emitted.keys())
    self.assertEqual(forward_key, sorted_keys[0])
    self.assertEqual(reverse_key, sorted_keys[1])

    # The second pair must not introduce any new key ...
    self.link.process(second_pair)
    self.assertEqual(2, len(self.ctx.emitted.keys()))
    # ... and each key should have two pairs at its position
    for _key, emitted_values in self.ctx.emitted.iteritems():
        self.assertEqual(2, len(emitted_values))

    # The forward key carries the serialized pairs, left-most read first.
    for serialized in self.ctx.emitted[forward_key]:
        reads = proto.unserialize_pair(serialized)
        self.assertTrue(reads[0].pos < reads[1].pos)

    # The reverse key only carries the PAIR_STRING marker.
    for marker in self.ctx.emitted[reverse_key]:
        self.assertEqual(PAIR_STRING, marker)
def test_rmdup_bug(self):
    """
    Two pairs that cover the same coordinates must be emitted under the
    same two keys.

    Both pairs have one mate at 6181919 (CIGAR 101M).  The other mates carry
    different soft clips (5S96M at 6181935 and 8S93M at 6181938), yet both
    are expected under the single key "0020:000006182030:R".
    """
    sam_lines = [
        "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t83\t20\t6181935\t60\t5S96M\t=\t6181919\t-112\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t?<?AADADBFBF<EHIGHGGGEAF3AF<CHGGDG9?GHFFACDHH)?@AHEHHIIIIE>A=A:?);B27@;@?>,;;C(5::>>>@5:()4>@@@######\tXC:i:96\tXT:A:U\tNM:i:1\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:1\tXO:i:0\tXG:i:0\tMD:Z:13G82\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:######@@@>4)(:5@>>>::5(C;;,>?@;@72B;)?:A=A>EIIIIHHEHA@?)HHDCAFFHG?9GDGGHC<FA3FAEGGGHGIHE<FBFBDADAA?<?",
        "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181935\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?",
        "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t83\t20\t6181938\t60\t8S93M\t=\t6181919\t-112\tAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACCCCTCTACTGTTACCAAATCCACTCAGATTCCCAAATCTTCCACTT\t@@@DDFDFHHHHHJJJEHGGHIHHAEGHJJIJJFGGHGIDIGIJJ?BBGGGIIIJJIJGFHGIJEC(=3?C;?B9?@C>CECECAA(;;3>C#########\tXC:i:93\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:10G36C45\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:#########C>3;;(AACECEC>C@?9B?;C?3=(CEJIGHFGJIJJIIIGGGBB?JJIGIDIGHGGFJJIJJHGEAHHIHGGHEJJJHHHHHFDFDD@@@",
        "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181938\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA",
    ]
    mappings = [SAMMapping(sam_line) for sam_line in sam_lines]
    first_pair = mappings[0:2]
    second_pair = mappings[2:]

    forward_key = "0020:000006181919:F"
    reverse_key = "0020:000006182030:R"

    # The first pair alone must already produce exactly the two keys.
    self.link.process(first_pair)
    self.assertEqual(2, len(self.ctx.emitted.keys()))
    sorted_keys = sorted(self.ctx.emitted.keys())
    self.assertEqual(forward_key, sorted_keys[0])
    self.assertEqual(reverse_key, sorted_keys[1])

    # The second pair must not introduce any new key ...
    self.link.process(second_pair)
    self.assertEqual(2, len(self.ctx.emitted.keys()))
    # ... and each key should have two pairs at its position
    for _key, emitted_values in self.ctx.emitted.iteritems():
        self.assertEqual(2, len(emitted_values))

    # The forward key carries the serialized pairs, left-most read first.
    for serialized in self.ctx.emitted[forward_key]:
        reads = proto.unserialize_pair(serialized)
        self.assertTrue(reads[0].pos < reads[1].pos)

    # The reverse key only carries the PAIR_STRING marker.
    for marker in self.ctx.emitted[reverse_key]:
        self.assertEqual(PAIR_STRING, marker)
def test_fw_rev_with_indels(self):
    """
    Two duplicate pairs whose second reads differ by an insertion.

    Read 1 of both pairs sits at the same position (15609040, 101M).
    One read 2 is aligned with an insertion (37M1I63M at 15609197), the
    other without (101M at 15609196), so their reported positions differ
    by one.  Both are nevertheless expected under the same key,
    "0007:000015609296:R".
    """
    sam_lines = [
        "HWI-ST200R_251:7:2207:3236:93050#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609197\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJIIJJJJJJJIJJJJJJJJJFIJJJJJJJJJJJIJJIJGIGIJJJJJJJJJ>EHHEFFFFFEEDEDEDDCDDDACCA:CD",
        "HWI-ST200R_251:7:2207:3236:93050#CGATGT\t147\t7\t15609197\t60\t37M1I63M\t=\t15609040\t-257\tCTCCATTACAGCAGAGGAAAGAAACTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t################CAC@3DA:))&BDDDDDDB<&BDDDDDDDDHIJJIJJIJJJJJIIHHJJJJJJJJJJJJJJIJJJJJJJJJJHHHHHFFFFFBBC",
        "HWI-ST200R_251:1:1101:10006:13364#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609196\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJJJGIJJJJIJJJJJJJJJIHIIJJJJJJJJJJJJIIJGGGIIIJDHIGIHFGFEHEF>??>@CDEECC@CCC(>;A:>5",
        "HWI-ST200R_251:1:1101:10006:13364#CGATGT\t147\t7\t15609196\t60\t101M\t=\t15609040\t-257\tTACCATTTAAAGCAGAGGAAAAAAACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t############################BBDDDDB803DDDDDDDDHJJJIIJJJJJJJIHF?JJJJJJJJJIIJJJJJJJJJJJJJJHHHHHFFFFFB@C",
    ]
    mappings = [SAMMapping(sam_line) for sam_line in sam_lines]
    first_pair = mappings[0:2]
    second_pair = mappings[2:]

    forward_key = "0007:000015609040:F"
    reverse_key = "0007:000015609296:R"

    self.link.process(first_pair)
    self.link.process(second_pair)

    # Both pairs must collapse onto the same two keys.
    self.assertEqual(2, len(self.ctx.emitted.keys()))
    sorted_keys = sorted(self.ctx.emitted.keys())
    self.assertEqual(forward_key, sorted_keys[0])
    self.assertEqual(reverse_key, sorted_keys[1])

    # each key should have two pairs at its position
    for _key, emitted_values in self.ctx.emitted.iteritems():
        self.assertEqual(2, len(emitted_values))

    # The forward key carries the serialized pairs, left-most read first.
    for serialized in self.ctx.emitted[forward_key]:
        reads = proto.unserialize_pair(serialized)
        self.assertTrue(reads[0].pos < reads[1].pos)

    # The reverse key only carries the PAIR_STRING marker.
    for marker in self.ctx.emitted[reverse_key]:
        self.assertEqual(PAIR_STRING, marker)
def test_fw_rev_with_indels(self):
    """
    Two duplicate pairs whose second reads differ by an insertion.

    Read 1 of both pairs sits at the same position (15609040, 101M).
    One read 2 is aligned with an insertion (37M1I63M at 15609197), the
    other without (101M at 15609196), so their reported positions differ
    by one.  Both are nevertheless expected under the same key,
    "0007:000015609296:R".
    """
    sam_lines = [
        "HWI-ST200R_251:7:2207:3236:93050#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609197\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJIIJJJJJJJIJJJJJJJJJFIJJJJJJJJJJJIJJIJGIGIJJJJJJJJJ>EHHEFFFFFEEDEDEDDCDDDACCA:CD",
        "HWI-ST200R_251:7:2207:3236:93050#CGATGT\t147\t7\t15609197\t60\t37M1I63M\t=\t15609040\t-257\tCTCCATTACAGCAGAGGAAAGAAACTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t################CAC@3DA:))&BDDDDDDB<&BDDDDDDDDHIJJIJJIJJJJJIIHHJJJJJJJJJJJJJJIJJJJJJJJJJHHHHHFFFFFBBC",
        "HWI-ST200R_251:1:1101:10006:13364#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609196\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJJJGIJJJJIJJJJJJJJJIHIIJJJJJJJJJJJJIIJGGGIIIJDHIGIHFGFEHEF>??>@CDEECC@CCC(>;A:>5",
        "HWI-ST200R_251:1:1101:10006:13364#CGATGT\t147\t7\t15609196\t60\t101M\t=\t15609040\t-257\tTACCATTTAAAGCAGAGGAAAAAAACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t############################BBDDDDB803DDDDDDDDHJJJIIJJJJJJJIHF?JJJJJJJJJIIJJJJJJJJJJJJJJHHHHHFFFFFB@C",
    ]
    mappings = [SAMMapping(sam_line) for sam_line in sam_lines]
    first_pair = mappings[0:2]
    second_pair = mappings[2:]

    forward_key = "0007:000015609040:F"
    reverse_key = "0007:000015609296:R"

    self.link.process(first_pair)
    self.link.process(second_pair)

    # Both pairs must collapse onto the same two keys.
    self.assertEqual(2, len(self.ctx.emitted.keys()))
    sorted_keys = sorted(self.ctx.emitted.keys())
    self.assertEqual(forward_key, sorted_keys[0])
    self.assertEqual(reverse_key, sorted_keys[1])

    # each key should have two pairs at its position
    for _key, emitted_values in self.ctx.emitted.iteritems():
        self.assertEqual(2, len(emitted_values))

    # The forward key carries the serialized pairs, left-most read first.
    for serialized in self.ctx.emitted[forward_key]:
        reads = proto.unserialize_pair(serialized)
        self.assertTrue(reads[0].pos < reads[1].pos)

    # The reverse key only carries the PAIR_STRING marker.
    for marker in self.ctx.emitted[reverse_key]:
        self.assertEqual(PAIR_STRING, marker)
def reduce(self, ctx):
    """
    Process all the values received for one input key.

    The key is read from ``ctx.getInputKey()`` and split on ':'.  If its
    first field equals ``seqal_app.UNMAPPED_STRING`` the values are handed
    to ``__process_unmapped_pairs``.  Otherwise the key must have the form
    ``ref_id:pos:orient``; the values are sorted into the pair and
    unpaired workspaces and then passed through ``__process_pairs`` and
    ``__process_fragments``.

    Raises:
        RuntimeError: the key is not the unmapped marker and does not have
            exactly three ':'-separated fields.
        ValueError: a value unserializes to a pair whose first read is
            None or unmapped.
    """
    # create the "workspace"
    self.__pairs = []
    self.__unpaired = []

    # gather input
    key_values = ctx.getInputKey().split(':')

    if key_values[0] == seqal_app.UNMAPPED_STRING:
        # pair of unmapped sequences
        self.__process_unmapped_pairs(ctx)
    else:
        if len(key_values) != 3:
            # Report the number of fields actually found.  `key` is not
            # assigned yet at this point, so it cannot be used here.
            raise RuntimeError(
                "Unexpected key length %d. Expected key format is ref_id:pos:orient"
                % len(key_values))
        # convert key values and make it a tuple
        key = (
            int(key_values[0]),
            int(key_values[1]),
            key_values[2] == 'R',  # last value is True if reverse strand
        )

        have_pairs = False  # keep track of whether we have at least one real pair.
        # load mappings
        while ctx.nextValue():
            value = ctx.getInputValue()
            if value == seqal_app.PAIR_STRING:
                # marker value: a pair is associated with this key
                have_pairs = True
            else:
                pair = protobuf_mapping.unserialize_pair(value)
                if pair[0] is None or pair[0].is_unmapped():
                    # Sanity check. pair[0] should never be None or unmapped here.
                    raise ValueError(
                        "Error! Got None or unmapped in first read for key %s. pair: %s"
                        % (key, pair))

                if pair[1] and pair[1].is_unmapped():
                    # The unmapped mate goes straight to the output sink;
                    # the mapped read is handled as an unpaired fragment.
                    self.__output_sink.process((pair[1], None))
                    self.__unpaired.append((pair[0], None))
                elif pair[1] is None:
                    self.__unpaired.append(pair)
                else:
                    # Two mapped reads.
                    # pair[0] should never be unmapped. That case should be handled by
                    # __process_unmapped_pairs.
                    self.__pairs.append(pair)
                    have_pairs = True

        self.__process_pairs()
        self.__process_fragments(have_pairs)

    # clean-up the workspace
    self.__pairs = None
    self.__unpaired = None
def test_emit_reverse_fragment1(self):
    """
    A lone reverse-strand read in the second slot of the pair must be
    emitted as a single fragment under its own key.
    """
    # None in pair[0]. Fragment in pair[1].
    self.pair1 = test_utils.erase_read1(list(self.pair1))
    self.pair1[1].set_on_reverse(True)
    self.link.process(self.pair1)

    fragment = self.pair1[1]
    self.assertEqual(1, len(self.ctx.emitted.keys()))
    expected_key = test_utils.make_key(fragment)
    self.assertEqual(1, len(self.ctx.emitted[expected_key]))

    emitted_pair = proto.unserialize_pair(self.ctx.emitted[expected_key][0])
    # The fragment is emitted in the first slot; the second slot is empty.
    self.assertTrue(emitted_pair[1] is None)
    self.assertEqual(fragment.tid, emitted_pair[0].tid)
    self.assertEqual(fragment.pos, emitted_pair[0].pos)
    self.assertTrue(emitted_pair[0].is_on_reverse())
def test_emit_forward_fragment2(self):
    """
    A lone read in the first slot of the pair must be emitted as a single
    fragment under its own key, and the "Test:MAPPED COORDINATES" counter
    must read 1.
    """
    # Fragment in pair[0]. None in pair[1]
    self.pair1 = test_utils.erase_read2(list(self.pair1))
    self.link.process(self.pair1)
    self.assertEqual(1, len(self.ctx.emitted.keys()))
    expected_key = test_utils.make_key(self.pair1[0])
    self.assertEqual(1, len(self.ctx.emitted[expected_key]))
    unserialized = proto.unserialize_pair(self.ctx.emitted[expected_key][0])
    self.assertTrue(unserialized[1] is None)
    self.assertEqual(self.pair1[0].tid, unserialized[0].tid)
    self.assertEqual(self.pair1[0].pos, unserialized[0].pos)
    # Membership via `in` rather than the deprecated has_key().
    self.assertTrue("Test:MAPPED COORDINATES" in self.ctx.counters)
    self.assertEqual(1, self.ctx.counters["Test:MAPPED COORDINATES"])
def test_emit_reverse_fragment1(self):
    """
    A lone reverse-strand read in the second slot of the pair must be
    emitted as a single fragment under its own key.
    """
    # None in pair[0]. Fragment in pair[1].
    self.pair1 = test_utils.erase_read1(list(self.pair1))
    self.pair1[1].set_on_reverse(True)
    self.link.process(self.pair1)

    fragment = self.pair1[1]
    self.assertEqual(1, len(self.ctx.emitted.keys()))
    expected_key = test_utils.make_key(fragment)
    self.assertEqual(1, len(self.ctx.emitted[expected_key]))

    emitted_pair = proto.unserialize_pair(self.ctx.emitted[expected_key][0])
    # The fragment is emitted in the first slot; the second slot is empty.
    self.assertTrue(emitted_pair[1] is None)
    self.assertEqual(fragment.tid, emitted_pair[0].tid)
    self.assertEqual(fragment.pos, emitted_pair[0].pos)
    self.assertTrue(emitted_pair[0].is_on_reverse())
def test_emit_forward_fragment2(self):
    """
    A lone read in the first slot of the pair must be emitted as a single
    fragment under its own key, and the "Test:MAPPED COORDINATES" counter
    must read 1.
    """
    # Fragment in pair[0]. None in pair[1]
    self.pair1 = test_utils.erase_read2(list(self.pair1))
    self.link.process(self.pair1)
    self.assertEqual(1, len(self.ctx.emitted.keys()))
    expected_key = test_utils.make_key(self.pair1[0])
    self.assertEqual(1, len(self.ctx.emitted[expected_key]))
    unserialized = proto.unserialize_pair(self.ctx.emitted[expected_key][0])
    self.assertTrue(unserialized[1] is None)
    self.assertEqual(self.pair1[0].tid, unserialized[0].tid)
    self.assertEqual(self.pair1[0].pos, unserialized[0].pos)
    # Membership via `in` rather than the deprecated has_key().
    self.assertTrue("Test:MAPPED COORDINATES" in self.ctx.counters)
    self.assertEqual(1, self.ctx.counters["Test:MAPPED COORDINATES"])
def test_unmapped1(self):
    """
    With read 1 flagged as unmapped, the pair must be emitted once, under
    the key of read 2, with read 2 in the first slot, and the
    "Test:UNMAPPED READS" counter must read 1.
    """
    self.pair1[0].set_mapped(False)
    self.pair1[1].set_mate_mapped(False)
    self.link.process(self.pair1)

    mapped_read = self.pair1[1]
    # Exactly one key, and it is the one built from the mapped read.
    self.assertEqual(1, len(self.ctx.emitted.keys()))
    self.assertTrue(
        test_utils.make_key(mapped_read) in self.ctx.emitted.keys())

    values_for_key = self.ctx.emitted.values()[0]
    self.assertEqual(1, len(values_for_key))

    emitted_pair = proto.unserialize_pair(self.ctx.emitted.values()[0][0])
    # Both reads are still present in the emitted pair.
    self.assertFalse(emitted_pair[0] is None)
    self.assertFalse(emitted_pair[1] is None)
    self.assertEqual(mapped_read.tid, emitted_pair[0].tid)
    self.assertEqual(mapped_read.pos, emitted_pair[0].pos)
    self.assertEqual(1, self.ctx.counters["Test:UNMAPPED READS"])
def test_unmapped1(self):
    """
    With read 1 flagged as unmapped, the pair must be emitted once, under
    the key of read 2, with read 2 in the first slot, and the
    "Test:UNMAPPED READS" counter must read 1.
    """
    self.pair1[0].set_mapped(False)
    self.pair1[1].set_mate_mapped(False)
    self.link.process(self.pair1)

    mapped_read = self.pair1[1]
    # Exactly one key, and it is the one built from the mapped read.
    self.assertEqual(1, len(self.ctx.emitted.keys()))
    self.assertTrue(
        test_utils.make_key(mapped_read) in self.ctx.emitted.keys())

    values_for_key = self.ctx.emitted.values()[0]
    self.assertEqual(1, len(values_for_key))

    emitted_pair = proto.unserialize_pair(self.ctx.emitted.values()[0][0])
    # Both reads are still present in the emitted pair.
    self.assertFalse(emitted_pair[0] is None)
    self.assertFalse(emitted_pair[1] is None)
    self.assertEqual(mapped_read.tid, emitted_pair[0].tid)
    self.assertEqual(mapped_read.pos, emitted_pair[0].pos)
    self.assertEqual(1, self.ctx.counters["Test:UNMAPPED READS"])
def reduce(self, ctx):
    """
    Process all the values received for one input key.

    The key is read from ``ctx.getInputKey()`` and split on ':'.  If its
    first field equals ``seqal_app.UNMAPPED_STRING`` the values are handed
    to ``__process_unmapped_pairs``.  Otherwise the key must have the form
    ``ref_id:pos:orient``; the values are sorted into the pair and
    unpaired workspaces and then passed through ``__process_pairs`` and
    ``__process_fragments``.

    Raises:
        RuntimeError: the key is not the unmapped marker and does not have
            exactly three ':'-separated fields.
        ValueError: a value unserializes to a pair whose first read is
            None or unmapped.
    """
    # create the "workspace"
    self.__pairs = []
    self.__unpaired = []

    # gather input
    key_values = ctx.getInputKey().split(':')

    if key_values[0] == seqal_app.UNMAPPED_STRING:
        # pair of unmapped sequences
        self.__process_unmapped_pairs(ctx)
    else:
        if len(key_values) != 3:
            # Report the number of fields actually found.  `key` is not
            # assigned yet at this point, so it cannot be used here.
            raise RuntimeError(
                "Unexpected key length %d. Expected key format is ref_id:pos:orient"
                % len(key_values))
        # convert key values and make it a tuple
        key = (
            int(key_values[0]),
            int(key_values[1]),
            key_values[2] == 'R',  # last value is True if reverse strand
        )

        have_pairs = False  # keep track of whether we have at least one real pair.
        # load mappings
        while ctx.nextValue():
            value = ctx.getInputValue()
            if value == seqal_app.PAIR_STRING:
                # marker value: a pair is associated with this key
                have_pairs = True
            else:
                pair = protobuf_mapping.unserialize_pair(value)
                if pair[0] is None or pair[0].is_unmapped():
                    # Sanity check. pair[0] should never be None or unmapped here.
                    raise ValueError(
                        "Error! Got None or unmapped in first read for key %s. pair: %s"
                        % (key, pair))

                if pair[1] and pair[1].is_unmapped():
                    # The unmapped mate goes straight to the output sink;
                    # the mapped read is handled as an unpaired fragment.
                    self.__output_sink.process((pair[1], None))
                    self.__unpaired.append((pair[0], None))
                elif pair[1] is None:
                    self.__unpaired.append(pair)
                else:
                    # Two mapped reads.
                    # pair[0] should never be unmapped. That case should be handled by
                    # __process_unmapped_pairs.
                    self.__pairs.append(pair)
                    have_pairs = True

        self.__process_pairs()
        self.__process_fragments(have_pairs)

    # clean-up the workspace
    self.__pairs = None
    self.__unpaired = None
def test_emit_forward_pair(self):
    """
    A pair whose reads are already in left-to-right order must be emitted
    serialized under read 1's key, with PAIR_STRING under read 2's key.
    """
    # We expect to get the pair emitted with the key generated from
    # read 1 (the left read). On the other hand, we expect to get
    # PAIR_STRING with the key generated from read 2
    self.link.process(self.pair1)
    # A list (not a bare map() result) because the keys are indexed below.
    expected_keys = [test_utils.make_key(read) for read in self.pair1]
    for i in 0, 1:
        # Membership via `in` rather than the deprecated has_key().
        self.assertTrue(expected_keys[i] in self.ctx.emitted)
        self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

    unserialized = proto.unserialize_pair(
        self.ctx.emitted[expected_keys[0]][0])
    for j in 0, 1:
        self.assertEqual(self.pair1[j].tid, unserialized[j].tid)
        self.assertEqual(self.pair1[j].pos, unserialized[j].pos)

    second_value = self.ctx.emitted[expected_keys[1]][0]
    self.assertEqual(PAIR_STRING, second_value)
def test_emit_forward_pair(self):
    """
    A pair whose reads are already in left-to-right order must be emitted
    serialized under read 1's key, with PAIR_STRING under read 2's key.
    """
    # We expect to get the pair emitted with the key generated from
    # read 1 (the left read). On the other hand, we expect to get
    # PAIR_STRING with the key generated from read 2
    self.link.process(self.pair1)
    # A list (not a bare map() result) because the keys are indexed below.
    expected_keys = [test_utils.make_key(read) for read in self.pair1]
    for i in 0, 1:
        # Membership via `in` rather than the deprecated has_key().
        self.assertTrue(expected_keys[i] in self.ctx.emitted)
        self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

    unserialized = proto.unserialize_pair(
        self.ctx.emitted[expected_keys[0]][0])
    for j in 0, 1:
        self.assertEqual(self.pair1[j].tid, unserialized[j].tid)
        self.assertEqual(self.pair1[j].pos, unserialized[j].pos)

    second_value = self.ctx.emitted[expected_keys[1]][0]
    self.assertEqual(PAIR_STRING, second_value)
def test_emit_backward_pair(self):
    """
    A pair whose reads arrive in right-to-left order must be emitted with
    the reads swapped under read 2's key, with PAIR_STRING under read 1's
    key.
    """
    # Similar to test_emit_forward_pair, but here we expect to have the reads
    # reordered. So,
    #   key2 => reversed and serialized pair
    #   key1 => PAIR_STRING
    self.link.process(self.pair2)
    # A list (not a bare map() result) because the keys are indexed below.
    expected_keys = [test_utils.make_key(read) for read in self.pair2]
    for i in 0, 1:
        # Membership via `in` rather than the deprecated has_key().
        self.assertTrue(expected_keys[i] in self.ctx.emitted)
        self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

    unserialized = proto.unserialize_pair(
        self.ctx.emitted[expected_keys[1]][0])
    for j in 0, 1:
        # j ^ 1 flips 0 <-> 1: input read j is expected in the other slot.
        self.assertEqual(self.pair2[j].tid, unserialized[j ^ 1].tid)
        self.assertEqual(self.pair2[j].pos, unserialized[j ^ 1].pos)

    second_value = self.ctx.emitted[expected_keys[0]][0]
    self.assertEqual(PAIR_STRING, second_value)
def test_emit_backward_pair(self):
    """
    A pair whose reads arrive in right-to-left order must be emitted with
    the reads swapped under read 2's key, with PAIR_STRING under read 1's
    key.
    """
    # Similar to test_emit_forward_pair, but here we expect to have the reads
    # reordered. So,
    #   key2 => reversed and serialized pair
    #   key1 => PAIR_STRING
    self.link.process(self.pair2)
    # A list (not a bare map() result) because the keys are indexed below.
    expected_keys = [test_utils.make_key(read) for read in self.pair2]
    for i in 0, 1:
        # Membership via `in` rather than the deprecated has_key().
        self.assertTrue(expected_keys[i] in self.ctx.emitted)
        self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

    unserialized = proto.unserialize_pair(
        self.ctx.emitted[expected_keys[1]][0])
    for j in 0, 1:
        # j ^ 1 flips 0 <-> 1: input read j is expected in the other slot.
        self.assertEqual(self.pair2[j].tid, unserialized[j ^ 1].tid)
        self.assertEqual(self.pair2[j].pos, unserialized[j ^ 1].pos)

    second_value = self.ctx.emitted[expected_keys[0]][0]
    self.assertEqual(PAIR_STRING, second_value)
def __pipe_pair_through(self, pair):
    """Serialize ``pair`` and return the result of unserializing it again."""
    return io.unserialize_pair(io.serialize_pair(pair))
def __process_unmapped_pairs(self, ctx):
    """
    Drain the remaining values from ``ctx``, unserialize each one into a
    pair and pass it to the output sink.
    """
    while ctx.nextValue():
        self.__output_sink.process(
            protobuf_mapping.unserialize_pair(ctx.getInputValue()))