Ejemplo n.º 1
0
    def test_rmdup_bug(self):
        """Regression test: duplicate pairs with different soft clipping.

        Both pairs have a forward read at 6181919 on contig 20.  Their
        reverse reads start at 6181935 (5S96M) and 6181938 (8S93M), yet the
        test expects a single reverse key at 6182030, which equals
        ``pos + aligned bases - 1`` for each of them.
        """
        test_case_data = [
            "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t83\t20\t6181935\t60\t5S96M\t=\t6181919\t-112\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t?<?AADADBFBF<EHIGHGGGEAF3AF<CHGGDG9?GHFFACDHH)?@AHEHHIIIIE>A=A:?);B27@;@?>,;;C(5::>>>@5:()4>@@@######\tXC:i:96\tXT:A:U\tNM:i:1\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:1\tXO:i:0\tXG:i:0\tMD:Z:13G82\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:######@@@>4)(:5@>>>::5(C;;,>?@;@72B;)?:A=A>EIIIIHHEHA@?)HHDCAFFHG?9GDGGHC<FA3FAEGGGHGIHE<FBFBDADAA?<?",
            "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181935\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?",
            "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t83\t20\t6181938\t60\t8S93M\t=\t6181919\t-112\tAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACCCCTCTACTGTTACCAAATCCACTCAGATTCCCAAATCTTCCACTT\t@@@DDFDFHHHHHJJJEHGGHIHHAEGHJJIJJFGGHGIDIGIJJ?BBGGGIIIJJIJGFHGIJEC(=3?C;?B9?@C>CECECAA(;;3>C#########\tXC:i:93\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:10G36C45\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:#########C>3;;(AACECEC>C@?9B?;C?3=(CEJIGHFGJIJJIIIGGGBB?JJIGIDIGHGGFJJIJJHGEAHHIHGGHEJJJHHHHHFDFDD@@@",
            "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181938\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA",
        ]
        # Build a real list: the result is sliced below, which a lazy map()
        # object (Python 3) would not support.
        sams = [SAMMapping(line) for line in test_case_data]
        pair1 = sams[0:2]
        pair2 = sams[2:]
        self.link.process(pair1)

        self.assertEqual(2, len(self.ctx.emitted.keys()))
        key_list = sorted(self.ctx.emitted.keys())
        self.assertEqual("0020:000006181919:F", key_list[0])
        self.assertEqual("0020:000006182030:R", key_list[1])

        self.link.process(pair2)

        # The second pair must land on the same two keys as the first one.
        self.assertEqual(2, len(self.ctx.emitted.keys()))
        # Only the values are inspected; values() exists on Python 2 and 3.
        for value_list in self.ctx.emitted.values():
            # each key should have two pairs at its position
            self.assertEqual(2, len(value_list))

        # The forward key carries the serialized pairs, leftmost read first.
        value_list = self.ctx.emitted["0020:000006181919:F"]
        for value in value_list:
            unserialized = proto.unserialize_pair(value)
            self.assertTrue(unserialized[0].pos < unserialized[1].pos)
        # The reverse key only carries the PAIR_STRING marker.
        value_list = self.ctx.emitted["0020:000006182030:R"]
        for value in value_list:
            self.assertEqual(PAIR_STRING, value)
	def test_rmdup_bug(self):
		"""Regression test: duplicate pairs with different soft clipping.

		Both pairs have a forward read at 6181919 on contig 20.  Their
		reverse reads start at 6181935 (5S96M) and 6181938 (8S93M), yet the
		test expects a single reverse key at 6182030, which equals
		``pos + aligned bases - 1`` for each of them.
		"""
		test_case_data = [
			"HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t83\t20\t6181935\t60\t5S96M\t=\t6181919\t-112\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t?<?AADADBFBF<EHIGHGGGEAF3AF<CHGGDG9?GHFFACDHH)?@AHEHHIIIIE>A=A:?);B27@;@?>,;;C(5::>>>@5:()4>@@@######\tXC:i:96\tXT:A:U\tNM:i:1\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:1\tXO:i:0\tXG:i:0\tMD:Z:13G82\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:######@@@>4)(:5@>>>::5(C;;,>?@;@72B;)?:A=A>EIIIIHHEHA@?)HHDCAFFHG?9GDGGHC<FA3FAEGGGHGIHE<FBFBDADAA?<?",
			"HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181935\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?",
			"HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t83\t20\t6181938\t60\t8S93M\t=\t6181919\t-112\tAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACCCCTCTACTGTTACCAAATCCACTCAGATTCCCAAATCTTCCACTT\t@@@DDFDFHHHHHJJJEHGGHIHHAEGHJJIJJFGGHGIDIGIJJ?BBGGGIIIJJIJGFHGIJEC(=3?C;?B9?@C>CECECAA(;;3>C#########\tXC:i:93\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:10G36C45\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:#########C>3;;(AACECEC>C@?9B?;C?3=(CEJIGHFGJIJJIIIGGGBB?JJIGIDIGHGGFJJIJJHGEAHHIHGGHEJJJHHHHHFDFDD@@@",
			"HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181938\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA",
		]
		# Build a real list: the result is sliced below, which a lazy map()
		# object (Python 3) would not support.
		sams = [SAMMapping(line) for line in test_case_data]
		pair1 = sams[0:2]
		pair2 = sams[2:]
		self.link.process(pair1)

		self.assertEqual(2, len(self.ctx.emitted.keys()))
		key_list = sorted(self.ctx.emitted.keys())
		self.assertEqual("0020:000006181919:F", key_list[0])
		self.assertEqual("0020:000006182030:R", key_list[1])

		self.link.process(pair2)

		# The second pair must land on the same two keys as the first one.
		self.assertEqual(2, len(self.ctx.emitted.keys()))
		# Only the values are inspected; values() exists on Python 2 and 3.
		for value_list in self.ctx.emitted.values():
			# each key should have two pairs at its position
			self.assertEqual(2, len(value_list))

		# The forward key carries the serialized pairs, leftmost read first.
		value_list = self.ctx.emitted["0020:000006181919:F"]
		for value in value_list:
			unserialized = proto.unserialize_pair(value)
			self.assertTrue(unserialized[0].pos < unserialized[1].pos)
		# The reverse key only carries the PAIR_STRING marker.
		value_list = self.ctx.emitted["0020:000006182030:R"]
		for value in value_list:
			self.assertEqual(PAIR_STRING, value)
Ejemplo n.º 3
0
    def test_fw_rev_with_indels(self):
        """
        Here we have two duplicate pairs.  Read 1 in both pairs is positioned
        at the same location (15609040).  For both pairs, read 2 is mapped on
        the reverse strand.  In the first pair read 2 is aligned with a
        one-base insertion (37M1I63M), so its leftmost position (15609197) is
        shifted by one with respect to the other pair's read 2 (15609196,
        101M).  Yet, both alignments end at the same location (15609296), so
        both pairs are expected under the same two keys.
        """
        test_case_data = [
            "HWI-ST200R_251:7:2207:3236:93050#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609197\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJIIJJJJJJJIJJJJJJJJJFIJJJJJJJJJJJIJJIJGIGIJJJJJJJJJ>EHHEFFFFFEEDEDEDDCDDDACCA:CD",
            "HWI-ST200R_251:7:2207:3236:93050#CGATGT\t147\t7\t15609197\t60\t37M1I63M\t=\t15609040\t-257\tCTCCATTACAGCAGAGGAAAGAAACTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t################CAC@3DA:))&BDDDDDDB<&BDDDDDDDDHIJJIJJIJJJJJIIHHJJJJJJJJJJJJJJIJJJJJJJJJJHHHHHFFFFFBBC",
            "HWI-ST200R_251:1:1101:10006:13364#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609196\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJJJGIJJJJIJJJJJJJJJIHIIJJJJJJJJJJJJIIJGGGIIIJDHIGIHFGFEHEF>??>@CDEECC@CCC(>;A:>5",
            "HWI-ST200R_251:1:1101:10006:13364#CGATGT\t147\t7\t15609196\t60\t101M\t=\t15609040\t-257\tTACCATTTAAAGCAGAGGAAAAAAACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t############################BBDDDDB803DDDDDDDDHJJJIIJJJJJJJIHF?JJJJJJJJJIIJJJJJJJJJJJJJJHHHHHFFFFFB@C",
        ]

        # Build a real list: the result is sliced below, which a lazy map()
        # object (Python 3) would not support.
        sams = [SAMMapping(line) for line in test_case_data]
        pair1 = sams[0:2]
        pair2 = sams[2:]
        self.link.process(pair1)
        self.link.process(pair2)

        self.assertEqual(2, len(self.ctx.emitted.keys()))
        key_list = sorted(self.ctx.emitted.keys())
        self.assertEqual("0007:000015609040:F", key_list[0])
        self.assertEqual("0007:000015609296:R", key_list[1])

        # Only the values are inspected; values() exists on Python 2 and 3.
        for value_list in self.ctx.emitted.values():
            # each key should have two pairs at its position
            self.assertEqual(2, len(value_list))

        # The forward key carries the serialized pairs, leftmost read first.
        value_list = self.ctx.emitted["0007:000015609040:F"]
        for value in value_list:
            unserialized = proto.unserialize_pair(value)
            self.assertTrue(unserialized[0].pos < unserialized[1].pos)
        # The reverse key only carries the PAIR_STRING marker.
        value_list = self.ctx.emitted["0007:000015609296:R"]
        for value in value_list:
            self.assertEqual(PAIR_STRING, value)
	def test_fw_rev_with_indels(self):
		"""
		Here we have two duplicate pairs.  Read 1 in both pairs is positioned
		at the same location (15609040).  For both pairs, read 2 is mapped on
		the reverse strand.  In the first pair read 2 is aligned with a
		one-base insertion (37M1I63M), so its leftmost position (15609197) is
		shifted by one with respect to the other pair's read 2 (15609196,
		101M).  Yet, both alignments end at the same location (15609296), so
		both pairs are expected under the same two keys.
		"""
		test_case_data = [
			"HWI-ST200R_251:7:2207:3236:93050#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609197\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJIIJJJJJJJIJJJJJJJJJFIJJJJJJJJJJJIJJIJGIGIJJJJJJJJJ>EHHEFFFFFEEDEDEDDCDDDACCA:CD",
			"HWI-ST200R_251:7:2207:3236:93050#CGATGT\t147\t7\t15609197\t60\t37M1I63M\t=\t15609040\t-257\tCTCCATTACAGCAGAGGAAAGAAACTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t################CAC@3DA:))&BDDDDDDB<&BDDDDDDDDHIJJIJJIJJJJJIIHHJJJJJJJJJJJJJJIJJJJJJJJJJHHHHHFFFFFBBC",
			"HWI-ST200R_251:1:1101:10006:13364#CGATGT\t99\t7\t15609040\t60\t101M\t=\t15609196\t257\tCTAGCTTGTAACAATTGCTATAACTCCCCCACTTTGGATGGTAAATTTCTCCTCAGCTGTCATTGGCCCTCAAAGCCAAAATGACTCCAATTAGAATGTAT\tCCCFFFFFHHHHHJJJJJJJJJJJGIJJJJIJJJJJJJJJIHIIJJJJJJJJJJJJIIJGGGIIIJDHIGIHFGFEHEF>??>@CDEECC@CCC(>;A:>5",
			"HWI-ST200R_251:1:1101:10006:13364#CGATGT\t147\t7\t15609196\t60\t101M\t=\t15609040\t-257\tTACCATTTAAAGCAGAGGAAAAAAACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAGAAACTGGGTTGAAGAAGTAGTTCATTGAATGGTTGTCTTAC\t############################BBDDDDB803DDDDDDDDHJJJIIJJJJJJJIHF?JJJJJJJJJIIJJJJJJJJJJJJJJHHHHHFFFFFB@C",
		]

		# Build a real list: the result is sliced below, which a lazy map()
		# object (Python 3) would not support.
		sams = [SAMMapping(line) for line in test_case_data]
		pair1 = sams[0:2]
		pair2 = sams[2:]
		self.link.process(pair1)
		self.link.process(pair2)

		self.assertEqual(2, len(self.ctx.emitted.keys()))
		key_list = sorted(self.ctx.emitted.keys())
		self.assertEqual("0007:000015609040:F", key_list[0])
		self.assertEqual("0007:000015609296:R", key_list[1])

		# Only the values are inspected; values() exists on Python 2 and 3.
		for value_list in self.ctx.emitted.values():
			# each key should have two pairs at its position
			self.assertEqual(2, len(value_list))

		# The forward key carries the serialized pairs, leftmost read first.
		value_list = self.ctx.emitted["0007:000015609040:F"]
		for value in value_list:
			unserialized = proto.unserialize_pair(value)
			self.assertTrue(unserialized[0].pos < unserialized[1].pos)
		# The reverse key only carries the PAIR_STRING marker.
		value_list = self.ctx.emitted["0007:000015609296:R"]
		for value in value_list:
			self.assertEqual(PAIR_STRING, value)
Ejemplo n.º 5
0
    def reduce(self, ctx):
        """Process all the values received for one input key.

        A key whose first ``:``-separated field equals
        ``seqal_app.UNMAPPED_STRING`` is delegated to
        ``__process_unmapped_pairs``.  Any other key must have the form
        ``ref_id:pos:orient``; its values are sorted into full pairs and
        unpaired fragments, which are then handled by ``__process_pairs``
        and ``__process_fragments``.

        Args:
            ctx: reduce context providing ``getInputKey()``, ``nextValue()``
                and ``getInputValue()``.

        Raises:
            RuntimeError: if a mapped key does not have exactly three fields.
            ValueError: if the first read of a deserialized pair is None or
                unmapped.
        """
        # create the "workspace"
        self.__pairs = []
        self.__unpaired = []

        # gather input
        key_values = ctx.getInputKey().split(':')

        if key_values[0] == seqal_app.UNMAPPED_STRING:
            # pair of unmapped sequences
            self.__process_unmapped_pairs(ctx)
        else:
            if len(key_values) != 3:
                # Report the number of fields actually found.  `key` is not
                # assigned yet at this point, so it cannot be used here.
                raise RuntimeError(
                    "Unexpected key length %d.  Expected key format is ref_id:pos:orient"
                    % len(key_values))
            # convert key values and make it a tuple
            key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R'
                   )  # last value is True if reverse strand

            have_pairs = False  # keep track of whether we have at least one real pair.
            # load mappings
            while ctx.nextValue():
                value = ctx.getInputValue()
                if value == seqal_app.PAIR_STRING:
                    # Marker value: a pair exists at this position but it is
                    # not serialized under this key.
                    have_pairs = True
                else:
                    pair = protobuf_mapping.unserialize_pair(value)
                    if pair[0] is None or pair[0].is_unmapped():
                        # Sanity check. pair[0] should never be None or unmapped here.
                        raise ValueError(
                            "Error!  Got None or unmapped in first read for key %s.  pair: %s"
                            % (key, pair))

                    if pair[1] and pair[1].is_unmapped():
                        # Emit the unmapped mate right away and keep the
                        # mapped read as a lone fragment.
                        self.__output_sink.process((pair[1], None))
                        self.__unpaired.append((pair[0], None))
                    elif pair[1] is None:
                        self.__unpaired.append(pair)
                    else:
                        # Two mapped reads.
                        # pair[0] should never be unmapped.  That case should be handled by
                        # __process_unmapped_pairs.
                        self.__pairs.append(pair)
                        have_pairs = True

            self.__process_pairs()
            self.__process_fragments(have_pairs)

        # clean-up the workspace
        self.__pairs = None
        self.__unpaired = None
	def test_emit_reverse_fragment1(self):
		"""A lone reverse-strand read held in pair[1] is emitted as a fragment."""
		# Erase read 1: None in pair[0], the fragment in pair[1].
		self.pair1 = test_utils.erase_read1(list(self.pair1))
		fragment = self.pair1[1]
		fragment.set_on_reverse(True)
		self.link.process(self.pair1)

		emitted = self.ctx.emitted
		self.assertEqual(1, len(emitted.keys()))
		expected_key = test_utils.make_key(fragment)
		emitted_values = emitted[expected_key]
		self.assertEqual(1, len(emitted_values))

		# The fragment must come back in slot 0, with an empty slot 1.
		unserialized = proto.unserialize_pair(emitted_values[0])
		self.assertTrue(unserialized[1] is None)
		self.assertEqual(fragment.tid, unserialized[0].tid)
		self.assertEqual(fragment.pos, unserialized[0].pos)
		self.assertTrue(unserialized[0].is_on_reverse())
	def test_emit_forward_fragment2(self):
		"""A lone read held in pair[0] is emitted as a fragment and counted."""
		# Fragment in pair[0].  None in pair[1]
		self.pair1 = test_utils.erase_read2(list(self.pair1))
		self.link.process(self.pair1)
		self.assertEqual(1, len(self.ctx.emitted.keys()))
		expected_key = test_utils.make_key(self.pair1[0])
		self.assertEqual(1, len(self.ctx.emitted[expected_key]))
		unserialized = proto.unserialize_pair(self.ctx.emitted[expected_key][0])
		self.assertTrue(unserialized[1] is None)
		self.assertEqual(self.pair1[0].tid, unserialized[0].tid)
		self.assertEqual(self.pair1[0].pos, unserialized[0].pos)
		# `in` replaces the deprecated dict.has_key(), which Python 3 removed.
		self.assertTrue("Test:MAPPED COORDINATES" in self.ctx.counters)
		self.assertEqual(1, self.ctx.counters["Test:MAPPED COORDINATES"])
Ejemplo n.º 8
0
 def test_emit_reverse_fragment1(self):
     """A lone reverse-strand read held in pair[1] is emitted as a fragment."""
     # Erase read 1: None in pair[0], the fragment in pair[1].
     self.pair1 = test_utils.erase_read1(list(self.pair1))
     fragment = self.pair1[1]
     fragment.set_on_reverse(True)
     self.link.process(self.pair1)

     emitted = self.ctx.emitted
     self.assertEqual(1, len(emitted.keys()))
     expected_key = test_utils.make_key(fragment)
     emitted_values = emitted[expected_key]
     self.assertEqual(1, len(emitted_values))

     # The fragment must come back in slot 0, with an empty slot 1.
     unserialized = proto.unserialize_pair(emitted_values[0])
     self.assertTrue(unserialized[1] is None)
     self.assertEqual(fragment.tid, unserialized[0].tid)
     self.assertEqual(fragment.pos, unserialized[0].pos)
     self.assertTrue(unserialized[0].is_on_reverse())
Ejemplo n.º 9
0
 def test_emit_forward_fragment2(self):
     """A lone read held in pair[0] is emitted as a fragment and counted."""
     # Fragment in pair[0].  None in pair[1]
     self.pair1 = test_utils.erase_read2(list(self.pair1))
     self.link.process(self.pair1)
     self.assertEqual(1, len(self.ctx.emitted.keys()))
     expected_key = test_utils.make_key(self.pair1[0])
     self.assertEqual(1, len(self.ctx.emitted[expected_key]))
     unserialized = proto.unserialize_pair(
         self.ctx.emitted[expected_key][0])
     self.assertTrue(unserialized[1] is None)
     self.assertEqual(self.pair1[0].tid, unserialized[0].tid)
     self.assertEqual(self.pair1[0].pos, unserialized[0].pos)
     # `in` replaces the deprecated dict.has_key(), which Python 3 removed.
     self.assertTrue("Test:MAPPED COORDINATES" in self.ctx.counters)
     self.assertEqual(1, self.ctx.counters["Test:MAPPED COORDINATES"])
	def test_unmapped1(self):
		"""A pair whose read 1 is unmapped is emitted once, under read 2's key.

		Both reads must survive serialization, with the mapped read (the
		original read 2) in slot 0, and one unmapped read must be counted.
		"""
		self.pair1[0].set_mapped(False)
		self.pair1[1].set_mate_mapped(False)
		self.link.process(self.pair1)

		self.assertEqual(1, len(self.ctx.emitted.keys()))
		self.assertTrue( test_utils.make_key(self.pair1[1]) in self.ctx.emitted.keys() )
		# Materialize the values before indexing: on Python 3 dict.values()
		# returns a view, which does not support subscripting.
		emitted_values = list(self.ctx.emitted.values())
		self.assertEqual(1, len(emitted_values[0]))
		unserialized = proto.unserialize_pair(emitted_values[0][0])
		self.assertFalse(unserialized[0] is None)
		self.assertFalse(unserialized[1] is None)
		self.assertEqual(self.pair1[1].tid, unserialized[0].tid)
		self.assertEqual(self.pair1[1].pos, unserialized[0].pos)
		self.assertEqual(1, self.ctx.counters["Test:UNMAPPED READS"])
Ejemplo n.º 11
0
    def test_unmapped1(self):
        """A pair whose read 1 is unmapped is emitted once, under read 2's key.

        Both reads must survive serialization, with the mapped read (the
        original read 2) in slot 0, and one unmapped read must be counted.
        """
        self.pair1[0].set_mapped(False)
        self.pair1[1].set_mate_mapped(False)
        self.link.process(self.pair1)

        self.assertEqual(1, len(self.ctx.emitted.keys()))
        self.assertTrue(
            test_utils.make_key(self.pair1[1]) in self.ctx.emitted.keys())
        # Materialize the values before indexing: on Python 3 dict.values()
        # returns a view, which does not support subscripting.
        emitted_values = list(self.ctx.emitted.values())
        self.assertEqual(1, len(emitted_values[0]))
        unserialized = proto.unserialize_pair(emitted_values[0][0])
        self.assertFalse(unserialized[0] is None)
        self.assertFalse(unserialized[1] is None)
        self.assertEqual(self.pair1[1].tid, unserialized[0].tid)
        self.assertEqual(self.pair1[1].pos, unserialized[0].pos)
        self.assertEqual(1, self.ctx.counters["Test:UNMAPPED READS"])
Ejemplo n.º 12
0
	def reduce(self, ctx):
		"""Process all the values received for one input key.

		A key whose first ``:``-separated field equals
		``seqal_app.UNMAPPED_STRING`` is delegated to
		``__process_unmapped_pairs``.  Any other key must have the form
		``ref_id:pos:orient``; its values are sorted into full pairs and
		unpaired fragments, which are then handled by ``__process_pairs``
		and ``__process_fragments``.

		Args:
			ctx: reduce context providing ``getInputKey()``, ``nextValue()``
				and ``getInputValue()``.

		Raises:
			RuntimeError: if a mapped key does not have exactly three fields.
			ValueError: if the first read of a deserialized pair is None or
				unmapped.
		"""
		# create the "workspace"
		self.__pairs = []
		self.__unpaired = []

		# gather input
		key_values = ctx.getInputKey().split(':')

		if key_values[0] == seqal_app.UNMAPPED_STRING:
			# pair of unmapped sequences
			self.__process_unmapped_pairs(ctx)
		else:
			if len(key_values) != 3:
				# Report the number of fields actually found.  `key` is not
				# assigned yet at this point, so it cannot be used here.
				raise RuntimeError("Unexpected key length %d.  Expected key format is ref_id:pos:orient" % len(key_values))
			# convert key values and make it a tuple
			key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R') # last value is True if reverse strand

			have_pairs = False # keep track of whether we have at least one real pair.
			# load mappings
			while ctx.nextValue():
				value = ctx.getInputValue()
				if value == seqal_app.PAIR_STRING:
					# Marker value: a pair exists at this position but it is
					# not serialized under this key.
					have_pairs = True
				else:
					pair = protobuf_mapping.unserialize_pair(value)
					if pair[0] is None or pair[0].is_unmapped():
						# Sanity check. pair[0] should never be None or unmapped here.
						raise ValueError("Error!  Got None or unmapped in first read for key %s.  pair: %s" % (key, pair))

					if pair[1] and pair[1].is_unmapped():
						# Emit the unmapped mate right away and keep the
						# mapped read as a lone fragment.
						self.__output_sink.process( (pair[1], None) )
						self.__unpaired.append( (pair[0], None) )
					elif pair[1] is None:
						self.__unpaired.append(pair)
					else:
						# Two mapped reads.
						# pair[0] should never be unmapped.  That case should be handled by
						# __process_unmapped_pairs.
						self.__pairs.append(pair)
						have_pairs = True

			self.__process_pairs()
			self.__process_fragments(have_pairs)

		# clean-up the workspace
		self.__pairs = None
		self.__unpaired = None
	def test_emit_forward_pair(self):
		"""A forward pair is emitted under read 1's key; read 2's key gets PAIR_STRING."""
		# We expect to get the pair emitted with the key generated from
		# read 1 (the left read).  On the other hand, we expect to get
		# PAIR_STRING with the key generated from read 2
		self.link.process(self.pair1)
		# A list (not a lazy map object) is needed: the keys are indexed below.
		expected_keys = [test_utils.make_key(read) for read in self.pair1]
		for i in (0, 1):
			# `in` replaces the deprecated dict.has_key().
			self.assertTrue(expected_keys[i] in self.ctx.emitted)
			self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

		unserialized = proto.unserialize_pair(self.ctx.emitted[expected_keys[0]][0])
		for j in (0, 1):
			self.assertEqual(self.pair1[j].tid, unserialized[j].tid)
			self.assertEqual(self.pair1[j].pos, unserialized[j].pos)

		second_value = self.ctx.emitted[expected_keys[1]][0]
		self.assertEqual(PAIR_STRING, second_value)
Ejemplo n.º 14
0
    def test_emit_forward_pair(self):
        """A forward pair is emitted under read 1's key; read 2's key gets PAIR_STRING."""
        # We expect to get the pair emitted with the key generated from
        # read 1 (the left read).  On the other hand, we expect to get
        # PAIR_STRING with the key generated from read 2
        self.link.process(self.pair1)
        # A list (not a lazy map object) is needed: the keys are indexed below.
        expected_keys = [test_utils.make_key(read) for read in self.pair1]
        for i in (0, 1):
            # `in` replaces the deprecated dict.has_key().
            self.assertTrue(expected_keys[i] in self.ctx.emitted)
            self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

        unserialized = proto.unserialize_pair(
            self.ctx.emitted[expected_keys[0]][0])
        for j in (0, 1):
            self.assertEqual(self.pair1[j].tid, unserialized[j].tid)
            self.assertEqual(self.pair1[j].pos, unserialized[j].pos)

        second_value = self.ctx.emitted[expected_keys[1]][0]
        self.assertEqual(PAIR_STRING, second_value)
	def test_emit_backward_pair(self):
		"""A backward pair is emitted reordered under read 2's key; read 1's key gets PAIR_STRING."""
		# Similar to test_emit_forward_pair, but here we expect to have the reads
		# reordered.  So,
		#   key2 => reversed and serialized pair
		#   key1 => PAIR_STRING
		self.link.process(self.pair2)
		# A list (not a lazy map object) is needed: the keys are indexed below.
		expected_keys = [test_utils.make_key(read) for read in self.pair2]
		for i in (0, 1):
			# `in` replaces the deprecated dict.has_key().
			self.assertTrue(expected_keys[i] in self.ctx.emitted)
			self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

		unserialized = proto.unserialize_pair(self.ctx.emitted[expected_keys[1]][0])
		for j in (0, 1):
			# j ^ 1 flips 0 <-> 1: the reads are expected in swapped order.
			self.assertEqual(self.pair2[j].tid, unserialized[j ^ 1].tid)
			self.assertEqual(self.pair2[j].pos, unserialized[j ^ 1].pos)

		second_value = self.ctx.emitted[expected_keys[0]][0]
		self.assertEqual(PAIR_STRING, second_value)
Ejemplo n.º 16
0
    def test_emit_backward_pair(self):
        """A backward pair is emitted reordered under read 2's key; read 1's key gets PAIR_STRING."""
        # Similar to test_emit_forward_pair, but here we expect to have the reads
        # reordered.  So,
        #   key2 => reversed and serialized pair
        #   key1 => PAIR_STRING
        self.link.process(self.pair2)
        # A list (not a lazy map object) is needed: the keys are indexed below.
        expected_keys = [test_utils.make_key(read) for read in self.pair2]
        for i in (0, 1):
            # `in` replaces the deprecated dict.has_key().
            self.assertTrue(expected_keys[i] in self.ctx.emitted)
            self.assertEqual(1, len(self.ctx.emitted[expected_keys[i]]))

        unserialized = proto.unserialize_pair(
            self.ctx.emitted[expected_keys[1]][0])
        for j in (0, 1):
            # j ^ 1 flips 0 <-> 1: the reads are expected in swapped order.
            self.assertEqual(self.pair2[j].tid, unserialized[j ^ 1].tid)
            self.assertEqual(self.pair2[j].pos, unserialized[j ^ 1].pos)

        second_value = self.ctx.emitted[expected_keys[0]][0]
        self.assertEqual(PAIR_STRING, second_value)
Ejemplo n.º 17
0
 def __pipe_pair_through(self, pair):
     """Round-trip `pair` through serialization and return the deserialized result."""
     serialized = io.serialize_pair(pair)
     round_tripped = io.unserialize_pair(serialized)
     return round_tripped
Ejemplo n.º 18
0
 def __process_unmapped_pairs(self, ctx):
     """Deserialize each remaining value of `ctx` and pass it to the output sink."""
     while ctx.nextValue():
         unmapped_pair = protobuf_mapping.unserialize_pair(
             ctx.getInputValue())
         self.__output_sink.process(unmapped_pair)
Ejemplo n.º 19
0
 def __pipe_pair_through(self, pair):
     """Serialize `pair`, deserialize the message again and return the result."""
     wire_message = io.serialize_pair(pair)
     restored_pair = io.unserialize_pair(wire_message)
     return restored_pair
Ejemplo n.º 20
0
	def __process_unmapped_pairs(self, ctx):
		"""Deserialize each remaining value of `ctx` and pass it to the output sink."""
		while ctx.nextValue():
			unmapped_pair = protobuf_mapping.unserialize_pair(ctx.getInputValue())
			self.__output_sink.process(unmapped_pair)