def test_io(self):
    """
    Read the shared BDIFF fixture record by record.

    Checks read mode, the header, the SNV/INDEL counts, every record in
    file order and that reading past the last record raises EOFError.
    """
    homo = ZygosityChange.HOMO_TO_HOMO.value
    # (index, is_snv, ref_id, permutation) - both permutations of a record
    # are expected to be equal and no beta indices are expected
    expected_records = (
        (1010, True, 0, ['C', 'A', 'T', 'G']),
        (1020, True, 1, ['G', 'A', 'T', 'C']),
        (1030, False, 0, ['AT', 'ATT', 'A']),
        (1040, True, 2, ['T', 'G', 'A', 'C']),
        (1050, False, 1, ['T', 'TT']),
        (1060, False, 2, ['GCG', 'G', 'GCGCG']),
        (1070, True, 3, ['G', 'C', 'A', 'T']),
    )

    reader = BdiffIO(self._bdiff_file)
    self.assertTrue(reader.is_read_mode)
    self.assertDictEqual(self._header, reader.header)
    self.assertEqual(4, reader.snv_count)
    self.assertEqual(3, reader.indel_count)

    for index, is_snv, ref_id, perm in expected_records:
        self.assertTupleEqual(
            (index, is_snv, ref_id, homo, perm, perm, ()),
            reader.read_record())

    # all records are consumed
    self.assertRaises(EOFError, reader.read_record)
def test_mutate_02(self):
    """
    Mutate with the 'input_02' VAC (EOF VAC case) and compare the results.

    Checks the mutator statistics first, then the produced SAM and the
    text form of the returned BDIFF against the desired resource files.
    """
    res = self.RESOURCE_PATH
    vac_path = res + 'input_02.vac'
    out_bam_path = res + 'output_02.bam'
    out_sam_path = res + 'output_02.sam'
    out_diff_path = res + 'output_02.diff.txt'

    Vac.text2vac(res + 'input_02.vac.txt', vac_path)
    bdiff_file = self._mut.mutate(
        vac_filename=vac_path,
        mut_bam_filename=out_bam_path,
        secret=self.SECRET,
        mut_p=0,
        rng=self._rnd)

    expected_stats = (
        (21, BamMutator.STAT_ALIGNMENT_COUNT),
        (13, BamMutator.STAT_COVERING_COUNT),
        (7, BamMutator.STAT_VAC_COUNT),
        (6, BamMutator.STAT_MUT_COUNT),
        (4, BamMutator.STAT_DIFF_COUNT),
        (6, BamMutator.STAT_ALIGNMENT_MUT_COUNT),
    )
    for expected, stat_id in expected_stats:
        self.assertEqual(expected, self._mut.stat(stat_id))

    # mutated alignments
    cmn.bam2sam(out_bam_path, out_sam_path)
    self.assertTrue(filecmp.cmp(res + 'desired_02.sam', out_sam_path))

    # produced diff
    BdiffIO.to_text_file(bdiff_file, out_diff_path)
    self.assertTrue(filecmp.cmp(res + 'desired_02.diff.txt', out_diff_path))
def _write_bdiff_record(file: BdiffIO, index, ref_id, perm, is_snv: bool = True):
    """
    Write one test record to a BDIFF opened for writing.

    :param file: BdiffIO to write to
    :param index: index of the record
    :param ref_id: reference id of the record
    :param perm: permutation; for SNV it is used for both permutations
    :param is_snv: write SNV record when True, INDEL record otherwise
    """
    if not is_snv:
        file._write_indel(index, ref_id, perm)
        return

    # SNV records are always written as HOMO_TO_HOMO without beta indices
    file.write_snv(index, ref_id, ZygosityChange.HOMO_TO_HOMO, perm, perm, ())
def setUpClass(cls):
    """
    Build the BDIFF fixture shared by the tests of this class.

    Seven records (4 SNVs, 3 INDELs) are written and the resulting file,
    created with the class header, is stored in cls._bdiff_file.
    """
    # (index, ref_id, permutation, is_snv)
    fixture_records = (
        (1010, 0, ['C', 'A', 'T', 'G'], True),
        (1020, 1, ['G', 'A', 'T', 'C'], True),
        (1030, 0, ['AT', 'ATT', 'A'], False),
        (1040, 2, ['T', 'G', 'A', 'C'], True),
        (1050, 1, ['T', 'TT'], False),
        (1060, 2, ['GCG', 'G', 'GCGCG'], False),
        (1070, 3, ['G', 'C', 'A', 'T'], True),
    )

    writer = BdiffIO()
    for index, ref_id, perm, is_snv in fixture_records:
        cls._write_bdiff_record(writer, index, ref_id, perm, is_snv)

    cls._bdiff_file = writer.file(cls._header)
def test_single_file(self):
    """
    Check index lookups on a BDIFF which contains a single record.

    The only record has index 1010, so it is both the first and the last
    one and every successful lookup must point at the data offset.
    """
    writer = BdiffIO()
    self._write_bdiff_record(writer, 1010, 0, ['C', 'A', 'T', 'G'])
    reader = writer.readcopy()

    self.assertFalse(reader.is_empty())
    self.assertEqual(1010, reader.first_index)
    self.assertEqual(1010, reader.last_index)

    # greater than or equal
    self.assertEqual(reader.data_offset, reader.tell_index_gte(1000))
    self.assertIsNone(reader.tell_index_gte(2000))

    # lower than or equal
    self.assertIsNone(reader.tell_index_lte(1000))
    self.assertEqual(reader.data_offset, reader.tell_index_lte(2000))
def test_seq_perm(self):
    """
    Check BdiffIO.seq_perm on a mixed-length and on a single-base mapping.
    """
    # (expected permutation, mapping passed to seq_perm)
    cases = (
        (['T', 'TT', 'A', 'AA'],
         {'T': 'A', 'AA': 'TT', 'TT': 'AA', 'A': 'T'}),
        (['C', 'A', 'T', 'G'],
         {'A': 'C', 'T': 'G', 'G': 'T', 'C': 'A'}),
    )
    for expected, mapping in cases:
        self.assertListEqual(expected, BdiffIO.seq_perm(mapping))
def _record2pos(self, record_id: int):
    """
    Find the file position reached after skipping the records which
    precede the record with record_id.

    Record with record_id must be present. Position of the fixture file
    is restored before returning.

    :param record_id: 1-based
    :return: position of the fixture file after reading record_id - 1 records
    """
    raw_file = self._bdiff_file
    origin = raw_file.tell()

    reader = BdiffIO(raw_file)
    for _ in range(record_id - 1):
        reader.read_record()
    position = raw_file.tell()

    raw_file.seek(origin)
    return position
def _index2pos(bdiff: BdiffIO, search_index: int):
    """
    Find the position at which the record with search_index starts.

    Record with search_index must be present. Records are read from the
    current position and the position is restored before returning.

    :param bdiff: BdiffIO in read mode
    :param search_index: index of the searched record
    :return: position reported by bdiff.tell() right before the searched record
    """
    origin = bdiff.tell()
    record_start = origin
    # first item of a record is its index
    while bdiff.read_record()[0] != search_index:
        record_start = bdiff.tell()

    bdiff.seek(origin)
    return record_start
def test_to_text_file(self):
    """
    Write six records, export them with BdiffIO.to_text_file and compare
    the output with the 'input.diff.txt' resource.
    """
    header = {
        "_from_index": 0,
        "_to_index": 19999,
        "mb_checksum": "f05027eaa1923a6aab76bf7ead4fe976",
        "secret": "ffffffffffffffffffffffffffffffff"
    }
    out_path = self.RESOURCE_PATH + 'output.diff.txt'

    writer = BdiffIO()
    self._write_bdiff_record(writer, 11011, 2, ['A', 'G', 'T', 'C'])
    # the only record with different permutations and beta indices
    writer.write_snv(11015, 3, ZygosityChange.HOMO_TO_HETERO,
                     ['C', 'A', 'T', 'G'], ['C', 'A', 'G', 'T'],
                     beta_indices=[2, 4, 6])
    self._write_bdiff_record(writer, 11020, 2, ['G', 'T', 'A', 'C'])
    self._write_bdiff_record(writer, 11027, 2, ['A', 'AGT', 'AG'], False)
    self._write_bdiff_record(writer, 11031, 2, ['G', 'A', 'T', 'C'])
    self._write_bdiff_record(writer, 11037, 0, ['GCG', 'G'], False)

    BdiffIO.to_text_file(writer.file(header), out_path)

    self.assertTrue(
        filecmp.cmp(self.RESOURCE_PATH + 'input.diff.txt', out_path))
def test_mask(self):
    """
    Mutate with the 'input' VAC (EOF BAM case) and compare the produced
    SAM and text BDIFF with the desired resource files.
    """
    res = self.RESOURCE_PATH
    vac_path = res + 'input.vac'
    out_bam_path = res + 'output.bam'
    out_sam_path = res + 'output.sam'
    out_diff_path = res + 'output.diff.txt'

    Vac.text2vac(res + 'input.vac.txt', vac_path)
    bdiff_file = self._mut.mutate(
        vac_filename=vac_path,
        mut_bam_filename=out_bam_path,
        secret=self.SECRET,
        mut_p=0,
        rng=self._rng)

    # both outputs are converted to text before the first comparison
    cmn.bam2sam(out_bam_path, out_sam_path)
    BdiffIO.to_text_file(bdiff_file, out_diff_path)

    self.assertTrue(filecmp.cmp(res + 'desired.sam', out_sam_path))
    self.assertTrue(filecmp.cmp(res + 'desired.diff.txt', out_diff_path))
def test_unmask(self):
    """
    Unmutate with the whole 'input.diff.txt' (all mapped) and compare
    the produced SAM with 'desired.sam'.
    """
    res = self.RESOURCE_PATH
    out_bam_path = res + 'output.bam'
    out_sam_path = res + 'output.sam'

    with BdiffIO.from_text_file(res + 'input.diff.txt') as bdiff_file:
        self.mut.unmutate(bdiff_file=bdiff_file,
                          out_bam_filename=out_bam_path)

    bam2sam(out_bam_path, out_sam_path)
    self.assertEqual(True, filecmp.cmp(res + 'desired.sam', out_sam_path))
def _mask_indel_pos(self, bdiff_io: bdiff.BdiffIO, allele_queue: list,
                    variant: VariantOccurrence, rnd: VeryRandom) -> bool:
    """
    Mask all alleles queued for one INDEL position.

    A mutation map is built from the allele frequencies of the pileup and
    from the frequencies stored with the variant. The map is applied to
    every queued allele and, when at least one allele was masked, it is
    written to the BDIFF as an INDEL record.

    :param bdiff_io: BDIFF the INDEL record is written to
    :param allele_queue: aligned alleles covering the variant
    :param variant: variant with alleles, freqs, ref_allele and pos.index
    :param rnd: random number generator passed to cmn.indel_mut_map
    :return: True if at least one allele has been masked
    """
    private_freqs = cmn.freq_map(pileup_alleles(allele_queue))
    public_freqs = dict(zip(variant.alleles, variant.freqs))
    mut_map = cmn.indel_mut_map(private_freq_map=private_freqs,
                                public_freq_map=public_freqs,
                                rnd=rnd)

    is_masked = False
    for aligned_allele in allele_queue:  # type: AlleleAlignment
        # "|=" does not short-circuit - every allele must be processed
        is_masked |= self._mutate_allele(aligned_allele, mut_map)

    if is_masked:
        bdiff_io.write_indel(variant.pos.index, variant.ref_allele, mut_map)

    return is_masked
def test_empty_file(self):
    """
    Check that a BDIFF without records reports no indices and that no
    lookup finds a position.
    """
    reader = BdiffIO().readcopy()

    self.assertTrue(reader.is_empty())
    self.assertIsNone(reader.first_index)
    self.assertIsNone(reader.last_index)

    queried_indices = (0, 1000)
    for index in queried_indices:
        self.assertIsNone(reader.tell_index_gte(index))
    for index in queried_indices:
        self.assertIsNone(reader.tell_index_lte(index))
def test_from_text_file(self):
    """
    Load 'input.diff.txt' with BdiffIO.from_text_file and check the
    header, the index properties, the counts and every record.
    """
    homo = ZygosityChange.HOMO_TO_HOMO.value
    hetero = ZygosityChange.HOMO_TO_HETERO.value
    expected_header = {
        "_from_index": 0,
        "_to_index": 19999,
        "mb_checksum": "f05027eaa1923a6aab76bf7ead4fe976",
        "secret": "ffffffffffffffffffffffffffffffff"
    }
    # records in file order
    expected_records = (
        (11011, True, 2, homo,
         ['A', 'G', 'T', 'C'], ['A', 'G', 'T', 'C'], ()),
        (11015, True, 3, hetero,
         ['C', 'A', 'T', 'G'], ['C', 'A', 'G', 'T'], (2, 4, 6)),
        (11020, True, 2, homo,
         ['G', 'T', 'A', 'C'], ['G', 'T', 'A', 'C'], ()),
        (11027, False, 2, homo,
         ['A', 'AGT', 'AG'], ['A', 'AGT', 'AG'], ()),
        (11031, True, 2, homo,
         ['G', 'A', 'T', 'C'], ['G', 'A', 'T', 'C'], ()),
        (11037, False, 0, homo,
         ['GCG', 'G'], ['GCG', 'G'], ()),
    )

    bdiff_file = BdiffIO.from_text_file(self.RESOURCE_PATH + 'input.diff.txt')
    reader = BdiffIO(bdiff_file)

    self.assertDictEqual(expected_header, reader.header)
    self.assertEqual(1000, reader.index_resolution)
    self.assertEqual(11011, reader.first_index)
    self.assertEqual(11037, reader.last_index)
    self.assertEqual(4, reader.snv_count)
    self.assertEqual(2, reader.indel_count)

    for expected in expected_records:
        self.assertTupleEqual(expected, reader.read_record())
def test_unmutate_01(self):
    """
    Unmutate with the whole 'input.diff.txt' (all mapped), check the
    statistics and compare the produced SAM with 'desired_01.sam'.
    """
    res = self.RESOURCE_PATH
    out_bam_path = res + 'output_01.bam'
    out_sam_path = res + 'output_01.sam'

    with BdiffIO.from_text_file(res + 'input.diff.txt') as bdiff_file:
        self.mut.unmutate(bdiff_file=bdiff_file,
                          out_bam_filename=out_bam_path)

    expected_stats = (
        (17, BamMutator.STAT_ALIGNMENT_COUNT),
        (11, BamMutator.STAT_COVERING_COUNT),
        (8, BamMutator.STAT_MUT_COUNT),
        (5, BamMutator.STAT_DIFF_COUNT),
        (8, BamMutator.STAT_ALIGNMENT_MUT_COUNT),
    )
    for expected, stat_id in expected_stats:
        self.assertEqual(expected, self.mut.stat(stat_id))

    bam2sam(out_bam_path, out_sam_path)
    self.assertEqual(True, filecmp.cmp(res + 'desired_01.sam', out_sam_path))
def test_unmutate_07(self):
    """
    Unmutate with unmapped_only=True, check the statistics and compare
    the produced SAM with 'desired_07.sam'.
    """
    res = self.RESOURCE_PATH
    out_bam_path = res + 'output_07.bam'
    out_sam_path = res + 'output_07.sam'

    with BdiffIO.from_text_file(res + 'input.diff.txt') as bdiff_file:
        self.mut.unmutate(bdiff_file=bdiff_file,
                          out_bam_filename=out_bam_path,
                          unmapped_only=True)

    expected_stats = (
        (4, BamMutator.STAT_ALIGNMENT_COUNT),
        (0, BamMutator.STAT_COVERING_COUNT),
        (0, BamMutator.STAT_MUT_COUNT),
        # only the first diff record is read
        (1, BamMutator.STAT_DIFF_COUNT),
        (0, BamMutator.STAT_ALIGNMENT_MUT_COUNT),
    )
    for expected, stat_id in expected_stats:
        self.assertEqual(expected, self.mut.stat(stat_id))

    bam2sam(out_bam_path, out_sam_path)
    self.assertTrue(filecmp.cmp(res + 'desired_07.sam', out_sam_path))
def test_unmutate_05(self):
    """
    Unmutate with include_unmapped=True, check the statistics and compare
    the produced SAM with 'desired_05.sam'.
    """
    res = self.RESOURCE_PATH
    out_bam_path = res + 'output_05.bam'
    out_sam_path = res + 'output_05.sam'

    with BdiffIO.from_text_file(res + 'input.diff.txt') as bdiff_file:
        self.mut.unmutate(bdiff_file=bdiff_file,
                          out_bam_filename=out_bam_path,
                          include_unmapped=True)

    expected_stats = (
        (21, BamMutator.STAT_ALIGNMENT_COUNT),
        # one synonymous mutation is present
        (11, BamMutator.STAT_COVERING_COUNT),
        (8, BamMutator.STAT_MUT_COUNT),
        (5, BamMutator.STAT_DIFF_COUNT),
        (8, BamMutator.STAT_ALIGNMENT_MUT_COUNT),
    )
    for expected, stat_id in expected_stats:
        self.assertEqual(expected, self.mut.stat(stat_id))

    bam2sam(out_bam_path, out_sam_path)
    self.assertTrue(filecmp.cmp(res + 'desired_05.sam', out_sam_path))
def test_unmutate_06(self):
    """
    Unmutate a range on 'chr2' with include_unmapped=True, check the
    statistics and compare the produced SAM with 'desired_06.sam'.
    """
    res = self.RESOURCE_PATH
    out_bam_path = res + 'output_06.bam'
    out_sam_path = res + 'output_06.sam'

    range_kwargs = dict(
        start_ref_name='chr2',
        start_ref_pos=1015,  # inclusive
        end_ref_name='chr2',
        end_ref_pos=1028,  # non-inclusive
    )
    with BdiffIO.from_text_file(res + 'input.diff.txt') as bdiff_file:
        self.mut.unmutate(bdiff_file=bdiff_file,
                          out_bam_filename=out_bam_path,
                          include_unmapped=True,
                          **range_kwargs)

    expected_stats = (
        (13, BamMutator.STAT_ALIGNMENT_COUNT),
        (3, BamMutator.STAT_COVERING_COUNT),
        (2, BamMutator.STAT_MUT_COUNT),
        (1, BamMutator.STAT_DIFF_COUNT),
        (2, BamMutator.STAT_ALIGNMENT_MUT_COUNT),
    )
    for expected, stat_id in expected_stats:
        self.assertEqual(expected, self.mut.stat(stat_id))

    bam2sam(out_bam_path, out_sam_path)
    self.assertTrue(filecmp.cmp(res + 'desired_06.sam', out_sam_path))
def test_file(self):
    """
    Testing only branch with inner method BdiffIO._file_from_slice.

    Slices of the shared fixture are requested through a header with
    FROM_INDEX / TO_INDEX; rejected ranges must raise and accepted ones
    must yield a BDIFF with the requested header and expected content.
    """
    def range_header(from_index, to_index):
        # fresh dict on every call - expected and passed headers stay separate
        return {
            BdiffIO.FROM_INDEX: from_index,
            BdiffIO.TO_INDEX: to_index
        }

    reader = BdiffIO(self._bdiff_file)

    # (error, from_index, to_index)
    rejected = (
        (AssertionError, 2000, 1000),  # invalid range
        (IndexError, 1071, 2000),  # out of range
        (IndexError, 0, 1009),  # out of range
    )
    for error, from_index, to_index in rejected:
        self.assertRaises(
            error, reader.file, range_header(from_index, to_index), False)

    # (from_index, to_index, first_index, last_index, snv_count, indel_count)
    accepted = (
        (0, 2000, 1010, 1070, 4, 3),  # full range
        (1010, 1070, 1010, 1070, 4, 3),  # exact range
        (1020, 1060, 1020, 1060, 2, 3),  # inner range
        (0, 1020, 1010, 1020, 2, 0),  # left intersect range
        (0, 1010, 1010, 1010, 1, 0),  # left intersect range
        (1060, 2000, 1060, 1070, 1, 1),  # right intersect range
        (1070, 2000, 1070, 1070, 1, 0),  # right intersect range
    )
    for from_index, to_index, first, last, snv_count, indel_count in accepted:
        sliced = BdiffIO(
            reader.file(range_header(from_index, to_index), False))
        self.assertDictEqual(range_header(from_index, to_index), sliced.header)
        self.assertEqual(first, sliced.first_index)
        self.assertEqual(last, sliced.last_index)
        self.assertEqual(snv_count, sliced.snv_count)
        self.assertEqual(indel_count, sliced.indel_count)
def test_tell_range(self):
    """
    Check BdiffIO.tell_range on the shared fixture.

    Rejected ranges must raise; for accepted ranges the result must be
    equal to the positions given by self._range2pos for the expected
    first and last record.
    """
    reader = BdiffIO(self._bdiff_file)

    # invalid range
    self.assertRaises(AssertionError, reader.tell_range, 2000, 1000)
    # ranges which contain no record
    for from_index, to_index in ((0, 1000), (2000, 3000),
                                 (0, 1009), (1071, 2000)):
        self.assertRaises(IndexError, reader.tell_range, from_index, to_index)

    reader = BdiffIO(self._bdiff_file)
    # (first record, last record, from_index, to_index)
    accepted = (
        # both bounds given
        (1, 7, 1000, 2000),
        (1, 7, 1010, 1070),
        (1, 2, 1010, 1020),
        (2, 3, 1020, 1030),
        (3, 4, 1030, 1040),
        (1, 1, 1010, 1019),
        (1, 1, 1010, 1010),
        (2, 6, 1011, 1069),
        # right outer range
        (1, 7, 1010, 3000),
        (2, 7, 1020, 3000),
        (3, 7, 1030, 3000),
        # left outer range
        (1, 1, 1000, 1010),
        (1, 2, 1000, 1020),
        (1, 3, 1000, 1030),
    )
    for first_id, last_id, from_index, to_index in accepted:
        self.assertTupleEqual(self._range2pos(first_id, last_id),
                              reader.tell_range(from_index, to_index))
def test_tell_index_lte(self):
    """
    Check BdiffIO.tell_index_lte on the shared fixture.

    An index lower than the first record must give None; otherwise the
    result must be equal to self._record2pos of the expected record.
    """
    reader = BdiffIO(self._bdiff_file)

    # before the first record
    self.assertIsNone(reader.tell_index_lte(1008))

    # (expected 1-based record id, queried index)
    cases = (
        (1, 1010),  # first
        (1, 1011),  # one after first
        (1, 1015),  # between first and second
        (1, 1019),  # one before second
        (2, 1020),  # second
        (2, 1021),  # one after second
        (2, 1029),  # one before third
        (3, 1030),  # third
        (3, 1031),  # one after third
        (6, 1069),  # one before last
        (7, 1070),  # last
        (7, 1071),  # one after last
    )
    for record_id, index in cases:
        self.assertEqual(self._record2pos(record_id),
                         reader.tell_index_lte(index))
def reencrypt(self,
              rsa_key: RSA,
              rsa_enc_key: RSA,
              bam_filename: str,
              enc_diff_filename: str,
              out_enc_diff_filename: str,
              start_ref_name: str = None,
              start_ref_pos: int = None,
              end_ref_name: str = None,
              end_ref_pos: int = None,
              include_unmapped: bool = False,
              unmapped_only: bool = False,
              rsa_ver_key: RSA = None):
    """
    Reencrypt DIFF file with supplied public_key.
    Output formats: .diff.enc

    The encrypted DIFF is decrypted and verified, its header is narrowed
    to the effective range, and the result is signed with rsa_key and
    encrypted for rsa_enc_key using the original AES key.

    :param rsa_key: private key to decrypt DIFF and sign new DIFF
    :param rsa_enc_key: public key to encrypt new DIFF
    :param bam_filename: mutated BAM
    :param enc_diff_filename: diff to reencrypt
    :param out_enc_diff_filename: reencrypted DIFF
    :param start_ref_name: inclusive
    :param start_ref_pos: 0-based, inclusive
    :param end_ref_name: inclusive
    :param end_ref_pos: 0-based, inclusive
    :param include_unmapped: Include all unplaced unmapped reads.
    :param unmapped_only: Only unmapped reads - both placed and unplaced.
    Overrides other parameters.
    :param rsa_ver_key: RSA key with public key to verify DIFF
    :raises ValueError: if checksum of the BAM differs from the checksum
    stored in the DIFF header, or if unmapped reads are requested and the
    DIFF header does not contain the secret
    """
    # TODO verify if bam_filename is mutated

    # make sure that BAM is indexed
    pysam.index(bam_filename)

    with open(enc_diff_filename, 'rb') as enc_diff_file, \
            io.BytesIO() as diff_file:
        aes_key = self._read_aes_key(enc_diff_file, rsa_key)
        signature = self._read_signature(enc_diff_file)
        self._decrypt(enc_diff_file, aes_key, diff_file)
        self._verify(diff_file, signature, rsa_ver_key)

        bdiff = BdiffIO(diff_file)
        bam_mut = BamMutator(bam_filename)
        if bam_mut.checksum != bdiff.header.get(
                BamMutator.BDIFF_CHECKSUM_TAG):
            # checksum mismatch
            raise ValueError(
                "Provided BDIFF is not associated with this BAM."
                " Reason: checksum mismatch.")

        from_index, to_index = bam_mut.resolve_range(
            bdiff.header[BdiffIO.FROM_INDEX],
            bdiff.header[BdiffIO.TO_INDEX],
            start_ref_name, start_ref_pos,
            end_ref_name, end_ref_pos)

        # use actual effective range
        bdiff.header[BdiffIO.FROM_INDEX] = from_index
        bdiff.header[BdiffIO.TO_INDEX] = to_index

        if (unmapped_only or include_unmapped) \
                and BamMutator.BDIFF_SECRET_TAG not in bdiff.header:
            raise ValueError(
                'BDIFF must contain secret to decrypt unmapped reads.')

        if unmapped_only:
            # range is not stored when only unmapped reads are requested
            del bdiff.header[BdiffIO.FROM_INDEX]
            del bdiff.header[BdiffIO.TO_INDEX]
        elif not include_unmapped:
            # mapped only - the secret must not be passed on; it may
            # already be absent (e.g. DIFF reencrypted as mapped only
            # before), so a missing secret is not an error here
            bdiff.header.pop(BamMutator.BDIFF_SECRET_TAG, None)

        out_diff = bdiff.file(bdiff.header)

        with out_diff, \
                open(out_enc_diff_filename, 'wb') as out_enc_diff_file:
            out_signature = self._sign(out_diff, rsa_key)
            self._write_aes_key(out_enc_diff_file, aes_key, rsa_enc_key)
            self._write_signature(out_enc_diff_file, out_signature)
            self._encrypt(out_diff, aes_key, out_enc_diff_file)
def _mask_snv_pos(self, bdiff_io: bdiff.BdiffIO,
                  allele_queue: List[AlleleAlignment],
                  variant: VariantOccurrence, rng: VeryRandom) -> bool:
    """
    Mask all alleles queued for one SNV position.

    A pair of masking maps is created from the private allele pair and a
    public allele pair drawn with rng. Which map is applied to which
    allele depends on the zygosity change; indices of alleles masked with
    the second map are collected as beta indices. An SNV BDIFF record is
    written when at least one allele was masked.

    :param bdiff_io: BDIFF the SNV record is written to
    :param allele_queue: aligned alleles covering the variant
    :param variant: variant with freqs, ref_allele and pos.index
    :param rng: random number generator
    :return: True if at least one allele has been masked
    """
    private_a, private_b = self._private_allele_pair(allele_queue)
    if private_a is None or private_b is None:
        # personal alleles are not found - nothing to mask
        return False

    public_a, public_b = self._public_allele_pair(variant.freqs, rng)
    mask_map_a, mask_map_b, zygosity = self._create_masking(
        private_a, private_b, public_a, public_b)

    is_masked = False
    beta_indices = []
    if zygosity == ZygosityChange.HOMO_TO_HETERO:
        for i, aligned_allele in enumerate(allele_queue):
            # use random masking map from the pair
            if rng.random() < 0.5:
                is_masked |= self._mutate_allele(aligned_allele, mask_map_a)
            else:
                is_masked |= self._mutate_allele(aligned_allele, mask_map_b)
                beta_indices.append(i)

    elif zygosity == ZygosityChange.HETERO_TO_HOMO:
        for i, aligned_allele in enumerate(allele_queue):
            if aligned_allele.allele == private_b:
                # secondary allele
                is_masked |= self._mutate_allele(aligned_allele, mask_map_b)
                beta_indices.append(i)
            else:
                is_masked |= self._mutate_allele(aligned_allele, mask_map_a)

    else:
        # zygosity is preserved - only the first map is used
        for aligned_allele in allele_queue:
            is_masked |= self._mutate_allele(aligned_allele, mask_map_a)

    if is_masked:
        # at least one alignment has been masked
        perm_order = ('A', 'T', 'G', 'C')
        bdiff_io.write_snv(
            index=variant.pos.index,
            ref_id=cmn.BASES.index(variant.ref_allele),
            zygosity=zygosity,
            perm_a=[mask_map_a[base] for base in perm_order],
            perm_b=[mask_map_b[base] for base in perm_order],
            beta_indices=beta_indices)

    return is_masked
def unmutate(self,
             bdiff_file: io.BytesIO,
             out_bam_filename: str,
             start_ref_name: str = None,
             start_ref_pos: int = None,
             end_ref_name: str = None,
             end_ref_pos: int = None,
             include_unmapped: bool = False,
             unmapped_only: bool = False):
    """
    Unmutate BAM file in range specified by DIFF file or by parameters.

    The BAM of this mutator is read, the output BAM is written with the
    header returned by bam.unmut_header, and the counters of the run are
    stored in self._stats afterwards.

    :param bdiff_file: BDIFF file, read through BdiffIO
    :param out_bam_filename: BAM file opened for writing
    :param start_ref_name: inclusive
    :param start_ref_pos: 0-based, inclusive
    :param end_ref_name: inclusive
    :param end_ref_pos: 0-based, inclusive
    :param include_unmapped: Include all unplaced unmapped reads.
    :param unmapped_only: Only unmapped reads - both placed and unplaced.
    Overrides other parameters.
    :raises ValueError: if unmapped reads are requested and no secret is
    extracted from the BDIFF, or if checksum of this BAM differs from the
    checksum stored in the BDIFF header

    When range is supplied partially covered reads are also included,
    but only variants within range are unmutated.
    """
    # statistics of a previous run are discarded; they stay empty
    # when this run fails
    self._stats = {}

    with bam.open_bam(self._bam_filename, 'rb') as bam_file:  # type: pysam.AlignmentFile
        header = bam.unmut_header(bam_file.header)
        mut = Mutator(fai=self._fai, verbose=self._verbose)

        with bam.open_bam(out_bam_filename, 'wb', header=header) as out_bam_file:
            bdiff_io = BdiffIO(bdiff_file)
            if self._verbose:
                print('SNV diff count %d' % bdiff_io.snv_count)
                print('INDEL diff count %d' % bdiff_io.indel_count)

            secret = self.extract_secret_bytes(bdiff_io)
            if (include_unmapped or unmapped_only) and secret is None:
                raise ValueError(
                    'BDIFF must contain secret to decrypt unmapped reads.')

            # validate checksum
            # NOTE: missing checksum tag raises KeyError here
            if self.checksum != bdiff_io.header[self.BDIFF_CHECKSUM_TAG]:
                # both checksums are printed before the error is raised
                print(self.checksum)
                print(bdiff_io.header[self.BDIFF_CHECKSUM_TAG])
                raise ValueError('BDIFF does not refer to this BAM')

            # TODO user friendly exception on missing bdiff_io header value
            start_index, end_index = self.resolve_range(
                bdiff_from_index=bdiff_io.header[BdiffIO.FROM_INDEX],
                bdiff_to_index=bdiff_io.header[BdiffIO.TO_INDEX],
                start_ref_name=start_ref_name,
                start_ref_pos=start_ref_pos,
                end_ref_name=end_ref_name,
                end_ref_pos=end_ref_pos)

            # TODO move iterators to with statement
            # the input BAM is iterated by its filename, bam_file above
            # is used only for the header
            mut.unmutate(
                bam_iter=iters.bam_iterator(self._bam_filename,
                                            start_index,
                                            end_index,
                                            unmapped_only,
                                            include_unmapped),
                bdiff_iter=iters.BdiffIterator(bdiff_io=bdiff_io,
                                               fai=self._fai,
                                               start_index=start_index,
                                               end_index=end_index),
                out_bam_file=out_bam_file,
                secret=secret)

    # counters of the finished run, accessible by their STAT_* keys
    self._stats = {
        self.STAT_ALIGNMENT_COUNT: mut.alignment_counter,
        self.STAT_COVERING_COUNT: mut.covering_counter,
        self.STAT_MUT_COUNT: mut.mut_counter,
        self.STAT_DIFF_COUNT: mut.diff_counter,
        self.STAT_ALIGNMENT_MUT_COUNT: mut.alignment_mut_counter
    }
def test_file_index(self):
    """
    Check the file index of a BDIFF written with index_resolution=3.

    Ten SNV records with indices 1000, 1010, ..., 1090 are written; the
    stored file index and the positions returned by _indexed_pos are
    then verified on a reader of the produced file.
    """
    write_io = BdiffIO(index_resolution=3)
    for index in range(1000, 1100, 10):
        self._write_bdiff_record(write_io, index, 0, ['C', 'A', 'T', 'G'])

    read_io = BdiffIO(write_io.file())

    self.assertEqual(1000, read_io.first_index)
    self.assertEqual(1090, read_io.last_index)
    self.assertEqual(10, read_io.snv_count)
    self.assertEqual(0, read_io.indel_count)
    self.assertEqual(True, read_io.is_read_mode)
    self.assertEqual(3, read_io.index_resolution)
    self.assertListEqual([(1020, 16), (1050, 40), (1080, 64)],
                         read_io.file_index)

    # (index of the record expected at the position, queried index)
    cases = (
        (1000, 0),
        (1000, 1000),
        (1000, 1019),
        (1020, 1020),
        (1020, 1021),
        (1020, 1049),
        (1050, 1050),
        (1000, 1010),
    )
    for record_index, queried_index in cases:
        self.assertEqual(self._index2pos(read_io, record_index),
                         read_io._indexed_pos(queried_index))