def test_mutate_02(self):
    """
    Mutate using the ``input_02`` VAC fixture and verify the results.

    Checks the mutator statistics, then compares the produced SAM and
    textual DIFF against the ``desired_02`` fixtures.
    """
    # EOF VAC case
    vac_path = self.RESOURCE_PATH + 'input_02.vac'
    out_bam_path = self.RESOURCE_PATH + 'output_02.bam'
    out_sam_path = self.RESOURCE_PATH + 'output_02.sam'
    out_diff_path = self.RESOURCE_PATH + 'output_02.diff.txt'

    # build the binary VAC from its text form
    Vac.text2vac(self.RESOURCE_PATH + 'input_02.vac.txt', vac_path)

    bdiff_file = self._mut.mutate(
        vac_filename=vac_path,
        mut_bam_filename=out_bam_path,
        secret=self.SECRET,
        mut_p=0,
        rng=self._rnd
    )

    # expected value of each mutator statistic, checked in this order
    expected_stats = (
        (21, BamMutator.STAT_ALIGNMENT_COUNT),
        (13, BamMutator.STAT_COVERING_COUNT),
        (7, BamMutator.STAT_VAC_COUNT),
        (6, BamMutator.STAT_MUT_COUNT),
        (4, BamMutator.STAT_DIFF_COUNT),
        (6, BamMutator.STAT_ALIGNMENT_MUT_COUNT),
    )
    for expected, stat_id in expected_stats:
        self.assertEqual(expected, self._mut.stat(stat_id))

    # mutated alignments must match the desired SAM
    cmn.bam2sam(out_bam_path, out_sam_path)
    sam_matches = filecmp.cmp(
        self.RESOURCE_PATH + 'desired_02.sam',
        out_sam_path
    )
    self.assertTrue(sam_matches)

    # DIFF rendered as text must match the desired DIFF
    BdiffIO.to_text_file(bdiff_file, out_diff_path)
    is_equal = filecmp.cmp(self.RESOURCE_PATH + 'desired_02.diff.txt', out_diff_path)
    self.assertTrue(is_equal)
def test1_encrypt(self):
    """
    Prepare the ``encrypt`` fixtures and run ``self.locker.encrypt`` on them.

    Converts the input SAM to an indexed BAM, builds the binary VAC, loads the
    RSA key pair, encrypts, then indexes the mutated BAM and dumps it as SAM.
    """
    in_bam = self.RESOURCE_PATH + 'encrypt/input.bam'
    in_vac = self.RESOURCE_PATH + 'encrypt/input.vac'
    mut_bam = self.RESOURCE_PATH + 'encrypt/output.mut.bam'

    # input fixtures: SAM -> indexed BAM, text VAC -> binary VAC
    cmn.sam2bam(self.RESOURCE_PATH + 'encrypt/input.sam', in_bam)
    pysam.index(in_bam)
    Vac.text2vac(self.RESOURCE_PATH + 'encrypt/input.vac.txt', in_vac)

    with open(self.RESOURCE_PATH + 'admin', 'r') as key_file, \
            open(self.RESOURCE_PATH + 'admin.pub', 'r') as pub_key_file:
        # the private key is passphrase protected, the public key is not
        private_pem = key_file.read()
        rsa_key = RSA.importKey(private_pem, passphrase=self.KEY_PASS)
        public_pem = pub_key_file.read()
        rsa_pub_key = RSA.importKey(public_pem)

        # creates DIFF with secret
        self.locker.encrypt(
            rsa_sign_key=rsa_key,
            rsa_enc_key=rsa_pub_key,
            bam_filename=in_bam,
            vac_filename=in_vac,
            out_bam_filename=mut_bam,
            out_enc_diff_filename=self.RESOURCE_PATH + 'encrypt/output.diff.enc',
            mut_p=0)

    # make the mutated BAM inspectable
    pysam.index(mut_bam)
    cmn.bam2sam(mut_bam, self.RESOURCE_PATH + 'encrypt/output.mut.sam')
def create_vac(self, bam_filename: str, vcf_filename: str, out_vac_filename: str, ref_fasta_filename: str, skip_indels: bool):
    """
    Create a VAC file from a VCF file.

    BAM and VCF should use same reference genome.
    VCF must contain INFO column with sub-fields AC and AN.
    :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
    :param vcf_filename: filename of the input VCF file
    :param out_vac_filename: filename of the output VAC file
    :param ref_fasta_filename: filename to reference FASTA file;
        when None, no reference is loaded and None is passed on to ``Vac.vcf2vac``
    :param skip_indels: whether to skip indels and keep only SNPs
    """
    # TODO use fasta index / vcf header instead of BAM header

    # load the reference FASTA
    ref_fasta = None
    if ref_fasta_filename is not None:
        if self._verbose:
            print('--- Loading Reference Fasta ---')
        ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

    try:
        # open all files and create the VAC file
        if self._verbose:
            print('--- Processing VCF %s ---' % vcf_filename)

        with pysam.VariantFile(vcf_filename) as vcf_file, \
                open_bam(bam_filename, 'rb') as sam_file, \
                open(out_vac_filename, 'wb') as out_vac_file:
            vac = Vac(FastaIndex.from_bam(sam_file), self._verbose)
            vac.vcf2vac(vcf_file, out_vac_file, ref_fasta, skip_indels)
    finally:
        # release the reference FASTA handle even when the conversion fails
        if ref_fasta is not None:
            ref_fasta.close()
def vac2df(filename: str) -> pd.DataFrame:
    """
    Load a VAC file into a DataFrame via its text representation.

    The text dump is written next to the input, with the extension replaced
    by ``.txt``, and is left on disk.

    :param filename: filename of the VAC file
    :return: DataFrame with columns ``ref_id`` and ``counts``, indexed by the
        first column of the text dump; the first two lines of the dump are skipped
    """
    stem, _extension = os.path.splitext(filename)
    text_filename = stem + '.txt'
    Vac.vac2text(filename, text_filename)

    frame = pd.read_csv(
        text_filename,
        sep='\t',
        skiprows=2,
        index_col=0,
        names=['ref_id', 'counts']
    )
    return frame
def test_mask(self):
    """
    Mutate using the ``input`` VAC fixture and compare SAM and DIFF output
    against the ``desired`` fixtures.
    """
    # EOF BAM case
    vac_path = self.RESOURCE_PATH + 'input.vac'
    out_bam_path = self.RESOURCE_PATH + 'output.bam'
    out_sam_path = self.RESOURCE_PATH + 'output.sam'
    out_diff_path = self.RESOURCE_PATH + 'output.diff.txt'

    # build the binary VAC from its text form
    Vac.text2vac(self.RESOURCE_PATH + 'input.vac.txt', vac_path)

    bdiff_file = self._mut.mutate(
        vac_filename=vac_path,
        mut_bam_filename=out_bam_path,
        secret=self.SECRET,
        mut_p=0,
        rng=self._rng)

    # render both outputs in their text forms before comparing
    cmn.bam2sam(out_bam_path, out_sam_path)
    BdiffIO.to_text_file(bdiff_file, out_diff_path)

    sam_matches = filecmp.cmp(self.RESOURCE_PATH + 'desired.sam', out_sam_path)
    self.assertTrue(sam_matches)

    diff_matches = filecmp.cmp(self.RESOURCE_PATH + 'desired.diff.txt', out_diff_path)
    self.assertTrue(diff_matches)
def test_io(self):
    """
    Read back the in-memory VAC built by ``__build_vac_file``.

    Expects the header, three SNV records, three indel records, and finally
    an ``EOFError`` when one more SNV record is requested.
    """
    vac_file = self.__build_vac_file()

    self.assertTupleEqual((3, 3), Vac.read_header(vac_file))

    # records are consumed sequentially, so table order matters
    expected_snvs = (
        (2000000000, 0, [3, 2, 1, 0]),
        (2000000002, 1, [0, 1, 2, 3]),
        (2000000004, 2, [1, 1, 1, 1]),
    )
    for expected in expected_snvs:
        self.assertTupleEqual(expected, Vac.read_snv_record(vac_file))

    expected_indels = (
        (2000000001, [10, 1], ['A', 'ATCG']),
        (2000000003, [10, 1], ['AT', 'ATCGT']),
        (2000000005, [10, 10, 0], ['AAAA', 'ATCG', 'A']),
    )
    for expected in expected_indels:
        self.assertTupleEqual(expected, Vac.read_indel_record(vac_file))

    # nothing is left to read
    with self.assertRaises(EOFError):
        Vac.read_snv_record(vac_file)
def _read_snv(self) -> po.VariantOccurrence:
    """
    Read the next SNV record from the SNV file handle.

    :return: the next SNV as a VariantOccurrence, or None once the SNV
        counter has run out (the SNV file handle is closed in that case)
    """
    if self._snv_count <= 0:
        # no SNV records left
        self._snv_file.close()
        return None

    self._snv_count -= 1
    index, ref_id, freqs = Vac.read_snv_record(self._snv_file)
    ref_name, ref_pos = self._fai.index2pos(index)
    position = po.GenomicPosition(index, ref_name, ref_pos)

    # an SNV record stores the reference allele as an index into BASES
    return po.VariantOccurrence(
        position=position,
        vtype=po.VariantType.SNV,
        freqs=freqs,
        alleles=BASES,
        ref_allele=BASES[ref_id]
    )
def _read_indel(self) -> po.VariantOccurrence:
    """
    Read the next indel record from the indel file handle.

    :return: the next indel as a VariantOccurrence, or None once the indel
        counter has run out (the indel file handle is closed in that case)
    """
    if self._indel_count <= 0:
        # no indel records left
        self._indel_file.close()
        return None

    self._indel_count -= 1
    index, counts, seqs = Vac.read_indel_record(self._indel_file)
    ref_name, ref_pos = self._fai.index2pos(index)
    position = po.GenomicPosition(index, ref_name, ref_pos)

    # the first sequence of the record is used as the reference allele
    return po.VariantOccurrence(
        position=position,
        vtype=po.VariantType.INDEL,
        freqs=counts,
        alleles=seqs,
        ref_allele=seqs[0]
    )
def __init__(self, vac_filename: str, fai: FastaIndex):
    """
    Open a VAC file for reading its SNV and indel sections side by side.

    Two independent handles are kept on the same file: one positioned at the
    first SNV record and one at the first indel record. The first record of
    each section is read immediately.

    :param vac_filename: filename of the VAC file
    :param fai: does not affect bases listed in VAC file
    """
    # read header
    with open(vac_filename, 'rb') as header_file:
        snv_count, indel_count = Vac.read_header(header_file)
    self._snv_count = snv_count
    self._indel_count = indel_count

    # SNV records start right after the header
    self._snv_file = open(vac_filename, 'rb')
    self._snv_file.seek(Vac.HEADER_SIZE)

    # indel records follow the fixed-size SNV records
    indel_offset = Vac.HEADER_SIZE + self._snv_count * Vac.SNV_RECORD_SIZE
    self._indel_file = open(vac_filename, 'rb')
    self._indel_file.seek(indel_offset)

    self._fai = fai
    self._counter = 0

    # init iteration
    self._snv_variant = self._read_snv()
    self._indel_variant = self._read_indel()
def test_is_indel(self):
    """
    Check ``Vac.is_indel`` against allele lists with a known expected answer.

    Each case pairs the expected truthiness with the allele list; cases are
    checked in order and the first mismatch fails the test.
    """
    cases = (
        # single-base alleles only
        (False, ['A', 'T']),
        (False, ['A', 'T', 'G', 'C']),
        # at least one multi-base allele
        (True, ['A', 'TT', 'G', 'C']),
        (True, ['A', 'T', 'GG', 'CC']),
        (True, ['AA', 'T', 'G', 'C']),
        (True, ['AA', 'TT', 'GG', 'CC']),
        # lists containing '.' or '*'
        (False, ['AA', '.']),
        (False, ['AA', '*']),
        (False, ['AA', '*', 'T']),
        (False, ['AA', 'T', '*']),
        (False, ['A', '*', 'TT']),
        (False, ['A', 'TT', '*']),
        # lists containing 'N'
        (False, ['N', 'AA']),
        (False, ['AA', 'N']),
        (False, ['AA', 'T', 'N']),
        (False, ['AA', 'N', 'T']),
        (False, ['A', 'TT', 'N']),
        (False, ['A', 'N', 'TT']),
        (False, ['A', 'TN', 'G']),
        (False, ['A', 'G', 'TN']),
        # lists containing bracketed '<...>' alleles
        (False, ['AA', '<CN1>', '<CN2>']),
        (False, ['AA', '<INS:ME:ALU>']),
    )
    for expected, alleles in cases:
        if expected:
            self.assertTrue(Vac.is_indel(alleles))
        else:
            self.assertFalse(Vac.is_indel(alleles))
def test_parse_an(self):
    """``Vac.parse_an`` must return the AN value from a list of INFO sub-fields."""
    info_fields = ['AC=5,3', 'AN=5008', 'NS=2504']
    parsed_an = Vac.parse_an(info_fields)
    self.assertEqual(5008, parsed_an)
def test_parse_ac(self):
    """``Vac.parse_ac`` must return the AC values from a list of INFO sub-fields as a tuple."""
    info_fields = ['AC=5,3', 'AN=5008', 'NS=2504']
    parsed_ac = Vac.parse_ac(info_fields)
    self.assertTupleEqual((5, 3), parsed_ac)
def __build_vac_file(cls):
    """
    Build an in-memory VAC file with three SNV and three indel records.

    :return: BytesIO holding the header, the SNV records and then the indel
        records, rewound to the beginning
    """
    snv_records = (
        (2000000000, 0, (3, 2, 1, 0)),
        (2000000002, 1, (0, 1, 2, 3)),
        (2000000004, 2, (1, 1, 1, 1)),
    )
    indel_records = (
        (2000000001, ((10, 'A'), (1, 'ATCG'))),
        (2000000003, ((10, 'AT'), (1, 'ATCGT'))),
        (2000000005, ((10, 'AAAA'), (10, 'ATCG'), (0, 'A'))),
    )

    buffer = io.BytesIO()
    Vac.write_header(buffer, 3, 3)

    # all SNV records are written before any indel record
    for index, ref_id, counts in snv_records:
        Vac._write_snv_record(buffer, index, ref_id, counts)
    for index, alleles in indel_records:
        Vac._write_indel_record(buffer, index, alleles)

    # rewind so that callers start reading at the header
    buffer.seek(0)
    return buffer
def vac(cls):
    """
    Create a ``Vac`` whose FASTA index is taken from the ``input.sam`` fixture.

    :return: Vac built from ``FastaIndex.from_bam`` of the opened fixture
    """
    sam_path = cls.RESOURCE_PATH + 'input.sam'
    with open_bam(sam_path, 'rb') as sam_file:
        # built while the file is still open, as the index is derived from it
        fai = FastaIndex.from_bam(sam_file)
        return Vac(fai)