def uncorr_and_corr_with_rg(tmp_path):
    """Write a one-read uncorrected/corrected FASTQ pair whose read name carries a read group.

    Both files hold a single record named ``foo/1_RG:Z:bar`` with the same
    quality string; only the sequence differs (``ATG`` uncorrected, ``ACG``
    corrected).

    :param tmp_path: directory (path-like supporting ``/``) that receives the files.
    :returns: tuple ``(uncorrected_path, corrected_path)``, both as ``str``.
    """
    read_name = 'foo/1_RG:Z:bar'
    phred = '((#'  # 7, 7, 2

    # Insertion order matters: uncorrected first, corrected second.
    bases_by_path = {
        tmp_path / 'uncorr_withrg.fq': 'ATG',
        tmp_path / 'corr_withrg.fq': 'ACG',
    }
    for fastq_path, bases in bases_by_path.items():
        record = pysam.FastxRecord(
            name=read_name, sequence=bases, quality=phred)
        with open(fastq_path, 'w') as handle:
            handle.write(str(record))

    uncorr_fastq, corrected_fastq = bases_by_path
    return str(uncorr_fastq), str(corrected_fastq)
def test_createProbesForGeneVariants_emptyVariants_returnEmptyProbes(self):
    """With no variants, probe creation returns the empty string."""
    query = Query(TEST_QUERY_VCF, TEST_QUERY_REF)
    blank_gene = pysam.FastxRecord()
    no_variants = []

    actual = query._create_probes_for_gene_variants(blank_gene, no_variants)

    assert actual == ""
def compress_seq(read):
    """Run-length encode the homopolymers of a basecall.

    Creates an RLE sequence record in which the quality string stores the
    run-length of each retained base.

    :param read: `pysam FastxRecord` object.

    :returns: `pysam FastxRecord`
    """
    logger = medaka.common.get_named_logger('Compress_basecalls')

    # Printable characters 33..126 (`!"#$%&`...); a run of length n is
    # written as symbols[n].
    symbols = ''.join(map(chr, range(33, 127)))
    encoded = RLEConverter(read.sequence)

    # Runs too long to index into `symbols` are clamped to its last entry.
    too_long = np.nonzero(encoded.homop_length >= len(symbols))[0]
    if len(too_long):
        template = ("Some homopolymers in {} are longer than the longest "
                    "supported length\n")
        logger.warning(template.format(read.name))
        encoded.homop_length[too_long] = len(symbols) - 1

    run_codes = ''.join(symbols[n] for n in encoded.homop_length)

    comment = read.comment
    if comment is None:
        comment = ''

    return pysam.FastxRecord(
        name=read.name,
        comment=comment,
        sequence=encoded.compact_basecall,
        quality=run_codes)
def test_fastx_record_can_be_created_from_scratch(self):
    """A blank record only renders once both its name and sequence are set."""
    record = pysam.FastxRecord()

    # Nothing set yet: rendering must fail.
    with self.assertRaises(ValueError):
        str(record)

    # A name alone is still not enough.
    record.set_name("name")
    with self.assertRaises(ValueError):
        str(record)

    record.set_sequence("sequence")
    self.assertEqual(str(record), ">name\nsequence")
def test_find_corrected_sites(simple_fastq_reads):
    """Check that ``find_corrected_sites`` flags exactly the edited base.

    For every input read a copy is made whose base at index 5 is replaced by
    ``'C'``. The expected result is a boolean array of the read's length that
    is True only at index 5.

    :param simple_fastq_reads: iterable of reads exposing ``name``,
        ``sequence`` and ``quality``; each sequence must have at least 6 bases.
    """
    for r in simple_fastq_reads:
        r2 = pysam.FastxRecord(
            name=r.name, sequence=r.sequence, quality=r.quality)
        edited_seq = list(r2.sequence)
        edited_seq[5] = 'C'
        r2.sequence = ''.join(edited_seq)

        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        correct = np.zeros(len(edited_seq), dtype=bool)
        correct[5] = True

        assert np.array_equal(recalibrate.find_corrected_sites(r, r2), correct)
def test_compress_read(self):
    """Compress a known record and verify every field of the RLE result."""
    read = pysam.FastxRecord(
        name='test', comment='runid=b81', sequence='ACCGTTTAC')
    compressed_read = medaka.rle.compress_seq(read)

    # Name and comment must pass through untouched; sequence and quality
    # carry the run-length encoding.
    expected_fields = (
        ('name', read.name),
        ('comment', read.comment),
        ('sequence', 'ACGTAC'),
        ('quality', '"#"$""'),
    )
    for field, expected in expected_fields:
        self.assertEqual(getattr(compressed_read, field), expected)
def test_recalibrate_fastq():
    """Recalibrate one three-base read and compare against fixed qualities.

    The only nonzero position and dinucleotide deltas sit at quality index 7.
    """
    read = pysam.FastxRecord(
        name='foo', sequence='ATG', quality='((#')  # 7, 7, 2

    meanq = np.array([10])
    globaldeltaq = np.array([1])
    qscoredeltaq = np.array([[2] * 8])

    positiondeltaq = np.zeros((1, 8, 6))
    positiondeltaq[0, 7, :] = 3

    dinucdeltaq = np.zeros([1, 8, 16])
    dinucdeltaq[0, 7, :] = 5

    recalibrated = compare_reads.recalibrate_fastq(
        read, meanq, globaldeltaq, qscoredeltaq, positiondeltaq, dinucdeltaq,
        np.array([0]), compare_reads.Dinucleotide.dinuc_to_int)

    assert np.array_equal(recalibrated, np.array([21, 21, 2]))
def _create_probes_for_gene_variants( self, gene: pysam.FastxRecord, variants: pysam.tabix_iterator) -> str: """Note: An assumption is made with this function that the variants you pass in are from the gene passed with them.""" probes = "" variants = [ entry for entry in variants if not is_invalid_vcf_entry(entry) ] intervals = [ self.calculate_probe_boundaries_for_entry(variant) for variant in variants ] intervals_to_probes = dict() for variant in variants: interval = self.calculate_probe_boundaries_for_entry(variant) if interval in intervals_to_probes and float( intervals_to_probes[interval].name.split( "=")[-1]) > get_genotype_confidence(variant): continue mutated_consensus = "" consensus = gene.sequence[slice(*interval)] last_idx = 0 start_idx_of_variant_on_consensus = variant.start - interval[0] mutated_consensus += consensus[ last_idx:start_idx_of_variant_on_consensus] mutated_consensus += get_variant_sequence(variant) last_idx = start_idx_of_variant_on_consensus + variant.rlen mutated_consensus += consensus[last_idx:] probe = pysam.FastxRecord() probe.set_name( f"{variant.chrom}_POS={variant.pos}_interval={interval}_GT_CONF={get_genotype_confidence(variant)}" .replace(" ", "")) probe.set_sequence(mutated_consensus) intervals_to_probes[interval] = probe for probe in intervals_to_probes.values(): probes += str(probe) + "\n" return probes
def test_mapProbesToPanel_oneRecordFirstBaseIsVariantSite():
    """One probe whose first base is the variant site: correct call, zero mismatches."""
    probe_id = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_id)
    record.set_sequence(
        "GACCTACACCGACGCCAAAGGCGAAAAACGCCCAATGTACCAAATCACCAAAAAC")

    actual = map_probes_to_panel(str(record), Path(TEST_PANEL))

    assert actual == dict(
        snps_called_correctly=[True],
        mismatches=[0],
        ids=[probe_id],
        ref_ids=["T16509G"],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=1,
        reference_sites_called=1,
    )
def test_mapProbesToPanel_oneRecordDoesntMap():
    """A probe of 60 ``T`` bases is counted as a call but yields no panel hits."""
    probe_id = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_id)
    record.set_sequence("T" * 60)

    actual = map_probes_to_panel(str(record), Path(TEST_PANEL))

    assert actual == dict(
        snps_called_correctly=[],
        mismatches=[],
        ids=[],
        ref_ids=[],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=0,
        reference_sites_called=0,
    )
def test_mapProbesToPanel_oneRecordSnpNotCalledNoOtherMismatches():
    """One probe crossing the site without calling the SNP: incorrect call, one mismatch."""
    probe_id = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_id)
    record.set_sequence("TTAACGCCCTCAATTTTGAGGACGTAACCTACACCGACGCCAAAGGCGAA")

    actual = map_probes_to_panel(str(record), Path(TEST_PANEL))

    assert actual == dict(
        snps_called_correctly=[False],
        mismatches=[1],
        ids=[probe_id],
        ref_ids=["T16509G"],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=1,
        reference_sites_called=1,
    )
def test_mapProbesToPanel_oneRecordSnpCalledTwoMismatches():
    """One probe calling the SNP correctly while reporting two mismatches."""
    probe_id = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_id)
    record.set_sequence("ACGTCGTGAGCAGGATATAAAAGCATTACGCCCACAAATCTATGCTCCCA")

    actual = map_probes_to_panel(str(record), Path(TEST_PANEL))

    assert actual == dict(
        snps_called_correctly=[True],
        mismatches=[2],
        ids=[probe_id],
        ref_ids=["C15154T"],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=1,
        reference_sites_called=1,
    )
def test_mapProbesToPanel_oneRecordMapsToPanelButToRightOfVariant():
    """One probe is counted as a call but crosses no reference site."""
    probe_id = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_id)
    record.set_sequence(
        "ACCTACACCGACGCCAAAGGCGAAAAACGCCCAATGTACCAAATCACCAAAAACGGCTTCGTCTTCCTGGTGATGGGATTCACT"
    )

    actual = map_probes_to_panel(str(record), Path(TEST_PANEL))

    assert actual == dict(
        snps_called_correctly=[],
        mismatches=[],
        ids=[],
        ref_ids=[],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=0,
        reference_sites_called=0,
    )
np.array([[0,0,0,0,0,0,0,2] + [0] * 35]), #q correct_pos_errs, #pos correct_pos_total, #pos correct_dinuc_errs, #dinuc correct_dinuc_total] #diunc for a,b in zip(correct_vectors, recalibrate.fastq_to_covariate_arrays( uncorr_and_corr_fastq_files)): assert np.array_equal(a,b) for a,b in zip(correct_vectors, recalibrate.fastq_to_covariate_arrays( uncorr_and_corr_with_rg, infer_rg = True)): assert np.array_equal(a,b) #this read is used below correct_read = pysam.FastxRecord( name = 'foo', sequence = 'ATG', quality = '\'\'#') #6, 6, 2 correct_read_with_rg = pysam.FastxRecord( name = 'foo/1_RG:Z:bar', sequence = 'ATG', quality = '\'\'#') def test_recalibrate_fastq(uncorr_and_corr_fastq_files, uncorr_and_corr_with_rg, capfd): recalibrate.recalibrate_fastq(uncorr_and_corr_fastq_files) captured = capfd.readouterr() assert captured.out == str(correct_read) + '\n' #now test with infer_rg = True recalibrate.recalibrate_fastq(uncorr_and_corr_with_rg, infer_rg = True) captured = capfd.readouterr()
r2_out_file = os.path.join(outdir, prefix + "_R2.fastq") r3_out_file = os.path.join(outdir, prefix + "_R3.fastq") start_time = time.time() print("Start to pre-process fastq file", time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())) with pysam.FastxFile(r1_in_file) as r1_in, pysam.FastxFile( r2_in_file) as r2_in, open(r1_out_file, "w") as r1_out, open( r2_out_file, "w") as r2_out, open(r3_out_file, "w") as r3_out: for entry in r1_in: r3_entry = next(r2_in) name = entry.name sequence = entry.sequence comment = entry.comment quality = entry.quality barcode = name.split(":")[1].split("-")[-1] barcode_entry = pysam.FastxRecord() # write r1 r1_out.write(str(entry) + "\n") # write r2 barcode_entry.name = name barcode_entry.sequence = barcode barcode_entry.comment = comment barcode_entry.quality = quality[0:len(barcode)] r2_out.write(str(barcode_entry) + "\n") # write r3 r3_out.write(str(r3_entry) + "\n") end_time = time.time() print("End", end_time - start_time)