def test_realigned_read_counting(self):
    for record in vcf.Reader(self.vcf_file):
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)

        input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
        sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), input_bam)

        fragments = npsva.RealignedFragments(
            self.input_fasta,
            sample.mean_insert_size,
            sample.std_insert_size,
            sample.insert_size_density().as_dict(),
            input_bam,
        )
        fragments.gather_reads(variant.region_string(flank=self.args.flank))
        self.assertEqual(fragments.size(), 254)

        ref_contig = "1_2073761_2073846_DEL"
        alt_contig = "1_2073761_2073846_DEL_alt"
        rl_breakpoint = f"{ref_contig}:{self.args.flank}-{self.args.flank+1}"
        al_breakpoint = f"{alt_contig}:{self.args.flank}-{self.args.flank+1}"

        ref_length = variant.ref_length
        rr_breakpoint = f"{ref_contig}:{self.args.flank + ref_length - 1}-{self.args.flank + ref_length}"

        counts, read_names = fragments.count_realigned_reads(
            [(rl_breakpoint, rr_breakpoint, al_breakpoint, "")]
        )

        self.assertEqual(counts["al"], 18.0)
        self.assertEqual((counts["rl"] + counts["rr"]) / 2, 4.0)

        # Every counted read should also be recorded by name for each breakpoint
        for bp in ("rl", "rr", "al", "ar"):
            self.assertEqual(len(read_names[bp]), counts[bp])
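# Breakpoint arithmetic for the test above, worked out from the record in
# setUp: with flank=3000 and an 86 bp REF allele (SVLEN=-85 plus the padding
# base, i.e. 2073846 - 2073761 + 1), the realignment breakpoints on the
# synthetic contigs are
#   rl = "1_2073761_2073846_DEL:3000-3001"
#   rr = "1_2073761_2073846_DEL:3085-3086"
#   al = "1_2073761_2073846_DEL_alt:3000-3001"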
def setUp(self):
    self.vcf_file = io.StringIO(
        """##fileformat=VCFv4.1
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
1	2073761	.	CAGCAGCCGAAGCGCCTCCTTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCT	C	.	PASS	SVTYPE=DEL;END=2073846;SVLEN=-85
"""
    )
    self.args = argparse.Namespace(
        flank=3000,
        min_anchor=11,
        default_ci=10,
        min_mapq=40,
        min_baseq=15,
        rel_coverage_flank=1000,
        count_straddle=True,
        min_clip=4,
        mapq_reads=False,
        reference=None,
    )
    self.input_fasta = os.path.join(FILE_DIR, "1_2073761_2073846_DEL.synth.fasta")
    self.input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
    self.sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), self.input_bam)

    patcher = patch.object(
        Variant,
        "reference_sequence",
        return_value="AGCAGCCGAAGCGCCTCCTTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCT",
    )
    self.mock_reference_sequence = patcher.start()
    self.addCleanup(patcher.stop)
def test_pipeline_straddle_counting(self):
    for record in vcf.Reader(self.vcf_file):
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)

        input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
        sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), input_bam)

        fragments = npsva.RealignedFragments(
            self.input_fasta,
            sample.mean_insert_size,
            sample.std_insert_size,
            sample.insert_size_density().as_dict(),
            input_bam,
        )
        fragments.gather_reads(variant.region_string(flank=self.args.flank))
        self.assertEqual(fragments.size(), 254)

        left_breakpoint = variant.left_flank_region_string(left_flank=1, right_flank=1)
        right_breakpoint = variant.right_flank_region_string(left_flank=1, right_flank=1)
        pair_results = fragments.count_pipeline_straddlers(
            left_breakpoint,
            right_breakpoint,
            self.args.flank,
            -variant.event_length,
            1.5,
            10,
        )
        self.assertAlmostEqual(pair_results["alt_weighted_count"], 13.496, places=1)
        self.assertAlmostEqual(pair_results["insert_lower"], 0.0, places=2)
        self.assertAlmostEqual(
            pair_results["insert_upper"] / pair_results["insert_count"], 0.166, places=2
        )
def setUp(self):
    self.tempdir = tempfile.TemporaryDirectory()
    self.args = argparse.Namespace(
        flank=1, tempdir=self.tempdir.name, mapq_reads=False
    )
    self.input_bam = "dummy.bam"
    self.sample = Sample.from_npsv(
        os.path.join(FILE_DIR, "stats.json"), self.input_bam
    )
def filter_reads_gc(args, stats_path: str, fasta_path: str, in_sam: str, out_fastq: str):
    sample = Sample.from_npsv(stats_path)

    # Construct lookup table of GC-normalized coverage for GC fractions 0.00-1.00
    library = sample.get_library(None)
    gc_covg = np.fromiter(
        (library.gc_normalized_coverage(gc) for gc in np.linspace(0.0, 1.0, 101)),
        dtype=float,
    )

    # Rescale so the (capped) maximum normalized coverage maps to 1.0
    max_normalized_gc = min(np.max(gc_covg), args.max_gc_norm_covg)
    gc_covg /= max_normalized_gc

    npsva.filter_reads_gc(fasta_path, in_sam, out_fastq, gc_covg)
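# Illustrative sketch (not part of the pipeline): after rescaling, `gc_covg`
# acts as a per-read keep-probability table indexed by GC fraction. The
# `keep_probability` helper below is hypothetical, added only for exposition;
# the actual downsampling happens inside npsva.filter_reads_gc.
def keep_probability(gc_covg: "np.ndarray", gc_fraction: float) -> float:
    """Probability of retaining a read whose sequence has the given GC fraction."""
    # The table has 101 bins covering GC fractions 0.00, 0.01, ..., 1.00
    return float(gc_covg[round(gc_fraction * 100)])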
def test_loads_npsv_json(self):
    json_path = os.path.join(FILE_DIR, "stats.json")
    bam_path = os.path.join(FILE_DIR, "1_1598414_1598580_DEL.bam")
    ped_path = os.path.join(FILE_DIR, "trio.ped")

    # Override the BAM path to be specific to this test file
    sample_object = Sample.from_npsv(json_path, bam_path=bam_path, ped_path=ped_path)
    self.assertTrue(sample_object.has_read_group("NA12878"))
    self.assertEqual(sample_object.name, "HG002")
    self.assertAlmostEqual(sample_object.mean_coverage, 25.46, places=1)
    self.assertEqual(sample_object.gender, 1)

    # Get generic library
    library_object = sample_object.get_library("HG002")
    self.assertIsNotNone(library_object)
    self.assertEqual(library_object.read_length, 148)
    self.assertAlmostEqual(library_object.mean_insert_size, 573.060562, places=1)
    self.assertAlmostEqual(library_object.std_insert_size, 164.215239, places=1)
    self.assertEqual(
        library_object.insert_size_density[10000],
        norm.pdf(10000, loc=573.060562, scale=164.215239),
    )

    # Compute search distance
    self.assertGreater(
        sample_object.search_distance(percentile=0.99),
        norm.ppf(0.99, library_object.mean_insert_size, library_object.std_insert_size),
    )

    # Compute the normalized GC coverage
    self.assertAlmostEqual(
        library_object.gc_normalized_coverage(0.40), 1.110604, places=3
    )
    self.assertAlmostEqual(
        library_object.gc_normalized_coverage(0.4012345), 1.110604, places=3
    )

    # Find the maximum GC normalization factor
    max_gc = sample_object.max_gc_normalized_coverage(limit=2.0)
    self.assertGreaterEqual(max_gc, 1.0)
    self.assertLessEqual(max_gc, 2.0)
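# For reference, with the library statistics asserted above, the Normal-based
# lower bound on the search distance works out to roughly
#   norm.ppf(0.99, 573.06, 164.22) = 573.06 + 2.326 * 164.22 ≈ 955 bp,
# which sample_object.search_distance(percentile=0.99) must exceed.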
def test_pipeline_clip_counting(self):
    try:
        # Create SAM file with a single read pair
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".sam"
        ) as sam_file:
            # fmt: off
            print("@HD", "VN:1.3", "SO:coordinate", sep="\t", file=sam_file)
            print("@SQ", "SN:1", "LN:6070", sep="\t", file=sam_file)
            print("@RG", "ID:synth1", "LB:synth1", "PL:illumina", "PU:ART", "SM:HG002", sep="\t", file=sam_file)
            print(
                "ref-82", "99", "1", "91", "99", "148=", "=", "679", "732",
                "GATGAGCGAGAGCCGCCAGACCCACGTGACGCTGCACGACATCGACCCTCAGGCCTTGGACCAGCTGGTGCAGTTTGCCTACACGGCTGAGATTGTGGTGGGCGAGGGCAATGTGCAGGTGAGGGCTCCCTCACCCGGATCCCGGTGT",
                "CCCGGGGGGGGGG=CGJJGCJJJJJJJJJGJJJGCGGJJJJJJGJJGJCG8GGJGJJJGGCGCJGCCJCCGGG81GGCGGGGCCCGGCGGGGGGGC=GCCGGCGGCCGGGCCGGGC8CGGGCCC=GGCGGGGGGGGGGGCGGGGGGCG",
                sep="\t",
                file=sam_file,
            )
            print(
                "ref-82", "147", "1", "679", "99", "4S140=4H", "=", "91", "-732",
                "CCTGACTCTGCTCGGCCCCTCCCAGTATGAACACTCAGCCCCCACCTGCTAACCCTCCCTCCTAGGCATCTTCAGGGCTCCCTGGGTCCACAGGACCCTCCCCAGATCTCAGGTCTGAGGACCCCCACTCCCAGGTTCTGGAAC",
                "CCGGCGGGGCCCGCC=GGGG8CG8GGGC8CCGGGGGGGGGGGGGCCC(JGG1CGGGGCGGCCGC8GGGCGGGGGCCGGGJCGGG(CJ=JGJJGJJGGJJGGJJGGGJGJJGJJGJJGJGCCCGJGJJGJJJJJGJGGCG1GGGG",
                sep="\t",
                file=sam_file,
            )
            # fmt: on

        sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), sam_file.name)
        fragments = npsva.RealignedFragments(
            self.input_fasta,
            sample.mean_insert_size,
            sample.std_insert_size,
            sample.insert_size_density().as_dict(),
            sam_file.name,
        )
        fragments.gather_reads("1:1-1000")
        self.assertEqual(fragments.size(), 1)

        # The first mate (CIGAR 148=) is unclipped, so no clipped reads overlap 1:100-101
        clip_results = fragments.count_pipeline_clipped_reads("1:100-101", 4)
        self.assertDictEqual(clip_results, {"left": 0, "right": 0, "both": 0, "total": 1})

        # The second mate (CIGAR 4S140=4H) is clipped at least 4 bases on both ends
        clip_results = fragments.count_pipeline_clipped_reads("1:700-701", 4)
        self.assertDictEqual(clip_results, {"left": 0, "right": 0, "both": 1, "total": 1})
    finally:
        os.remove(sam_file.name)
def main():
    parser = make_argument_parser()
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)

    # Create any directories that are needed
    logging.info(
        f"Creating {args.output} output and {args.tempdir} temporary directories if they don't exist"
    )
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.tempdir, exist_ok=True)

    # Initialize parallel computing setup
    ray.init(num_cpus=args.threads, _temp_dir=args.tempdir, include_dashboard=False)

    # TODO: If library is not specified, compute statistics (e.g., mean insert size) directly
    if args.stats_path is not None:
        logging.info("Extracting BAM stats from NPSV stats file")
        sample = Sample.from_npsv(args.stats_path, bam_path=args.bam, ped_path=args.ped_path)
    elif None not in (
        args.fragment_mean,
        args.fragment_sd,
        args.read_length,
        args.depth,
    ):
        logging.info("Using Normal distribution for BAM stats")
        sample = Sample.from_distribution(
            args.bam,
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            mean_coverage=args.depth,
        )
    else:
        raise parser.error(
            "Library information needed. Either provide distribution parameters or run `npsvg preprocess` to generate stats file."
        )

    # Select directory for variant files
    if args.keep_synth_bams:
        variant_dir = args.output
    else:
        variant_dir = args.tempdir

    # For each variant, generate synthetic BAM file(s) and extract relevant evidence
    observed_variants = {}
    record_results = []
    vcf_reader = vcf.Reader(filename=args.input)
    for i, record in enumerate(tqdm(vcf_reader, desc="Preparing variants")):
        variant = Variant.from_pyvcf(record, args.reference)
        # npsv currently only supports deletions
        if variant is None:
            continue

        # NPSV currently does not support variants with duplicate start and end coordinates
        description = variant_descriptor(record)
        if observed_variants.setdefault(description, i) != i:
            logging.warning("Skipping variant with duplicate description %s", description)
            continue

        # Construct single-variant VCF outside of the worker so we don't need to pass the reader into the thread
        variant_vcf_path = os.path.join(variant_dir, description + ".vcf")
        if not args.reuse or not os.path.exists(variant_vcf_path + ".gz"):
            variant_vcf_path = write_record_to_indexed_vcf(record, vcf_reader, variant_vcf_path)
        else:
            # Variant file already exists, no need to recreate it
            variant_vcf_path += ".gz"

        record_results.append(
            simulate_and_extract.remote(args, sample, variant, variant_vcf_path, description)
        )

    # Concatenate the per-variant output files to create the feature files
    sim_tsv_path = os.path.join(args.output, args.prefix + ".sim.tsv")
    real_tsv_path = os.path.join(args.output, args.prefix + ".real.tsv")
    logging.info("Extracting features (to %s and %s)", sim_tsv_path, real_tsv_path)

    with open(sim_tsv_path, "w") as file:
        Features.header(out_file=file, ac=True)
    with open(real_tsv_path, "w") as file:
        Features.header(out_file=file, ac=False)

    with open(sim_tsv_path, "ab") as sim_sink, open(real_tsv_path, "ab") as real_sink:
        for sim_result, real_result in tqdm(
            ray_iterator(record_results),
            total=len(record_results),
            desc="Extracting features",
        ):
            with open(sim_result, "rb") as source:
                shutil.copyfileobj(source, sim_sink)
            sim_sink.flush()
            with open(real_result, "rb") as source:
                shutil.copyfileobj(source, real_sink)
            real_sink.flush()

    # Perform genotyping
    with open(os.path.join(args.output, args.prefix + ".npsv.vcf"), "w") as gt_vcf_file:
        logging.info("Determining genotypes (output in %s)", gt_vcf_file.name)
        genotyping_args = argparse.Namespace(**vars(args))
        genotype_vcf(
            genotyping_args,
            args.input,
            sim_tsv_path,
            real_tsv_path,
            gt_vcf_file,
            samples=[sample.name],
        )
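# Conventional script entry point, assuming this module is also meant to be run
# directly (npsv may instead expose `main` via a console_scripts entry point,
# in which case this guard is simply redundant):
if __name__ == "__main__":
    main()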
def test_handles_missing_gender(self):
    json_path = os.path.join(FILE_DIR, "stats.json")
    bam_path = os.path.join(FILE_DIR, "1_1598414_1598580_DEL.bam")
    sample_object = Sample.from_npsv(json_path, bam_path=bam_path)
    self.assertEqual(sample_object.gender, 0)