Example #1
    def test_realigned_read_counting(self):
        for record in vcf.Reader(self.vcf_file):
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)

            input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
            sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), input_bam)

            fragments = npsva.RealignedFragments(
                self.input_fasta,
                sample.mean_insert_size,
                sample.std_insert_size,
                sample.insert_size_density().as_dict(),
                input_bam,
            )
            fragments.gather_reads(variant.region_string(flank=self.args.flank))
            self.assertEqual(fragments.size(), 254)

            ref_contig = "1_2073761_2073846_DEL"
            alt_contig = "1_2073761_2073846_DEL_alt"
            
            # Breakpoint region strings: rl/rr = left/right breakpoints on the
            # reference contig, al = left breakpoint on the alternate contig
            rl_breakpoint = f"{ref_contig}:{self.args.flank}-{self.args.flank+1}"
            al_breakpoint = f"{alt_contig}:{self.args.flank}-{self.args.flank+1}"
            ref_length = variant.ref_length
            rr_breakpoint = f"{ref_contig}:{self.args.flank + ref_length - 1}-{self.args.flank + ref_length}"

            # The fourth breakpoint (on the alt contig) is left empty for this deletion
            counts, read_names = fragments.count_realigned_reads([(rl_breakpoint, rr_breakpoint, al_breakpoint, "")])
            self.assertEqual(counts["al"], 18.0)
            self.assertEqual((counts["rl"] + counts["rr"]) / 2, 4.0)
            for bp in ("rl", "rr", "al", "rl"):
                self.assertEqual(len(read_names[bp]), counts[bp])
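For concreteness, with flank=3000 the region strings above work out as follows; a worked sketch assuming variant.ref_length is the REF-allele length (86 bp for this record: an 85 bp deletion plus the anchor base):

flank, ref_length = 3000, 86  # ref_length assumed to be the REF-allele length
rl = f"1_2073761_2073846_DEL:{flank}-{flank + 1}"                            # ...:3000-3001
rr = f"1_2073761_2073846_DEL:{flank + ref_length - 1}-{flank + ref_length}"  # ...:3085-3086
al = f"1_2073761_2073846_DEL_alt:{flank}-{flank + 1}"                        # ...:3000-3001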
Example #2
    def setUp(self):
        self.vcf_file = io.StringIO("""##fileformat=VCFv4.1
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
#CHROM POS ID REF ALT QUAL FILTER INFO
1 2073761 . CAGCAGCCGAAGCGCCTCCTTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCT C . PASS SVTYPE=DEL;END=2073846;SVLEN=-85
""")
        self.args = argparse.Namespace(flank=3000,
                                       min_anchor=11,
                                       default_ci=10,
                                       min_mapq=40,
                                       min_baseq=15,
                                       rel_coverage_flank=1000,
                                       count_straddle=True,
                                       min_clip=4,
                                       mapq_reads=False,
                                       reference=None)
        self.input_fasta = os.path.join(FILE_DIR,
                                        "1_2073761_2073846_DEL.synth.fasta")
        self.input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
        self.sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"),
                                       self.input_bam)

        patcher = patch.object(
            Variant,
            "reference_sequence",
            return_value="AGCAGCCGAAGCGCCTCCTTTCAATCCAGGGTCCACACATCCAGCAGCCGAAGCGCCCTCCTTTCAATCCAGGGTCCAGGCATCT",
        )
        self.mock_reference_sequence = patcher.start()
        self.addCleanup(patcher.stop)
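Because the fixture patches Variant.reference_sequence with a fixed return_value, any call during a test yields the same canned sequence regardless of arguments. A quick illustration (an added sketch, not part of the original tests):

    def test_mock_illustration(self):
        # The patched method ignores its arguments and returns the canned string
        seq = Variant.reference_sequence(None, None)
        self.assertTrue(seq.startswith("AGCAGCCGAAGCGCC"))
        self.mock_reference_sequence.assert_called_once()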
Example #3
    def test_pipeline_straddle_counting(self):
        for record in vcf.Reader(self.vcf_file):
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)

            input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
            sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), input_bam)

            fragments = npsva.RealignedFragments(
                self.input_fasta,
                sample.mean_insert_size,
                sample.std_insert_size,
                sample.insert_size_density().as_dict(),
                input_bam,
            )
            fragments.gather_reads(variant.region_string(flank=self.args.flank))
            self.assertEqual(fragments.size(), 254)

            left_breakpoint = variant.left_flank_region_string(left_flank=1, right_flank=1)
            right_breakpoint = variant.right_flank_region_string(left_flank=1, right_flank=1)
            pair_results = fragments.count_pipeline_straddlers(
                left_breakpoint, right_breakpoint, self.args.flank, -variant.event_length, 1.5, 10,
            )
            self.assertAlmostEqual(pair_results["alt_weighted_count"], 13.496, places=1)
            self.assertAlmostEqual(pair_results["insert_lower"], 0.0, places=2)
            self.assertAlmostEqual(pair_results["insert_upper"] / pair_results["insert_count"], 0.166, places=2)
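The fourth argument, -variant.event_length, encodes the length change of the ALT allele relative to REF (negative for a deletion). For this record it follows directly from the VCF fields; a sketch assuming event_length == END - POS:

pos, end, svlen = 2073761, 2073846, -85  # from the fixture VCF record
event_length = end - pos                 # 85 == abs(SVLEN)
assert -event_length == svlen            # ALT allele is 85 bp shorter than REF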
Example #4
    def setUp(self):
        self.tempdir = tempfile.TemporaryDirectory()
        self.args = argparse.Namespace(flank=1, tempdir=self.tempdir.name, mapq_reads=False)
        self.input_bam = "dummy.bam"
        self.sample = Sample.from_npsv(
            os.path.join(FILE_DIR, "stats.json"), self.input_bam
        )
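tempfile.TemporaryDirectory removes its directory when the object is finalized, but the fixture is more deterministic if it registers cleanup explicitly. A conventional unittest pattern (an addition, not the original code):

    def setUp(self):
        self.tempdir = tempfile.TemporaryDirectory()
        # Remove the directory when the test finishes rather than at GC time
        self.addCleanup(self.tempdir.cleanup)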
Example #5
def filter_reads_gc(args, stats_path: str, fasta_path: str, in_sam: str,
                    out_fastq: str):
    sample = Sample.from_npsv(stats_path)

    # Build an array of GC-normalized coverage values, sampled at 1% GC intervals
    library = sample.get_library(None)

    gc_covg = np.fromiter((library.gc_normalized_coverage(gc)
                           for gc in np.linspace(0.0, 1.0, 101)),
                          dtype=float)
    max_normalized_gc = min(np.max(gc_covg), args.max_gc_norm_covg)
    gc_covg /= max_normalized_gc

    npsva.filter_reads_gc(fasta_path, in_sam, out_fastq, gc_covg)
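A hypothetical invocation; the paths are illustrative, and the argparse namespace supplies max_gc_norm_covg, the only args attribute this snippet reads:

import argparse

args = argparse.Namespace(max_gc_norm_covg=2.0)  # illustrative cap on GC normalization
filter_reads_gc(args, "stats.json", "synth.fasta", "reads.sam", "filtered.fastq")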
Example #6
    def test_loads_npsv_json(self):
        json_path = os.path.join(FILE_DIR, "stats.json")
        bam_path = os.path.join(FILE_DIR, "1_1598414_1598580_DEL.bam")
        ped_path = os.path.join(FILE_DIR, "trio.ped")

        # Override the BAM path to be specific to test file
        sample_object = Sample.from_npsv(json_path, bam_path=bam_path, ped_path=ped_path)
        self.assertTrue(sample_object.has_read_group("NA12878"))
        self.assertEqual(sample_object.name, "HG002")
        self.assertAlmostEqual(sample_object.mean_coverage, 25.46, places=1)
        self.assertEqual(sample_object.gender, 1)

        # Get generic library
        library_object = sample_object.get_library("HG002")
        self.assertIsNotNone(library_object)

        self.assertEqual(library_object.read_length, 148)
        self.assertAlmostEqual(library_object.mean_insert_size, 573.060562, places=1)
        self.assertAlmostEqual(library_object.std_insert_size, 164.215239, places=1)

        self.assertEqual(
            library_object.insert_size_density[10000],
            norm.pdf(10000, loc=573.060562, scale=164.215239),
        )

        # The search distance should exceed the Normal 99th-percentile insert size
        self.assertGreater(
            sample_object.search_distance(percentile=0.99),
            norm.ppf(0.99, library_object.mean_insert_size, library_object.std_insert_size),
        )

        # Compute the normalized GC coverage
        self.assertAlmostEqual(
            library_object.gc_normalized_coverage(0.40), 1.110604, places=3
        )
        self.assertAlmostEqual(
            library_object.gc_normalized_coverage(0.4012345), 1.110604, places=3
        )

        # Find the maximum GC normalization factor
        max_gc = sample_object.max_gc_normalized_coverage(limit=2.0)
        self.assertGreaterEqual(max_gc, 1.0)
        self.assertLessEqual(max_gc, 2.0)
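The exact-equality assertion on insert_size_density above implies the model is a plain Normal density parameterized by the library's mean and standard deviation; the expected value can be re-derived with SciPy:

from scipy.stats import norm

# Re-derive the asserted density for a 10 kb insert; that is roughly 57 SDs
# above the mean, so the value is vanishingly small
density = norm.pdf(10000, loc=573.060562, scale=164.215239)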

Example #7
    def test_pipeline_clip_counting(self):
        try:
            # Create SAM file with a single read
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".sam"
            ) as sam_file:
                # fmt: off
                print("@HD", "VN:1.3", "SO:coordinate", sep="\t", file=sam_file)
                print("@SQ", "SN:1", "LN:6070", sep="\t", file=sam_file)
                print("@RG", "ID:synth1", "LB:synth1", "PL:illumina", "PU:ART", "SM:HG002", sep="\t", file=sam_file)
                print(
                    "ref-82", "99", "1", "91", "99", "148=", "=", "679", "732",
                    "GATGAGCGAGAGCCGCCAGACCCACGTGACGCTGCACGACATCGACCCTCAGGCCTTGGACCAGCTGGTGCAGTTTGCCTACACGGCTGAGATTGTGGTGGGCGAGGGCAATGTGCAGGTGAGGGCTCCCTCACCCGGATCCCGGTGT",
                    "CCCGGGGGGGGGG=CGJJGCJJJJJJJJJGJJJGCGGJJJJJJGJJGJCG8GGJGJJJGGCGCJGCCJCCGGG81GGCGGGGCCCGGCGGGGGGGC=GCCGGCGGCCGGGCCGGGC8CGGGCCC=GGCGGGGGGGGGGGCGGGGGGCG",
                    sep="\t", file=sam_file,
                )
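                # Second mate: POS 679, CIGAR 4S140=4H (4 bp soft-clipped on the
                # left, 4 bp hard-clipped on the right), so with a clip threshold
                # of 4 it counts as clipped on both sides at the 1:700-701 breakpoint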
                print(
                    "ref-82", "147", "1", "679", "99", "4S140=4H", "=", "91", "-732",
                    "CCTGACTCTGCTCGGCCCCTCCCAGTATGAACACTCAGCCCCCACCTGCTAACCCTCCCTCCTAGGCATCTTCAGGGCTCCCTGGGTCCACAGGACCCTCCCCAGATCTCAGGTCTGAGGACCCCCACTCCCAGGTTCTGGAAC",
                    "CCGGCGGGGCCCGCC=GGGG8CG8GGGC8CCGGGGGGGGGGGGGCCC(JGG1CGGGGCGGCCGC8GGGCGGGGGCCGGGJCGGG(CJ=JGJJGJJGGJJGGJJGGGJGJJGJJGJJGJGCCCGJGJJGJJJJJGJGGCG1GGGG",
                    sep="\t", file=sam_file,
                )
                # fmt: on

            sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), sam_file.name)
            fragments = npsva.RealignedFragments(
                self.input_fasta,
                sample.mean_insert_size,
                sample.std_insert_size,
                sample.insert_size_density().as_dict(),
                sam_file.name,
            )
            fragments.gather_reads("1:1-1000")
            self.assertEqual(fragments.size(), 1)
            
            clip_results = fragments.count_pipeline_clipped_reads("1:100-101", 4)
            self.assertDictEqual(clip_results, {"left": 0, "right": 0, "both": 0, "total": 1})

            clip_results = fragments.count_pipeline_clipped_reads("1:700-701", 4)
            self.assertDictEqual(clip_results, {"left": 0, "right": 0, "both": 1, "total": 1})

        finally:
            os.remove(sam_file.name)
Example #8
def main():
    parser = make_argument_parser()
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)

    # Create any directories that are needed
    logging.info(
        f"Creating {args.output} output and {args.tempdir} temporary directories if they don't exist"
    )
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.tempdir, exist_ok=True)

    # Initialize parallel computing setup
    ray.init(num_cpus=args.threads,
             _temp_dir=args.tempdir,
             include_dashboard=False)

    # TODO: If library stats are not specified, compute them (e.g., mean insert size, etc.)
    if args.stats_path is not None:
        logging.info("Extracting BAM stats from NPSV stats file")
        sample = Sample.from_npsv(args.stats_path,
                                  bam_path=args.bam,
                                  ped_path=args.ped_path)
    elif None not in (
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            args.depth,
    ):
        logging.info("Using Normal distribution for BAM stats")
        sample = Sample.from_distribution(
            args.bam,
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            mean_coverage=args.depth,
        )
    else:
        # parser.error() raises SystemExit itself, so no explicit raise is needed
        parser.error(
            "Library information needed. Either provide distribution parameters or run `npsvg preprocess` to generate a stats file."
        )

    # Select directory for variant files
    if args.keep_synth_bams:
        variant_dir = args.output
    else:
        variant_dir = args.tempdir

    # For each variant generate synthetic bam file(s) and extract relevant evidence
    observed_variants = {}
    record_results = []
    vcf_reader = vcf.Reader(filename=args.input)
    for i, record in enumerate(tqdm(vcf_reader, desc="Preparing variants")):
        variant = Variant.from_pyvcf(record, args.reference)
        # Variant.from_pyvcf returns None for unsupported record types; npsv currently only supports deletions
        if variant is None:
            continue

        # NPSV currently does not support variants with duplicate start and end coordinates
        description = variant_descriptor(record)
        if observed_variants.setdefault(description, i) != i:
            logging.warning("Skipping variant with duplicate description %s",
                            description)
            continue

        # Construct single variant VCF outside of worker so we don't need to pass the reader into the thread
        variant_vcf_path = os.path.join(variant_dir, description + ".vcf")
        if not args.reuse or not os.path.exists(variant_vcf_path + ".gz"):
            variant_vcf_path = write_record_to_indexed_vcf(
                record, vcf_reader, variant_vcf_path)
        else:
            # Variant file already exists, no need to recreate
            variant_vcf_path += ".gz"

        record_results.append(
            simulate_and_extract.remote(args, sample, variant,
                                        variant_vcf_path, description))

    # Concatenate output files to create feature files
    sim_tsv_path = os.path.join(args.output, args.prefix + ".sim.tsv")
    real_tsv_path = os.path.join(args.output, args.prefix + ".real.tsv")
    logging.info("Extracting features (to %s and %s)", sim_tsv_path,
                 real_tsv_path)

    with open(sim_tsv_path, "w") as file:
        Features.header(out_file=file, ac=True)
    with open(real_tsv_path, "w") as file:
        Features.header(out_file=file, ac=False)

    with open(sim_tsv_path, "ab") as sim_sink, open(real_tsv_path,
                                                    "ab") as real_sink:
        for sim_result, real_result in tqdm(
                ray_iterator(record_results),
                total=len(record_results),
                desc="Extracting features",
        ):
            with open(sim_result, "rb") as source:
                shutil.copyfileobj(source, sim_sink)
            sim_sink.flush()
            with open(real_result, "rb") as source:
                shutil.copyfileobj(source, real_sink)
            real_sink.flush()

    # Perform genotyping
    with open(os.path.join(args.output, args.prefix + ".npsv.vcf"),
              "w") as gt_vcf_file:
        logging.info("Determining genotypes (output in %s)", gt_vcf_file.name)
        genotyping_args = argparse.Namespace(**vars(args))
        genotype_vcf(
            genotyping_args,
            args.input,
            sim_tsv_path,
            real_tsv_path,
            gt_vcf_file,
            samples=[sample.name],
        )
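The feature-extraction loop above unpacks each completed task into (sim_result, real_result) file paths. ray_iterator is presumably a small helper that yields results in completion order rather than submission order; a typical sketch of that pattern (an assumption, not necessarily the project's actual helper):

def ray_iterator(object_refs):
    # Yield each remote result as soon as it finishes, in completion order
    remaining = list(object_refs)
    while remaining:
        done, remaining = ray.wait(remaining, num_returns=1)
        yield ray.get(done[0])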
Example #9
    def test_handles_missing_gender(self):
        json_path = os.path.join(FILE_DIR, "stats.json")
        bam_path = os.path.join(FILE_DIR, "1_1598414_1598580_DEL.bam")
        sample_object = Sample.from_npsv(json_path, bam_path=bam_path)
        self.assertEqual(sample_object.gender, 0)