    def test_get_region(self):
        """test get_region"""
        infile = os.path.join(data_dir, "all_reads.bam")
        tmp_out = "tmp.bam_read_extract.get_region.bam"

        expected_bam = os.path.join(data_dir, "region.1.60-181.bam")
        bam_read_extract.get_region(infile, "1", 59, 180, tmp_out)
        self.assertTrue(read_names_match(expected_bam, tmp_out))
        os.unlink(tmp_out)

        expected_bam = os.path.join(data_dir, "region.1.61-180.bam")
        bam_read_extract.get_region(infile, "1", 60, 179, tmp_out)
        self.assertTrue(read_names_match(expected_bam, tmp_out))
        os.unlink(tmp_out)
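
The bam_read_extract helpers exercised by this test are not shown on this page. Below is a minimal sketch of get_region and of the read_names_match assertion helper, assuming pysam and the 0-based, end-inclusive coordinates implied by the expected-file names (start=59, end=180 matching region 1:60-181); it is an illustration, not the project's implementation.

import pysam


def get_region(infile, chrom, start, end, outfile):
    """Write the reads overlapping chrom:start-end (0-based, end inclusive) to outfile."""
    with pysam.AlignmentFile(infile, "rb") as bam_in:
        with pysam.AlignmentFile(outfile, "wb", template=bam_in) as bam_out:
            # pysam's fetch() uses an exclusive end, hence end + 1
            # (assumption based on the expected-file names in the test above)
            for read in bam_in.fetch(chrom, start, end + 1):
                bam_out.write(read)


def read_names_match(bam1, bam2):
    """Return True if the two BAM files contain the same multiset of read names."""
    def names(filename):
        with pysam.AlignmentFile(filename, "rb", check_sq=False) as bam:
            return sorted(read.query_name for read in bam.fetch(until_eof=True))
    return names(bam1) == names(bam2)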
Example #3
    def _run_quasimap_one_split(self, split_file, unmapped_reads_file=None):
        logging.info(f"Start quasimap on split file {split_file.filename}")
        split_reads_file = os.path.join(
            self.split_output_dir,
            f"split.{split_file.file_number}.reads.bam",
        )
        bam_read_extract.get_region(
            self.reads_files[0],
            split_file.chrom,
            split_file.chrom_start,
            split_file.chrom_end,
            split_reads_file,
        )

        quasimap_dir = os.path.join(
            self.split_output_dir,
            f"split.{split_file.file_number}.gramtools.quasimap",
        )
        if self.use_unmapped_reads:
            reads_files = [unmapped_reads_file, split_reads_file]
        else:
            reads_files = [split_reads_file]

        build_report, quasimap_report = gramtools.run_gramtools(
            split_file.gramtools_build_dir,
            quasimap_dir,
            split_file.filename,
            self.ref_fasta,
            reads_files,
            kmer_size=self.gramtools_kmer_size,
        )
        read_cov = self._get_read_coverage_one_split(split_file, quasimap_dir)

        if self.clean:
            os.unlink(split_reads_file)

        logging.info(f"Finish quasimap on split file {split_file.filename}")
        return read_cov, build_report, quasimap_report
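
For context, here is a hypothetical driver showing how _run_quasimap_one_split could be called for every split file produced by the VCF chunker, mirroring the loop structure of the examples below; the method name _run_quasimap_all_splits and the way results are aggregated are assumptions, not part of the source.

    def _run_quasimap_all_splits(self, chunker, unmapped_reads_file=None):
        # Hypothetical helper: quasimap every split file and collect the
        # per-split read coverage plus the gramtools build/quasimap reports.
        read_covs = []
        reports = []
        for split_file_list in chunker.vcf_split_files.values():
            for split_file in split_file_list:
                read_cov, build_report, quasimap_report = self._run_quasimap_one_split(
                    split_file, unmapped_reads_file=unmapped_reads_file
                )
                read_covs.append(read_cov)
                reports.append((build_report, quasimap_report))
        return read_covs, reports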
Example #4
    def _run_gramtools_with_split_vcf(self):
        logging.info('Splitting VCF files into chunks (if not already done)')
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info('VCF file split into ' + str(chunker.total_split_files) +
                     ' chunks')
        try:
            os.mkdir(self.split_output_dir)
        except OSError as error:
            raise Exception('Error making output split directory ' +
                            self.split_output_dir) from error

        unmapped_reads_file = os.path.join(self.split_output_dir,
                                           'unmapped_reads.bam')
        bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                            unmapped_reads_file)
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        mean_depths = []
        depth_variances = []

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    '===== Start analysing variants in VCF split file ' +
                    split_file.filename + ' =====')
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.reads.bam')
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir, 'split.' +
                    str(split_file.file_number) + '.gramtools.quasimap')

                build_report, quasimap_report = gramtools.run_gramtools(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    self.ref_fasta,
                    [unmapped_reads_file, split_reads_file],
                    self.max_read_length,
                    kmer_size=self.gramtools_kmer_size,
                )

                logging.info('Loading split gramtools quasimap output files ' +
                             gramtools_quasimap_dir)
                perl_generated_vcf = os.path.join(
                    split_file.gramtools_build_dir, 'perl_generated_vcf')
                (mean_depth, depth_variance, vcf_header, vcf_records,
                 allele_coverage, allele_groups) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                     perl_generated_vcf, gramtools_quasimap_dir)
                mean_depths.append(mean_depth)
                depth_variances.append(depth_variance)
                logging.info('Finished loading gramtools files')
                if self.sample_name is None:
                    sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                        vcf_header)
                else:
                    sample_name = self.sample_name
                assert sample_name is not None
                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.out.vcf')
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) +
                    '.out.debug.calls_with_zero_cov_alleles.vcf')
                logging.info('Writing VCF output file ' + split_vcf_out +
                             ' for split VCF file ' + split_file.filename)
                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    mean_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    self.gramtools_kmer_size,
                    sample_name=sample_name,
                    max_read_length=self.max_read_length,
                    filtered_outfile=split_vcf_out,
                )
                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                if self.clean:
                    logging.info(
                        'Cleaning gramtools files from split VCF file ' +
                        split_file.filename)
                    if not self.user_supplied_gramtools_build_dir:
                        os.rename(
                            os.path.join(split_file.gramtools_build_dir,
                                         'build_report.json'),
                            split_file.gramtools_build_dir + '.report.json')
                        shutil.rmtree(split_file.gramtools_build_dir)
                        os.unlink(split_file.filename)

                    os.rename(
                        os.path.join(gramtools_quasimap_dir, 'report.json'),
                        gramtools_quasimap_dir + '.report.json')
                    shutil.rmtree(gramtools_quasimap_dir)
                    os.unlink(split_reads_file)

                logging.info(
                    '===== Finish analysing variants in VCF split file ' +
                    split_file.filename + ' =====')

        logging.info('Merging VCF files into one output file ' +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        mean_depth = statistics.mean(mean_depths)
        depth_variance = statistics.mean(depth_variances)
        logging.info(
            f'Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, '
            f'using mean depth {mean_depth}, depth variance {depth_variance}, '
            f'error rate {self.read_error_rate}, and '
            f'{self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            logging.info('Deleting temp split VCF files')
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            os.unlink(unmapped_reads_file)
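
Example #4 above and Example #5 below also call bam_read_extract.get_unmapped_reads. A minimal sketch of what such a function could look like with pysam follows; it is an illustration, not the project's implementation.

import pysam


def get_unmapped_reads(infile, outfile):
    """Copy every unmapped read from infile into outfile."""
    with pysam.AlignmentFile(infile, "rb") as bam_in:
        with pysam.AlignmentFile(outfile, "wb", template=bam_in) as bam_out:
            # until_eof=True streams all records, including unplaced reads,
            # without requiring a BAM index
            for read in bam_in.fetch(until_eof=True):
                if read.is_unmapped:
                    bam_out.write(read)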
Example #5
    def _run_gramtools_with_split_vcf(self):
        logging.info("Splitting VCF files into chunks (if not already done)")
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info("VCF file split into " + str(chunker.total_split_files) +
                     " chunks")
        try:
            os.mkdir(self.split_output_dir)
        except OSError as error:
            raise Exception("Error making output split directory " +
                            self.split_output_dir) from error

        if self.use_unmapped_reads:
            unmapped_reads_file = os.path.join(self.split_output_dir,
                                               "unmapped_reads.bam")
            bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                                unmapped_reads_file)

        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    "===== Start analysing variants in VCF split file " +
                    split_file.filename + " =====")
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) + ".reads.bam",
                )
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) +
                    ".gramtools.quasimap",
                )
                if self.use_unmapped_reads:
                    reads_files = [unmapped_reads_file, split_reads_file]
                else:
                    reads_files = [split_reads_file]

                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) + ".out.vcf",
                )
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) +
                    ".out.debug.calls_with_zero_cov_alleles.vcf",
                )

                self.run_adjudicate(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    reads_files,
                    split_vcf_out,
                    unfiltered_vcf_out,
                )

                if self.clean:
                    os.unlink(split_reads_file)
                    if not self.user_supplied_gramtools_build_dir:
                        os.unlink(split_file.filename)

                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                logging.info(
                    "===== Finish analysing variants in VCF split file " +
                    split_file.filename + " =====")

        logging.info("Merging VCF files into one output file " +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        self.run_gt_conf()

        if self.clean:
            logging.info("Deleting temp split VCF files")
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            if self.use_unmapped_reads:
                os.unlink(unmapped_reads_file)
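
Example #5 factors the inline GT_CONF step of Example #4 into self.run_gt_conf(). Below is a sketch of what that helper might contain, based on the inline code in Example #4; the self.mean_depths and self.depth_variances attributes are assumptions about where the per-split depth statistics would be stored.

    def run_gt_conf(self):
        # Sketch only: aggregate the per-split depth statistics, then annotate
        # the final VCF with GT_CONF_PERCENTILE, mirroring Example #4.
        mean_depth = statistics.mean(self.mean_depths)
        depth_variance = statistics.mean(self.depth_variances)
        logging.info(
            f"Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, "
            f"using mean depth {mean_depth}, depth variance {depth_variance}, "
            f"error rate {self.read_error_rate}, and "
            f"{self.genotype_simulation_iterations} simulation iterations"
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf,
            mean_depth,
            depth_variance,
            self.read_error_rate,
            self.genotype_simulation_iterations,
        )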