def test_get_region(self):
    """Compare get_region output with the expected BAM for two windows."""
    reads_bam = os.path.join(data_dir, "all_reads.bam")
    extracted_bam = "tmp.bam_read_extract.get_region.bam"

    # Each case: (expected BAM file name, start, end) where start/end are
    # the values handed to get_region.
    cases = (
        ("region.1.60-181.bam", 59, 180),
        ("region.1.61-180.bam", 60, 179),
    )

    for expected_name, start, end in cases:
        expected_path = os.path.join(data_dir, expected_name)
        bam_read_extract.get_region(reads_bam, "1", start, end, extracted_bam)
        names_agree = read_names_match(expected_path, extracted_bam)
        self.assertTrue(names_agree)
        # Remove the output so the next case starts without a leftover file.
        os.unlink(extracted_bam)
def test_get_region(self):
    '''Compare get_region output with the expected BAM for two windows.'''
    reads_bam = os.path.join(data_dir, 'all_reads.bam')
    extracted_bam = 'tmp.bam_read_extract.get_region.bam'

    # Each case: (expected BAM file name, start, end) where start/end are
    # the values handed to get_region.
    cases = (
        ('region.1.60-181.bam', 59, 180),
        ('region.1.61-180.bam', 60, 179),
    )

    for expected_name, start, end in cases:
        expected_path = os.path.join(data_dir, expected_name)
        bam_read_extract.get_region(reads_bam, '1', start, end, extracted_bam)
        names_agree = read_names_match(expected_path, extracted_bam)
        self.assertTrue(names_agree)
        # Remove the output so the next case starts without a leftover file.
        os.unlink(extracted_bam)
def _run_quasimap_one_split(self, split_file, unmapped_reads_file=None):
    """Run gramtools on the reads of one VCF split and collect its coverage.

    Reads for the split's region are extracted from ``self.reads_files[0]``
    into a per-split BAM inside ``self.split_output_dir``, gramtools is run
    on them, and the read coverage is obtained from
    ``self._get_read_coverage_one_split``.

    Args:
        split_file: Object describing the split; this method reads its
            ``filename``, ``file_number``, ``chrom``, ``chrom_start``,
            ``chrom_end`` and ``gramtools_build_dir`` attributes.
        unmapped_reads_file: Placed first in the list of reads files given
            to gramtools when ``self.use_unmapped_reads`` is true; ignored
            otherwise.

    Returns:
        Tuple ``(read_cov, build_report, quasimap_report)``, where the two
        reports are the values returned by ``gramtools.run_gramtools``.
    """
    logging.info(f"Start quasimap on split file {split_file.filename}")

    # Both per-split outputs share the same "split.<number>" path prefix.
    out_prefix = os.path.join(
        self.split_output_dir, f"split.{split_file.file_number}"
    )
    split_reads_file = out_prefix + ".reads.bam"
    quasimap_dir = out_prefix + ".gramtools.quasimap"

    bam_read_extract.get_region(
        self.reads_files[0],
        split_file.chrom,
        split_file.chrom_start,
        split_file.chrom_end,
        split_reads_file,
    )

    reads_files = [split_reads_file]
    if self.use_unmapped_reads:
        # The unmapped reads go ahead of the region reads.
        reads_files.insert(0, unmapped_reads_file)

    reports = gramtools.run_gramtools(
        split_file.gramtools_build_dir,
        quasimap_dir,
        split_file.filename,
        self.ref_fasta,
        reads_files,
        kmer_size=self.gramtools_kmer_size,
    )
    build_report, quasimap_report = reports

    read_cov = self._get_read_coverage_one_split(split_file, quasimap_dir)

    if self.clean:
        os.unlink(split_reads_file)

    logging.info(f"Finish quasimap on split file {split_file.filename}")
    return read_cov, build_report, quasimap_report
def _run_gramtools_with_split_vcf(self):
    """Run gramtools separately on each VCF split and merge the results.

    Steps performed:
      1. Split ``self.clustered_vcf`` with ``vcf_chunker.VcfChunker`` and
         adopt the chunker's ``gramtools_kmer_size``.
      2. Create ``self.split_output_dir`` and extract the unmapped reads of
         ``self.reads_files[0]`` into it.
      3. For every split: extract the reads of the split's region, run
         gramtools, load its output, and write a filtered and an unfiltered
         VCF. When ``self.clean`` is set, per-split intermediate files are
         removed (the JSON reports are renamed out of their directories
         before those directories are deleted).
      4. Merge the per-split VCFs into ``self.final_vcf`` and
         ``self.unfiltered_vcf_file``, then add the GT_CONF percentile
         using the mean of the per-split mean depths and depth variances.
      5. When ``self.clean`` is set, delete the per-split VCFs and the
         unmapped reads file.

    Raises:
        Error: If ``self.split_output_dir`` cannot be created.
    """
    logging.info('Splitting VCF files into chunks (if not already done)')
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    # Use the chunker's kmer size for every later gramtools call.
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info('VCF file split into ' + str(chunker.total_split_files) + ' chunks')

    try:
        os.mkdir(self.split_output_dir)
    except OSError as mkdir_error:
        # Only filesystem failures are translated; a bare except would also
        # turn KeyboardInterrupt/SystemExit into a misleading Error.
        raise Error(
            'Error making output split directory ' + self.split_output_dir
        ) from mkdir_error

    unmapped_reads_file = os.path.join(self.split_output_dir, 'unmapped_reads.bam')
    bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)

    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    mean_depths = []
    depth_variances = []

    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []

        for split_file in split_file_list:
            logging.info(
                '===== Start analysing variants in VCF split file '
                + split_file.filename + ' =====')

            # Every per-split output path shares the "split.<number>" prefix.
            split_prefix = os.path.join(
                self.split_output_dir, 'split.' + str(split_file.file_number))
            split_reads_file = split_prefix + '.reads.bam'
            gramtools_quasimap_dir = split_prefix + '.gramtools.quasimap'
            split_vcf_out = split_prefix + '.out.vcf'
            unfiltered_vcf_out = (
                split_prefix + '.out.debug.calls_with_zero_cov_alleles.vcf')

            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            build_report, quasimap_report = gramtools.run_gramtools(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                self.ref_fasta,
                [unmapped_reads_file, split_reads_file],
                self.max_read_length,
                kmer_size=self.gramtools_kmer_size,
            )

            logging.info('Loading split gramtools quasimap output files ' + gramtools_quasimap_dir)
            perl_generated_vcf = os.path.join(
                split_file.gramtools_build_dir, 'perl_generated_vcf')
            (
                mean_depth,
                depth_variance,
                vcf_header,
                vcf_records,
                allele_coverage,
                allele_groups,
            ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                perl_generated_vcf, gramtools_quasimap_dir)
            mean_depths.append(mean_depth)
            depth_variances.append(depth_variance)
            logging.info('Finished loading gramtools files')

            # Fall back to the name in the VCF header when none was given.
            if self.sample_name is None:
                sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                    vcf_header)
            else:
                sample_name = self.sample_name
            assert sample_name is not None

            logging.info('Writing VCf output file ' + split_vcf_out + ' for split VCF file ' + split_file.filename)
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                mean_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                self.gramtools_kmer_size,
                sample_name=sample_name,
                max_read_length=self.max_read_length,
                filtered_outfile=split_vcf_out,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

            if self.clean:
                logging.info(
                    'Cleaning gramtools files from split VCF file '
                    + split_file.filename)
                if not self.user_supplied_gramtools_build_dir:
                    # Move the build report out before deleting its directory.
                    os.rename(
                        os.path.join(split_file.gramtools_build_dir, 'build_report.json'),
                        split_file.gramtools_build_dir + '.report.json')
                    shutil.rmtree(split_file.gramtools_build_dir)
                    os.unlink(split_file.filename)

                # Same for the quasimap report and its directory.
                os.rename(
                    os.path.join(gramtools_quasimap_dir, 'report.json'),
                    gramtools_quasimap_dir + '.report.json')
                shutil.rmtree(gramtools_quasimap_dir)
                os.unlink(split_reads_file)

            logging.info(
                '===== Finish analysing variants in VCF split file '
                + split_file.filename + ' =====')

    logging.info('Merging VCF files into one output file ' + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)

    # Overall values are the plain means of the per-split values.
    mean_depth = statistics.mean(mean_depths)
    depth_variance = statistics.mean(depth_variances)
    logging.info(
        f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, '
        f'using mean depth {mean_depth}, depth variance {depth_variance}, '
        f'error rate {self.read_error_rate}, '
        f'and {self.genotype_simulation_iterations} simulation iterations'
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations)

    if self.clean:
        logging.info('Deleting temp split VCF files')
        for outfiles_by_ref in (split_vcf_outfiles, split_vcf_outfiles_unfiltered):
            for file_list in outfiles_by_ref.values():
                for filename in file_list:
                    os.unlink(filename)
        os.unlink(unmapped_reads_file)
def _run_gramtools_with_split_vcf(self):
    """Run adjudication separately on each VCF split and merge the results.

    Steps performed:
      1. Split ``self.clustered_vcf`` with ``vcf_chunker.VcfChunker`` and
         adopt the chunker's ``gramtools_kmer_size``.
      2. Create ``self.split_output_dir``; when ``self.use_unmapped_reads``
         is set, extract the unmapped reads of ``self.reads_files[0]``.
      3. For every split: extract the reads of the split's region and call
         ``self.run_adjudicate`` to write a filtered and an unfiltered VCF.
         When ``self.clean`` is set, the per-split reads file is removed,
         and the split VCF too unless the gramtools build directory was
         user supplied.
      4. Merge the per-split VCFs into ``self.final_vcf`` and
         ``self.unfiltered_vcf_file`` and call ``self.run_gt_conf``.
      5. When ``self.clean`` is set, delete the per-split VCFs and, if it
         was created, the unmapped reads file.

    Raises:
        Exception: If ``self.split_output_dir`` cannot be created.
    """
    logging.info("Splitting VCF files into chunks (if not already done)")
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    # Use the chunker's kmer size for every later gramtools call.
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info("VCF file split into " + str(chunker.total_split_files) + " chunks")

    try:
        os.mkdir(self.split_output_dir)
    except OSError as mkdir_error:
        # Only filesystem failures are translated; a bare except would also
        # turn KeyboardInterrupt/SystemExit into a misleading error.
        raise Exception(
            "Error making output split directory " + self.split_output_dir
        ) from mkdir_error

    # Stays None unless unmapped reads are requested; it is only used in
    # branches guarded by the same flag.
    unmapped_reads_file = None
    if self.use_unmapped_reads:
        unmapped_reads_file = os.path.join(self.split_output_dir, "unmapped_reads.bam")
        bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)

    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}

    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []

        for split_file in split_file_list:
            logging.info(
                "===== Start analysing variants in VCF split file "
                + split_file.filename + " =====")

            # Every per-split output path shares the "split.<number>" prefix.
            split_prefix = os.path.join(
                self.split_output_dir, "split." + str(split_file.file_number))
            split_reads_file = split_prefix + ".reads.bam"
            gramtools_quasimap_dir = split_prefix + ".gramtools.quasimap"
            split_vcf_out = split_prefix + ".out.vcf"
            unfiltered_vcf_out = (
                split_prefix + ".out.debug.calls_with_zero_cov_alleles.vcf")

            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            if self.use_unmapped_reads:
                reads_files = [unmapped_reads_file, split_reads_file]
            else:
                reads_files = [split_reads_file]

            self.run_adjudicate(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                reads_files,
                split_vcf_out,
                unfiltered_vcf_out,
            )

            if self.clean:
                os.unlink(split_reads_file)
                if not self.user_supplied_gramtools_build_dir:
                    os.unlink(split_file.filename)

            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)
            logging.info(
                "===== Finish analysing variants in VCF split file "
                + split_file.filename + " =====")

    logging.info("Merging VCF files into one output file " + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)
    self.run_gt_conf()

    if self.clean:
        logging.info("Deleting temp split VCF files")
        for outfiles_by_ref in (split_vcf_outfiles, split_vcf_outfiles_unfiltered):
            for file_list in outfiles_by_ref.values():
                for filename in file_list:
                    os.unlink(filename)
        if self.use_unmapped_reads:
            os.unlink(unmapped_reads_file)