Example #1
    def test_write_vcf_annotated_using_coverage_from_gramtools(self):
        """test write_vcf_annotated_using_coverage_from_gramtools"""
        vcf_file_in = os.path.join(
            data_dir,
            "write_vcf_annotated_using_coverage_from_gramtools.in.vcf")
        quasimap_dir = os.path.join(
            data_dir,
            "write_vcf_annotated_using_coverage_from_gramtools.quasimap")
        (
            mean_depth,
            depth_variance,
            vcf_header,
            vcf_records,
            allele_coverage,
            allele_groups,
        ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            vcf_file_in, quasimap_dir)
        tmp_outfile = (
            "tmp.gramtools.write_vcf_annotated_using_coverage_from_gramtools.vcf"
        )
        tmp_outfile_filtered = tmp_outfile + ".filter.vcf"
        error_rate = 0.001
        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            error_rate,
            tmp_outfile,
            sample_name="sample_42",
            max_read_length=200,
            filtered_outfile=tmp_outfile_filtered,
        )
        expected_vcf = os.path.join(
            data_dir,
            "write_vcf_annotated_using_coverage_from_gramtools.out.vcf")
        expected_vcf_filtered = os.path.join(
            data_dir,
            "write_vcf_annotated_using_coverage_from_gramtools.out.vcf.filter.vcf",
        )

        # Today's date and the version of minos get added to the header.
        # Account for those by patching the lines loaded from the expected
        # file before comparing.
        def check_vcfs(expected_vcf, got_vcf):
            expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
                expected_vcf)
            got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(
                got_vcf)
            for i in range(len(expected_header)):
                if expected_header[i].startswith("##fileDate="):
                    expected_header[i] = "##fileDate=" + str(
                        datetime.date.today())
                elif expected_header[i].startswith("##source=minos"):
                    expected_header[
                        i] = "##source=minos, version " + minos_version

            self.assertEqual(expected_header, got_header)
            self.assertEqual(expected_vcf_records, got_vcf_records)

        check_vcfs(expected_vcf, tmp_outfile)
        check_vcfs(expected_vcf_filtered, tmp_outfile_filtered)
        os.unlink(tmp_outfile)
        os.unlink(tmp_outfile_filtered)
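
Every example on this page unpacks the same six values from load_gramtools_vcf_and_allele_coverage_files. A minimal sketch of inspecting that return value on its own, assuming the helpers live in the minos package as in these examples (the paths below are placeholders, not real data):

from minos import gramtools  # assumption: module layout as used in these examples

# Placeholder paths; point these at a real gramtools build VCF and quasimap directory.
vcf_in = "build.vcf"
quasimap_dir = "gramtools.quasimap"

(
    mean_depth,
    depth_variance,
    vcf_header,
    vcf_records,
    allele_coverage,
    allele_groups,
) = gramtools.load_gramtools_vcf_and_allele_coverage_files(vcf_in, quasimap_dir)

print(f"mean depth: {mean_depth}, depth variance: {depth_variance}")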
Example #2
    def _run_gramtools_not_split_vcf(self):
        self.gramtools_kmer_size = Adjudicator._get_gramtools_kmer_size(
            self.gramtools_build_dir, self.gramtools_kmer_size)
        build_report, quasimap_report = gramtools.run_gramtools(
            self.gramtools_build_dir,
            self.gramtools_quasimap_dir,
            self.clustered_vcf,
            self.ref_fasta,
            self.reads_files,
            self.max_read_length,
            kmer_size=self.gramtools_kmer_size,
        )

        logging.info('Loading gramtools quasimap output files ' +
                     self.gramtools_quasimap_dir)
        (
            mean_depth,
            depth_variance,
            vcf_header,
            vcf_records,
            allele_coverage,
            allele_groups,
        ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            self.perl_generated_vcf, self.gramtools_quasimap_dir)
        logging.info('Finished loading gramtools files')
        if self.sample_name is None:
            sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                vcf_header)
        else:
            sample_name = self.sample_name
        assert sample_name is not None
        logging.info('Writing VCF output file ' + self.final_vcf)
        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            self.unfiltered_vcf_file,
            self.gramtools_kmer_size,
            sample_name=sample_name,
            max_read_length=self.max_read_length,
            filtered_outfile=self.final_vcf)

        logging.info(
            f'Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            os.rename(
                os.path.join(self.gramtools_quasimap_dir, 'report.json'),
                os.path.join(self.outdir, 'gramtools.quasimap.report.json'))
            shutil.rmtree(self.gramtools_quasimap_dir)

            if not self.user_supplied_gramtools_build_dir:
                os.rename(
                    os.path.join(self.gramtools_build_dir,
                                 'build_report.json'),
                    os.path.join(self.outdir, 'gramtools.build.report.json'))
                shutil.rmtree(self.gramtools_build_dir)
Example #3
    def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files,
                       final_vcf, debug_vcf):
        build_report, quasimap_report = gramtools.run_gramtools(
            build_dir,
            quasimap_dir,
            vcf,
            self.ref_fasta,
            reads_files,
            kmer_size=self.gramtools_kmer_size,
        )

        build_vcf = os.path.join(build_dir, "build.vcf")

        logging.info("Loading gramtools quasimap output files " + quasimap_dir)
        (
            self.mean_depth,
            self.variance_depth,
            vcf_header,
            vcf_records,
            allele_coverage,
            allele_groups,
        ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            build_vcf, quasimap_dir)

        logging.info("Finished loading gramtools files")

        if self.clean:
            os.rename(
                os.path.join(quasimap_dir, "quasimap_outputs",
                             "quasimap_report.json"),
                self.gramtools_quasimap_json,
            )
            shutil.rmtree(quasimap_dir)

            if not self.user_supplied_gramtools_build_dir:
                os.rename(
                    os.path.join(build_dir, "build_report.json"),
                    self.gramtools_build_json,
                )
                shutil.rmtree(build_dir)

        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            self.mean_depth,
            self.variance_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            debug_vcf,
            sample_name=self.sample_name,
            filtered_outfile=final_vcf,
            ref_seq_lengths=self.ref_seq_lengths,
            call_hets=self.call_hets,
        )
Example #4
    def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files,
                       final_vcf, debug_vcf):
        build_report, quasimap_report = gramtools.run_gramtools(
            build_dir,
            quasimap_dir,
            vcf,
            self.ref_fasta,
            reads_files,
            self.max_read_length,
            kmer_size=self.gramtools_kmer_size,
        )

        build_vcf = os.path.join(build_dir, "build.vcf")

        logging.info("Loading gramtools quasimap output files " + quasimap_dir)
        (
            mean_depth,
            variance_depth,
            vcf_header,
            vcf_records,
            allele_coverage,
            allele_groups,
        ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            build_vcf, quasimap_dir)
        Adjudicator.mean_depths.append(mean_depth)
        Adjudicator.variance_depths.append(variance_depth)

        logging.info("Finished loading gramtools files")

        if self.clean:
            os.rename(
                os.path.join(quasimap_dir, "quasimap_outputs",
                             "quasimap_report.json"),
                quasimap_dir + ".report.json",
            )
            shutil.rmtree(quasimap_dir)

            if not self.user_supplied_gramtools_build_dir:
                os.rename(
                    os.path.join(build_dir, "build_report.json"),
                    os.path.join(build_dir, "build.report.json"),
                )
                shutil.rmtree(build_dir)

        if self.sample_name is None:
            sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                vcf_header)
        else:
            sample_name = self.sample_name

        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            debug_vcf,
            sample_name=sample_name,
            max_read_length=self.max_read_length,
            filtered_outfile=final_vcf,
        )
Example #5
    def _run_gramtools_with_split_vcf(self):
        logging.info('Splitting VCF files into chunks (if not already done)')
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info('VCF file split into ' + str(chunker.total_split_files) +
                     ' chunks')
        try:
            os.mkdir(self.split_output_dir)
        except OSError as error:
            raise Error('Error making output split directory ' +
                        self.split_output_dir) from error

        unmapped_reads_file = os.path.join(self.split_output_dir,
                                           'unmapped_reads.bam')
        bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                            unmapped_reads_file)
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        mean_depths = []
        depth_variances = []

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    '===== Start analysing variants in VCF split file ' +
                    split_file.filename + ' =====')
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.reads.bam')
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir, 'split.' +
                    str(split_file.file_number) + '.gramtools.quasimap')

                build_report, quasimap_report = gramtools.run_gramtools(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    self.ref_fasta,
                    [unmapped_reads_file, split_reads_file],
                    self.max_read_length,
                    kmer_size=self.gramtools_kmer_size,
                )

                logging.info('Loading split gramtools quasimap output files ' +
                             gramtools_quasimap_dir)
                perl_generated_vcf = os.path.join(
                    split_file.gramtools_build_dir, 'perl_generated_vcf')
                (
                    mean_depth,
                    depth_variance,
                    vcf_header,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                    perl_generated_vcf, gramtools_quasimap_dir)
                mean_depths.append(mean_depth)
                depth_variances.append(depth_variance)
                logging.info('Finished loading gramtools files')
                if self.sample_name is None:
                    sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                        vcf_header)
                else:
                    sample_name = self.sample_name
                assert sample_name is not None
                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.out.vcf')
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) +
                    '.out.debug.calls_with_zero_cov_alleles.vcf')
                logging.info('Writing VCF output file ' + split_vcf_out +
                             ' for split VCF file ' + split_file.filename)
                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    mean_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    self.gramtools_kmer_size,
                    sample_name=sample_name,
                    max_read_length=self.max_read_length,
                    filtered_outfile=split_vcf_out,
                )
                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                if self.clean:
                    logging.info(
                        'Cleaning gramtools files from split VCF file ' +
                        split_file.filename)
                    if not self.user_supplied_gramtools_build_dir:
                        os.rename(
                            os.path.join(split_file.gramtools_build_dir,
                                         'build_report.json'),
                            split_file.gramtools_build_dir + '.report.json')
                        shutil.rmtree(split_file.gramtools_build_dir)
                        os.unlink(split_file.filename)

                    os.rename(
                        os.path.join(gramtools_quasimap_dir, 'report.json'),
                        gramtools_quasimap_dir + '.report.json')
                    shutil.rmtree(gramtools_quasimap_dir)
                    os.unlink(split_reads_file)

                logging.info(
                    '===== Finish analysing variants in VCF split file ' +
                    split_file.filename + ' =====')

        logging.info('Merging VCF files into one output file ' +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        mean_depth = statistics.mean(mean_depths)
        depth_variance = statistics.mean(depth_variances)
        logging.info(
            f'Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            logging.info('Deleting temp split VCF files')
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            os.unlink(unmapped_reads_file)
Example #6
    def _run_gramtools_with_split_vcf(self):
        logging.info("Splitting VCF files into chunks (if not already done)")
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            total_splits=self.total_splits,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info("VCF file split into " + str(chunker.total_split_files) +
                     " chunks")
        try:
            os.mkdir(self.split_output_dir)
        except OSError as error:
            raise Exception("Error making output split directory " +
                            self.split_output_dir) from error

        if self.use_unmapped_reads:
            unmapped_reads_file = os.path.join(self.split_output_dir,
                                               "unmapped_reads.bam")
            bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                                unmapped_reads_file)
        else:
            unmapped_reads_file = None

        read_coverage = []
        build_reports = {}
        quasimap_reports = {}

        # Run gramtools quasimap on each split. Collect the read depth from
        # every split, because the global read depth and variance are needed
        # later for genotyping.
        for ref_name, split_file_list in chunker.vcf_split_files.items():
            for split_file in split_file_list:
                read_cov, build_report, quasimap_report = self._run_quasimap_one_split(
                    split_file, unmapped_reads_file)
                read_coverage.extend(read_cov)
                build_reports[split_file.file_number] = build_report
                quasimap_reports[split_file.file_number] = quasimap_report

        with open(self.gramtools_quasimap_json, "w") as f:
            json.dump(quasimap_reports, f, indent=2, sort_keys=True)
        if not self.user_supplied_gramtools_build_dir:
            with open(self.gramtools_build_json, "w") as f:
                json.dump(build_reports, f, indent=2, sort_keys=True)

        self.mean_depth = round(statistics.mean(read_coverage), 3)
        self.variance_depth = round(statistics.variance(read_coverage), 3)

        # Can now genotype each split VCF, using the global mean depth and variance
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                build_vcf = os.path.join(split_file.gramtools_build_dir,
                                         "build.vcf")
                quasimap_dir = os.path.join(
                    self.split_output_dir,
                    f"split.{split_file.file_number}.gramtools.quasimap",
                )
                logging.info(f"Loading gramtools quasimap output files " +
                             quasimap_dir)
                (
                    _,  # mean depth for this split, which we don't want
                    _,  # depth variance for this split, which we don't want
                    vcf_header,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                    build_vcf, quasimap_dir)
                logging.info("Finished loading gramtools files")

                if self.clean:
                    shutil.rmtree(quasimap_dir)

                vcf_prefix = os.path.join(
                    self.split_output_dir,
                    f"split.{split_file.file_number}.out",
                )
                split_vcf_out = f"{vcf_prefix}.vcf"
                unfiltered_vcf_out = (
                    f"{vcf_prefix}.debug.calls_with_zero_cov_alleles.vcf")

                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    self.mean_depth,
                    self.variance_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    sample_name=self.sample_name,
                    filtered_outfile=split_vcf_out,
                    ref_seq_lengths=self.ref_seq_lengths,
                    call_hets=self.call_hets,
                )

                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

        # minos has now been run on each split VCF. Merge the splits into one
        # VCF, then add GT_CONF and GCP to the merged VCF.
        logging.info("Merging VCF files into one output file " +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)
        self.run_gt_conf()

        if self.clean:
            logging.info("Deleting temp split VCF files")
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            if self.use_unmapped_reads:
                os.unlink(unmapped_reads_file)
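
Putting the pieces from the examples above together, the overall flow is: run_gramtools to build the graph and map the reads, load_gramtools_vcf_and_allele_coverage_files to get per-allele coverage and depth statistics, then write_vcf_annotated_using_coverage_from_gramtools to genotype and write the output. A minimal end-to-end sketch using the call signature from Example #3; all paths, the kmer size, the error rate, and the sample name are illustrative placeholders, and the exact arguments differ between minos versions, as the examples show:

import os

from minos import gramtools  # assumption: module layout as used in these examples

build_dir = "gramtools.build"        # placeholder directories and files
quasimap_dir = "gramtools.quasimap"
clustered_vcf = "clustered.vcf"
ref_fasta = "ref.fa"
reads_files = ["reads.bam"]

# Build the graph from the clustered VCF and quasimap the reads.
build_report, quasimap_report = gramtools.run_gramtools(
    build_dir,
    quasimap_dir,
    clustered_vcf,
    ref_fasta,
    reads_files,
    kmer_size=10,  # illustrative value
)

# Load the VCF that gramtools built, plus per-allele coverage and depth stats.
build_vcf = os.path.join(build_dir, "build.vcf")
(
    mean_depth,
    variance_depth,
    vcf_header,
    vcf_records,
    allele_coverage,
    allele_groups,
) = gramtools.load_gramtools_vcf_and_allele_coverage_files(build_vcf, quasimap_dir)

# Genotype each record and write both a debug VCF and a filtered final VCF.
# Example #3 also passes ref_seq_lengths= and call_hets=; they are omitted here.
gramtools.write_vcf_annotated_using_coverage_from_gramtools(
    mean_depth,
    variance_depth,
    vcf_records,
    allele_coverage,
    allele_groups,
    0.002,  # read error rate (illustrative)
    "debug.calls_with_zero_cov_alleles.vcf",
    sample_name="sample_1",
    filtered_outfile="final.vcf",
)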