Example #1
    def _nextflow_helper_process_input_vcf_file(cls, infile, out_small_vars,
                                                out_big_vars, out_sample_name,
                                                min_large_ref_length):
        splitter = vcf_file_split_deletions.VcfFileSplitDeletions(
            infile,
            out_small_vars,
            out_big_vars,
            min_large_ref_length=min_large_ref_length,
        )
        splitter.run()
        header_lines = vcf_file_read.get_header_lines_from_vcf_file(infile)
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            header_lines)
        assert sample_name is not None
        max_read_length = None
        for line in header_lines:
            if line.startswith("##minos_max_read_length="):
                max_read_length = int(line.rstrip().split("=")[1])

        with open(out_sample_name, "w") as f:
            print(sample_name, file=f)

        return max_read_length
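The loop near the end of Example #1 pulls an integer out of a `##minos_max_read_length=` header line. The same pattern works for any `##key=value` header line; the helper below is an illustrative sketch (the name is mine, not minos's), using only the behaviour shown above.

def get_int_header_value(header_lines, key="##minos_max_read_length="):
    """Return the integer value of the first header line starting with key,
    or None if no such line exists."""
    for line in header_lines:
        if line.startswith(key):
            # e.g. "##minos_max_read_length=148" -> 148
            return int(line.rstrip().split("=")[1])
    return None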
Example #2
    def run(self):
        # Cluster together variants in each vcf
        if self.filter_and_cluster_vcf:
            EvaluateRecall._filter_vcf_for_clustering(
                self.truth_vcf_file, self.filtered_truth_vcf, self.discard_ref_calls)
            EvaluateRecall._filter_vcf_for_clustering(
                self.query_vcf_file, self.filtered_query_vcf, self.discard_ref_calls)
            if self.discard_ref_calls:
                clusterer_query = vcf_clusterer.VcfClusterer(
                    [self.filtered_query_vcf], self.query_vcf_ref, self.clustered_vcf_query,
                    merge_method='simple', max_distance_between_variants=self.merge_length)
                clusterer_truth = vcf_clusterer.VcfClusterer(
                    [self.filtered_truth_vcf], self.truth_vcf_ref, self.clustered_vcf_truth,
                    merge_method='simple', max_distance_between_variants=self.merge_length)
            else:
                clusterer_query = vcf_clusterer.VcfClusterer(
                    [self.filtered_query_vcf], self.query_vcf_ref, self.clustered_vcf_query,
                    merge_method='gt_aware', max_distance_between_variants=self.merge_length)
                clusterer_truth = vcf_clusterer.VcfClusterer(
                    [self.filtered_truth_vcf], self.truth_vcf_ref, self.clustered_vcf_truth,
                    merge_method='gt_aware', max_distance_between_variants=self.merge_length)
            clusterer_query.run()
            clusterer_truth.run()

        vcf_header, vcf_records_truth = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check_truth, sort=True, remove_useless_start_nucleotides=True)
        vcf_header, vcf_records_query = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check_query, sort=True, remove_useless_start_nucleotides=True)
        sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
        if sample_from_header is None:
            sample_from_header = 'sample'
        truth_vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.truth_vcf_ref, truth_vcf_ref_seqs)
        query_vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.query_vcf_ref, query_vcf_ref_seqs)

        EvaluateRecall._write_vars_plus_flanks_to_fasta(
            self.seqs_out_truth, vcf_records_truth, truth_vcf_ref_seqs,
            self.flank_length, ref_only=True)
        EvaluateRecall._write_vars_plus_flanks_to_fasta(
            self.seqs_out_query, vcf_records_query, query_vcf_ref_seqs,
            self.flank_length, number_ns=self.number_ns)
        EvaluateRecall._map_seqs_to_seqs(self.seqs_out_query, self.seqs_out_truth, self.sam_file_out)
        # Clean-up of the intermediate fasta files is currently disabled:
        # for f in glob.glob(self.seqs_out_truth + '*'):
        #     os.unlink(f)
        # for f in glob.glob(self.seqs_out_query + '*'):
        #     os.unlink(f)

        EvaluateRecall._index_vcf(self.vcf_to_check_query)
        self.vcf_to_check_query = self.vcf_to_check_query + ".gz"
        EvaluateRecall._parse_sam_files(self.vcf_to_check_truth, self.sam_file_out, self.vcf_to_check_query,
                                        self.sam_summary, self.flank_length,
                                        allow_mismatches=self.allow_flank_mismatches,
                                        exclude_regions=self.exclude_regions,
                                        max_soft_clipped=self.max_soft_clipped,
                                        number_ns=self.number_ns)
        stats, gt_conf_hist = EvaluateRecall._gather_stats(self.sam_summary)
        # Clean-up of intermediate files is currently disabled:
        # os.unlink(self.seqs_out_truth)
        # os.unlink(self.seqs_out_query)
        # for f in glob.glob(self.vcf_to_check_truth + '*'):
        #     os.unlink(f)
        # for f in glob.glob(self.vcf_to_check_query + '*'):
        #     os.unlink(f)

        # write stats file
        with open(self.stats_out, 'w') as f:
            keys = stats.keys()
            print(*keys, sep='\t', file=f)
            print(*[stats[x] for x in keys], sep='\t', file=f)


        # write GT_CONF histogram files
        with open(self.gt_conf_hist_out, 'w') as f:
            print('GT_CONF\tCount', file=f)
            for gt_conf, count in sorted(gt_conf_hist.items()):
                print(gt_conf, count, sep='\t', file=f)
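The two output files written at the end of `run` have a simple shape: a two-row tab-separated stats table, and a `GT_CONF`/`Count` histogram. Below is a self-contained sketch of the same writing pattern, with plain dicts standing in for the real `stats` and `gt_conf_hist` (the helper name is mine, for illustration only).

def write_stats_and_hist(stats, stats_path, gt_conf_hist, hist_path):
    # Stats file: one tab-separated header row, one value row.
    with open(stats_path, "w") as f:
        keys = list(stats.keys())
        print(*keys, sep="\t", file=f)
        print(*[stats[k] for k in keys], sep="\t", file=f)
    # Histogram file: one "GT_CONF<TAB>Count" pair per line, sorted by GT_CONF.
    with open(hist_path, "w") as f:
        print("GT_CONF\tCount", file=f)
        for gt_conf, count in sorted(gt_conf_hist.items()):
            print(gt_conf, count, sep="\t", file=f)

# Usage sketch with toy values:
# write_stats_and_hist({"total": 10, "gt_correct": 9}, "stats.tsv",
#                      {12.5: 4, 50.0: 6}, "gt_conf_hist.tsv")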
Example #3
    def test_get_sample_name_from_vcf_header_lines(self):
        """test get_sample_name_from_vcf_header_lines"""
        lines = ["foo", "bar"]
        self.assertEqual(
            None, vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines.append("#CHROM\twrong!")
        with self.assertRaises(Exception):
            vcf_file_read.get_sample_name_from_vcf_header_lines(lines)

        lines[-1] = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
        self.assertEqual(
            None, vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines[-1] = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
        self.assertEqual(
            None, vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines[-1] = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name"
        self.assertEqual(
            "sample_name",
            vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines[-1] = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name\tsample_name_2"
        self.assertEqual(
            "sample_name",
            vcf_file_read.get_sample_name_from_vcf_header_lines(lines))
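The assertions above pin down the behaviour of `get_sample_name_from_vcf_header_lines` completely: no `#CHROM` line or no sample column gives `None`, a malformed `#CHROM` line raises, and the first sample column wins when there are several. Here is a minimal reimplementation consistent with those assertions; it is an illustrative sketch, not the actual minos source, which raises its own `vcf_file_read.Error` as Example #6 shows.

VCF_FIXED_COLUMNS = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]

def get_sample_name_from_vcf_header_lines(header_lines):
    """Return the first sample column of the #CHROM header line, or None if
    there is no #CHROM line or no sample column. Raises on a malformed line."""
    for line in header_lines:
        if not line.startswith("#CHROM"):
            continue
        fields = line.rstrip().split("\t")
        if fields[:8] != VCF_FIXED_COLUMNS:
            raise Exception("Malformed #CHROM line: " + line)
        # Columns 9+ are FORMAT and then one column per sample.
        return fields[9] if len(fields) > 9 else None
    return None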
Example #4
    def _run_gramtools_not_split_vcf(self):
        self.gramtools_kmer_size = Adjudicator._get_gramtools_kmer_size(
            self.gramtools_build_dir, self.gramtools_kmer_size)
        build_report, quasimap_report = gramtools.run_gramtools(
            self.gramtools_build_dir,
            self.gramtools_quasimap_dir,
            self.clustered_vcf,
            self.ref_fasta,
            self.reads_files,
            self.max_read_length,
            kmer_size=self.gramtools_kmer_size,
        )

        logging.info('Loading gramtools quasimap output files ' +
                     self.gramtools_quasimap_dir)
        mean_depth, depth_variance, vcf_header, vcf_records, allele_coverage, allele_groups = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            self.perl_generated_vcf, self.gramtools_quasimap_dir)
        logging.info('Finished loading gramtools files')
        if self.sample_name is None:
            sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                vcf_header)
        else:
            sample_name = self.sample_name
        assert sample_name is not None
        logging.info('Writing VCF output file ' + self.final_vcf)
        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            self.unfiltered_vcf_file,
            self.gramtools_kmer_size,
            sample_name=sample_name,
            max_read_length=self.max_read_length,
            filtered_outfile=self.final_vcf)

        logging.info(
            f'Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            os.rename(
                os.path.join(self.gramtools_quasimap_dir, 'report.json'),
                os.path.join(self.outdir, 'gramtools.quasimap.report.json'))
            shutil.rmtree(self.gramtools_quasimap_dir)

            if not self.user_supplied_gramtools_build_dir:
                os.rename(
                    os.path.join(self.gramtools_build_dir,
                                 'build_report.json'),
                    os.path.join(self.outdir, 'gramtools.build.report.json'))
                shutil.rmtree(self.gramtools_build_dir)
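The clean-up step follows one pattern twice: rescue the JSON report out of a work directory, then delete that directory. A small illustrative helper (the name and its use here are mine, not minos's):

import os
import shutil

def archive_report_and_remove(workdir, report_name, dest_path):
    """Move one report file out of workdir, then delete workdir entirely."""
    os.rename(os.path.join(workdir, report_name), dest_path)
    shutil.rmtree(workdir)

# Mirrors the quasimap branch above:
# archive_report_and_remove(quasimap_dir, 'report.json',
#                           os.path.join(outdir, 'gramtools.quasimap.report.json'))

Note that `os.rename` fails when source and destination are on different filesystems; `shutil.move` is the more general choice if the output directory may live on another mount.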
Example #5
    def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files,
                       final_vcf, debug_vcf):
        build_report, quasimap_report = gramtools.run_gramtools(
            build_dir,
            quasimap_dir,
            vcf,
            self.ref_fasta,
            reads_files,
            self.max_read_length,
            kmer_size=self.gramtools_kmer_size,
        )

        build_vcf = os.path.join(build_dir, "build.vcf")

        logging.info("Loading gramtools quasimap output files " + quasimap_dir)
        mean_depth, variance_depth, vcf_header, vcf_records, allele_coverage, allele_groups = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            build_vcf, quasimap_dir)
        Adjudicator.mean_depths.append(mean_depth)
        Adjudicator.variance_depths.append(variance_depth)

        logging.info("Finished loading gramtools files")

        if self.clean:
            os.rename(
                os.path.join(quasimap_dir, "quasimap_outputs",
                             "quasimap_report.json"),
                quasimap_dir + ".report.json",
            )
            shutil.rmtree(quasimap_dir)

            if not self.user_supplied_gramtools_build_dir:
                os.rename(
                    os.path.join(build_dir, "build_report.json"),
                    os.path.join(build_dir, "build.report.json"),
                )
                shutil.rmtree(build_dir)

        if self.sample_name is None:
            sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                vcf_header)
        else:
            sample_name = self.sample_name
        assert sample_name is not None

        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            debug_vcf,
            sample_name=sample_name,
            max_read_length=self.max_read_length,
            filtered_outfile=final_vcf,
        )
Example #6
    def test_get_sample_name_from_vcf_header_lines(self):
        '''test get_sample_name_from_vcf_header_lines'''
        lines = ['foo', 'bar']
        self.assertEqual(None, vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines.append('#CHROM\twrong!')
        with self.assertRaises(vcf_file_read.Error):
            vcf_file_read.get_sample_name_from_vcf_header_lines(lines)

        lines[-1] = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
        self.assertEqual(None, vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines[-1] = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT'
        self.assertEqual(None, vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines[-1] = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name'
        self.assertEqual('sample_name', vcf_file_read.get_sample_name_from_vcf_header_lines(lines))

        lines[-1] = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name\tsample_name_2'
        self.assertEqual('sample_name', vcf_file_read.get_sample_name_from_vcf_header_lines(lines))
Example #7
    def _load_vcf_files(cls, filename_list, homozygous_only=False,
                        max_REF_len=None, min_SNP_qual=None, min_dp4=None,
                        min_GT_conf=None):
        '''Loads all the vcf files from filename_list. Returns tuple of:
        1. Sample name. If more than one sample name found, uses the first one
        and warns to stderr
        2. Dictionary. filename => list of header lines for that file
        3. Dictionary. ref name => list of VcfRecords sorted by position'''
        headers = {}
        vcf_records = None
        sample_name = None

        for filename in filename_list:
            headers[filename], new_records = vcf_file_read.vcf_file_to_dict(
                filename, homozygous_only=homozygous_only,
                remove_asterisk_alts=True, max_REF_len=max_REF_len,
                remove_useless_start_nucleotides=True,
                min_SNP_qual=min_SNP_qual, min_dp4=min_dp4,
                min_GT_conf=min_GT_conf)

            new_sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                headers[filename])
            if sample_name is None and new_sample_name is not None:
                sample_name = new_sample_name
            elif new_sample_name != sample_name:
                logging.warning('Using first sample name found "' +
                                str(sample_name) +
                                '". Found a different (or no) sample name "' +
                                str(new_sample_name) +
                                '", which will not be used')

            if vcf_records is None:
                vcf_records = new_records
            else:
                for ref_name, record_list in new_records.items():
                    if ref_name not in vcf_records:
                        vcf_records[ref_name] = record_list
                    else:
                        vcf_records[ref_name].extend(record_list)

        for record_list in vcf_records.values():
            record_list.sort(key=operator.attrgetter('POS'))

        if sample_name is None:
            logging.warning('No sample name found in VCF files. Going to use "sample"')
            sample_name = 'sample'

        return sample_name, headers, vcf_records
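The merge loop in `_load_vcf_files` accumulates `{ref_name: [VcfRecord, ...]}` dicts from each file and then sorts every per-reference list by `POS`. Isolated as a sketch (the helper name is mine), `dict.setdefault` expresses the same if/else in one line:

import operator

def merge_record_dicts(dicts):
    """Merge {ref_name: [record, ...]} dicts, keeping each list sorted by POS."""
    merged = {}
    for d in dicts:
        for ref_name, record_list in d.items():
            merged.setdefault(ref_name, []).extend(record_list)
    for record_list in merged.values():
        record_list.sort(key=operator.attrgetter("POS"))
    return merged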
Example #8
    def run(self):
        if self.filter_and_cluster_vcf:
            MappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in,
                self.filtered_vcf,
                discard_ref_calls=self.discard_ref_calls)
            if self.discard_ref_calls:
                clusterer = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf],
                    self.vcf_reference_file,
                    self.clustered_vcf,
                    merge_method='simple',
                    max_distance_between_variants=self.merge_length)
            else:
                clusterer = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf],
                    self.vcf_reference_file,
                    self.clustered_vcf,
                    merge_method='gt_aware',
                    max_distance_between_variants=self.merge_length)
            clusterer.run()

        vcf_header, vcf_records = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check,
            sort=True,
            remove_useless_start_nucleotides=True)
        sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header)
        if sample_from_header is None:
            sample_from_header = 'sample'
        vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)
        verify_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.verify_reference_file,
                                    verify_ref_seqs)

        MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out, vcf_records, vcf_ref_seqs, self.flank_length)
        MappingBasedVerifier._map_seqs_to_ref(self.seqs_out,
                                              self.verify_reference_file,
                                              self.sam_file_out)
        os.unlink(self.seqs_out)
        stats, gt_conf_hists = MappingBasedVerifier._parse_sam_file_and_update_vcf_records_and_gather_stats(
            self.sam_file_out,
            vcf_records,
            self.flank_length,
            verify_ref_seqs,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions=self.exclude_regions,
            max_soft_clipped=self.max_soft_clipped)

        with open(self.vcf_file_out, 'w') as f:
            print(*vcf_header, sep='\n', file=f)
            for r in vcf_records:
                for v in vcf_records[r]:
                    print(v, file=f)

        # false negative stats, if possible
        stats['variant_regions_total'] = 'NA'
        stats['called_variant_regions'] = 'NA'

        if self.run_dnadiff:
            dnadiffer = dnadiff.Dnadiff(
                self.verify_reference_file,
                self.vcf_reference_file,
                self.dnadiff_outprefix,
            )
            dnadiffer.run()
            stats['variant_regions_total'], stats['called_variant_regions'] = \
                MappingBasedVerifier._get_total_length_of_expected_regions_called(
                    dnadiffer.all_variant_intervals, vcf_records)
            expected_variants = dnadiffer.variants
        elif self.expected_variants_vcf is not None:
            header, expected_variants = vcf_file_read.vcf_file_to_dict(
                self.expected_variants_vcf,
                sort=True,
                remove_useless_start_nucleotides=True)
        else:
            expected_variants = None

        if expected_variants is None:
            stats['false_negatives'] = 'NA'
        else:
            missed_vcf_records = MappingBasedVerifier._get_missing_vcf_records(
                vcf_records, expected_variants)
            stats['false_negatives'] = 0
            with open(self.vcf_false_negatives_file_out, 'w') as f:
                print('##fileformat=VCFv4.2', file=f)
                print('#CHROM',
                      'POS',
                      'ID',
                      'REF',
                      'ALT',
                      'QUAL',
                      'FILTER',
                      'INFO',
                      'FORMAT',
                      sample_from_header,
                      sep='\t',
                      file=f)
                for vcf_list in missed_vcf_records.values():
                    stats['false_negatives'] += len(vcf_list)
                    if len(vcf_list) > 0:
                        print(*vcf_list, sep='\n', file=f)

        # write stats file
        with open(self.stats_out, 'w') as f:
            keys = [
                'total', 'gt_correct', 'gt_wrong', 'gt_excluded', 'HET',
                'tp_edit_dist', 'fp_edit_dist', 'UNKNOWN_NO_GT',
                'variant_regions_total', 'called_variant_regions',
                'false_negatives'
            ]
            print(*keys, sep='\t', file=f)
            print(*[stats[x] for x in keys], sep='\t', file=f)

        # write GT_CONF histogram files
        for key, filename in self.gt_conf_hists_filenames.items():
            with open(filename, 'w') as f:
                print('GT_CONF\tCount', file=f)
                for gt_conf, count in sorted(gt_conf_hists[key].items()):
                    print(gt_conf, count, sep='\t', file=f)

        plots.plots_from_minos_vcf(self.vcf_file_out, self.vcf_file_plots_out)
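The false-negative block above both writes a minimal VCF of the missed calls and counts them in a single pass. As a self-contained sketch (the helper name is mine; `missed_records` is assumed to map ref name to a list of records, as `_get_missing_vcf_records` returns above):

def write_false_negatives_vcf(missed_records, sample_name, outfile):
    """Write missed records as a minimal VCFv4.2 file; return how many."""
    count = 0
    with open(outfile, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        print("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
              "INFO", "FORMAT", sample_name, sep="\t", file=f)
        for vcf_list in missed_records.values():
            count += len(vcf_list)
            if len(vcf_list) > 0:
                print(*vcf_list, sep="\n", file=f)
    return count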
Example #9
    def run(self):
        # Write files of sequences to search for in each vcf
        DnadiffMappingBasedVerifier._write_dnadiff_plus_flanks_to_fastas(
            self.dnadiff_snps_file, self.dnadiff_file1, self.dnadiff_file2,
            self.seqs_out_dnadiff1, self.seqs_out_dnadiff2, self.flank_length)

        # Cluster together variants in each vcf
        if self.filter_and_cluster_vcf:
            DnadiffMappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in1, self.filtered_vcf1, self.discard_ref_calls)
            DnadiffMappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in2, self.filtered_vcf2, self.discard_ref_calls)
            if self.discard_ref_calls:
                clusterer1 = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf1],
                    self.vcf_reference_file,
                    self.clustered_vcf1,
                    merge_method='simple',
                    max_distance_between_variants=self.merge_length)
                clusterer2 = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf2],
                    self.vcf_reference_file,
                    self.clustered_vcf2,
                    merge_method='simple',
                    max_distance_between_variants=self.merge_length)
            else:
                clusterer1 = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf1],
                    self.vcf_reference_file,
                    self.clustered_vcf1,
                    merge_method='gt_aware',
                    max_distance_between_variants=self.merge_length)
                clusterer2 = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf2],
                    self.vcf_reference_file,
                    self.clustered_vcf2,
                    merge_method='gt_aware',
                    max_distance_between_variants=self.merge_length)
            clusterer1.run()
            clusterer2.run()

        vcf_header, vcf_records1 = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check1,
            sort=True,
            remove_useless_start_nucleotides=True)
        vcf_header, vcf_records2 = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check2,
            sort=True,
            remove_useless_start_nucleotides=True)
        sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header)
        if sample_from_header is None:
            sample_from_header = 'sample'
        vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)

        DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out_vcf1, vcf_records1, vcf_ref_seqs, self.flank_length,
            self.number_ns)
        DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out_vcf2, vcf_records2, vcf_ref_seqs, self.flank_length,
            self.number_ns)
        DnadiffMappingBasedVerifier._map_seqs_to_seqs(self.seqs_out_vcf1,
                                                      self.seqs_out_dnadiff1,
                                                      self.sam_file_out1)
        DnadiffMappingBasedVerifier._map_seqs_to_seqs(self.seqs_out_vcf2,
                                                      self.seqs_out_dnadiff2,
                                                      self.sam_file_out2)
        # Clean-up of the intermediate fasta files is currently disabled:
        # for f in glob.glob(self.seqs_out_vcf1 + '*'):
        #     os.unlink(f)
        # for f in glob.glob(self.seqs_out_vcf2 + '*'):
        #     os.unlink(f)

        DnadiffMappingBasedVerifier._index_vcf(self.vcf_to_check1)
        self.vcf_to_check1 = self.vcf_to_check1 + ".gz"
        DnadiffMappingBasedVerifier._index_vcf(self.vcf_to_check2)
        self.vcf_to_check2 = self.vcf_to_check2 + ".gz"
        DnadiffMappingBasedVerifier._parse_sam_files(
            self.dnadiff_snps_file,
            self.sam_file_out1,
            self.sam_file_out2,
            self.vcf_to_check1,
            self.vcf_to_check2,
            self.seqs_out_dnadiff1,
            self.seqs_out_dnadiff2,
            self.sam_summary,
            self.flank_length,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions1=self.exclude_regions1,
            exclude_regions2=self.exclude_regions2,
            max_soft_clipped=self.max_soft_clipped,
            number_ns=self.number_ns)
        stats, gt_conf_hist = DnadiffMappingBasedVerifier._gather_stats(
            self.sam_summary)
        # Clean-up of intermediate files is currently disabled:
        # os.unlink(self.seqs_out_dnadiff1)
        # os.unlink(self.seqs_out_dnadiff2)
        # for f in glob.glob(self.vcf_to_check1 + '*'):
        #     os.unlink(f)
        # for f in glob.glob(self.vcf_to_check2 + '*'):
        #     os.unlink(f)

        # write stats file
        with open(self.stats_out, 'w') as f:
            keys = stats.keys()
            print(*keys, sep='\t', file=f)
            print(*[stats[x] for x in keys], sep='\t', file=f)

        # write GT_CONF histogram files
        with open(self.gt_conf_hist_out, 'w') as f:
            print('GT_CONF\tCount', file=f)
            for gt_conf, count in sorted(gt_conf_hist.items()):
                print(gt_conf, count, sep='\t', file=f)
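The body of `_index_vcf` is not shown here, but the `.gz` suffix appended straight after each call implies it bgzip-compresses the VCF in place and indexes it. One plausible implementation, assuming pysam is available (an assumption, not necessarily what minos does):

import pysam

def index_vcf(vcf_path):
    # Compresses vcf_path with bgzip (producing vcf_path + ".gz") and writes
    # a tabix .tbi index next to it.
    pysam.tabix_index(vcf_path, preset="vcf", force=True)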
Example #10
    def _run_gramtools_with_split_vcf(self):
        logging.info('Splitting VCF files into chunks (if not already done)')
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info('VCF file split into ' + str(chunker.total_split_files) +
                     ' chunks')
        try:
            os.mkdir(self.split_output_dir)
        except OSError as error:
            raise Error('Error making output split directory ' +
                        self.split_output_dir) from error

        unmapped_reads_file = os.path.join(self.split_output_dir,
                                           'unmapped_reads.bam')
        bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                            unmapped_reads_file)
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        mean_depths = []
        depth_variances = []

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    '===== Start analysing variants in VCF split file ' +
                    split_file.filename + ' =====')
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.reads.bam')
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir, 'split.' +
                    str(split_file.file_number) + '.gramtools.quasimap')

                build_report, quasimap_report = gramtools.run_gramtools(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    self.ref_fasta,
                    [unmapped_reads_file, split_reads_file],
                    self.max_read_length,
                    kmer_size=self.gramtools_kmer_size,
                )

                logging.info('Loading split gramtools quasimap output files ' +
                             gramtools_quasimap_dir)
                perl_generated_vcf = os.path.join(
                    split_file.gramtools_build_dir, 'perl_generated_vcf')
                mean_depth, depth_variance, vcf_header, vcf_records, allele_coverage, allele_groups = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                    perl_generated_vcf, gramtools_quasimap_dir)
                mean_depths.append(mean_depth)
                depth_variances.append(depth_variance)
                logging.info('Finished loading gramtools files')
                if self.sample_name is None:
                    sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                        vcf_header)
                else:
                    sample_name = self.sample_name
                assert sample_name is not None
                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.out.vcf')
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) +
                    '.out.debug.calls_with_zero_cov_alleles.vcf')
                logging.info('Writing VCF output file ' + split_vcf_out +
                             ' for split VCF file ' + split_file.filename)
                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    mean_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    self.gramtools_kmer_size,
                    sample_name=sample_name,
                    max_read_length=self.max_read_length,
                    filtered_outfile=split_vcf_out,
                )
                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                if self.clean:
                    logging.info(
                        'Cleaning gramtools files from split VCF file ' +
                        split_file.filename)
                    if not self.user_supplied_gramtools_build_dir:
                        os.rename(
                            os.path.join(split_file.gramtools_build_dir,
                                         'build_report.json'),
                            split_file.gramtools_build_dir + '.report.json')
                        shutil.rmtree(split_file.gramtools_build_dir)
                        os.unlink(split_file.filename)

                    os.rename(
                        os.path.join(gramtools_quasimap_dir, 'report.json'),
                        gramtools_quasimap_dir + '.report.json')
                    shutil.rmtree(gramtools_quasimap_dir)
                    os.unlink(split_reads_file)

                logging.info(
                    '===== Finish analysing variants in VCF split file ' +
                    split_file.filename + ' =====')

        logging.info('Merging VCF files into one output file ' +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        mean_depth = statistics.mean(mean_depths)
        depth_variance = statistics.mean(depth_variances)
        logging.info(
            f'Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            logging.info('Deleting temp split VCF files')
            for d in (split_vcf_outfiles, split_vcf_outfiles_unfiltered):
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            os.unlink(unmapped_reads_file)
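Note that `statistics.mean` over the per-chunk means and variances weights every chunk equally. If per-chunk read counts were tracked, a weighted pooling via the law of total variance would look like the sketch below; this is an alternative, not what the code above does:

def pool_depth_stats(counts, means, variances):
    """Pool per-chunk (count, mean, population variance) into overall stats."""
    total = sum(counts)
    pooled_mean = sum(n * m for n, m in zip(counts, means)) / total
    # E[X^2] per chunk is v + m^2; Var(X) = E[X^2] - E[X]^2 overall.
    pooled_var = (sum(n * (v + m * m)
                      for n, m, v in zip(counts, means, variances)) / total
                  - pooled_mean ** 2)
    return pooled_mean, pooled_var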
Example #11
    def _load_vcf_files(
        cls,
        filename_list,
        reference_seqs,
        homozygous_only=False,
        max_REF_len=None,
        min_SNP_qual=None,
        min_dp4=None,
        min_GT_conf=None,
    ):
        """Loads all the vcf files from filename_list. Returns tuple of:
        1. Sample name. If more than one sample name found, uses the first one
        and warns to stderr
        2. Dictionary. filename => list of header lines for that file
        3. Dictionary. ref name => list of VcfRecords sorted by position.

        reference_seqs should be a dictionary of sequence name -> sequence.
        This causes all records from the VCF to be sanity-checked against the reference sequence,
        and any record whose REF seq does not match the expected sequence is removed."""
        headers = {}
        vcf_records = None
        sample_name = None

        for filename in filename_list:
            headers[filename], new_records = vcf_file_read.vcf_file_to_dict(
                filename,
                homozygous_only=homozygous_only,
                remove_asterisk_alts=True,
                max_REF_len=max_REF_len,
                remove_useless_start_nucleotides=True,
                min_SNP_qual=min_SNP_qual,
                min_dp4=min_dp4,
                min_GT_conf=min_GT_conf,
                reference_seqs=reference_seqs,
                error_on_bad_POS=False,
            )

            new_sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                headers[filename]
            )
            if sample_name is None and new_sample_name is not None:
                sample_name = new_sample_name
            elif new_sample_name != sample_name:
                logging.warning(
                    'Using first sample name found "'
                    + str(sample_name)
                    + '". Found a different (or no) sample name "'
                    + str(new_sample_name)
                    + '", which will not be used'
                )

            if vcf_records is None:
                vcf_records = new_records
            else:
                for ref_name, record_list in new_records.items():
                    if ref_name not in vcf_records:
                        vcf_records[ref_name] = record_list
                    else:
                        vcf_records[ref_name].extend(record_list)

        for record_list in vcf_records.values():
            record_list.sort(key=operator.attrgetter("POS"))

        if sample_name is None:
            logging.warning('No sample name found in VCF files. Going to use "sample"')
            sample_name = "sample"

        return sample_name, headers, vcf_records
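The docstring describes a sanity check: records whose REF allele disagrees with the reference sequence are removed. The check itself reduces to a slice comparison; below is a sketch of it (assuming zero-based positions and plain-string sequences; the real filtering happens inside `vcf_file_to_dict` via `reference_seqs`):

def ref_allele_matches(ref_seq, pos, ref_allele):
    """True if ref_allele matches ref_seq at zero-based position pos."""
    return ref_seq[pos:pos + len(ref_allele)].upper() == ref_allele.upper()

# Usage sketch:
# kept = [rec for rec in records
#         if ref_allele_matches(ref_seqs[chrom], rec.POS, rec.REF)]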
Example #12
    def run(self):
        if self.filter_and_cluster_vcf:
            MappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in,
                self.filtered_vcf,
                discard_ref_calls=self.discard_ref_calls,
            )
            if self.discard_ref_calls:
                clusterer = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf],
                    self.vcf_reference_file,
                    self.clustered_vcf,
                    merge_method="simple",
                    cluster_boundary_size=self.merge_length,
                )
            else:
                clusterer = vcf_clusterer.VcfClusterer(
                    [self.filtered_vcf],
                    self.vcf_reference_file,
                    self.clustered_vcf,
                    merge_method="gt_aware",
                    cluster_boundary_size=self.merge_length,
                )
            clusterer.run()

        vcf_header, vcf_records = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check,
            sort=True,
            remove_useless_start_nucleotides=True)
        sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header)
        if sample_from_header is None:
            sample_from_header = "sample"
        vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)
        verify_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.verify_reference_file,
                                    verify_ref_seqs)

        MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out, vcf_records, vcf_ref_seqs, self.flank_length)
        MappingBasedVerifier._map_seqs_to_ref(self.seqs_out,
                                              self.verify_reference_file,
                                              self.sam_file_out)
        os.unlink(self.seqs_out)
        stats, gt_conf_hists = MappingBasedVerifier._parse_sam_file_and_update_vcf_records_and_gather_stats(
            self.sam_file_out,
            vcf_records,
            self.flank_length,
            verify_ref_seqs,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions=self.exclude_regions,
            max_soft_clipped=self.max_soft_clipped,
        )

        with open(self.vcf_file_out, "w") as f:
            print(*vcf_header, sep="\n", file=f)
            for r in vcf_records:
                for v in vcf_records[r]:
                    print(v, file=f)

        # false negative stats, if possible
        stats["variant_regions_total"] = "NA"
        stats["called_variant_regions"] = "NA"

        if self.run_dnadiff:
            dnadiffer = dnadiff.Dnadiff(
                self.verify_reference_file,
                self.vcf_reference_file,
                self.dnadiff_outprefix,
            )
            dnadiffer.run()
            stats["variant_regions_total"], stats[
                "called_variant_regions"] = MappingBasedVerifier._get_total_length_of_expected_regions_called(
                    dnadiffer.all_variant_intervals, vcf_records)
            expected_variants = dnadiffer.variants
        elif self.expected_variants_vcf is not None:
            header, expected_variants = vcf_file_read.vcf_file_to_dict(
                self.expected_variants_vcf,
                sort=True,
                remove_useless_start_nucleotides=True,
            )
        else:
            expected_variants = None

        if expected_variants is None:
            stats["false_negatives"] = "NA"
        else:
            missed_vcf_records = MappingBasedVerifier._get_missing_vcf_records(
                vcf_records, expected_variants)
            stats["false_negatives"] = 0
            with open(self.vcf_false_negatives_file_out, "w") as f:
                print("##fileformat=VCFv4.2", file=f)
                print(
                    "#CHROM",
                    "POS",
                    "ID",
                    "REF",
                    "ALT",
                    "QUAL",
                    "FILTER",
                    "INFO",
                    "FORMAT",
                    sample_from_header,
                    sep="\t",
                    file=f,
                )
                for vcf_list in missed_vcf_records.values():
                    stats["false_negatives"] += len(vcf_list)
                    if len(vcf_list) > 0:
                        print(*vcf_list, sep="\n", file=f)

        # write stats file
        with open(self.stats_out, "w") as f:
            keys = [
                "total",
                "gt_correct",
                "gt_wrong",
                "gt_excluded",
                "HET",
                "tp_edit_dist",
                "fp_edit_dist",
                "UNKNOWN_NO_GT",
                "variant_regions_total",
                "called_variant_regions",
                "false_negatives",
            ]
            print(*keys, sep="\t", file=f)
            print(*[stats[x] for x in keys], sep="\t", file=f)

        # write GT_CONF histogram files
        for key, filename in self.gt_conf_hists_filenames.items():
            with open(filename, "w") as f:
                print("GT_CONF\tCount", file=f)
                for gt_conf, count in sorted(gt_conf_hists[key].items()):
                    print(gt_conf, count, sep="\t", file=f)

        plots.plots_from_minos_vcf(self.vcf_file_out, self.vcf_file_plots_out)