Example #1
0
 def test_run_simulations_and_get_percentile_allele_length_2(self):
     """Check run_simulations and get_percentile with allele_length=2."""
     sim = genotype_confidence_simulator.GenotypeConfidenceSimulator(
         50, 300, 0.1, allele_length=2, iterations=5
     )
     sim.run_simulations()
     self.assertEqual(
         {26: 20.0, 31: 40.0, 37: 60.0, 46: 80.0, 51: 100.0},
         sim.confidence_scores_percentiles,
     )

     # Each entry is (confidence score passed in, percentile expected back).
     # The order matches the groups below so the first failure is the same
     # one a sequence of individual assertions would report.
     lookups = [
         # Scores that are keys of the percentile dict
         (26, 20.00),
         (31, 40.00),
         # Scores that are not in the dict and will have to be inferred
         (28, 28.00),
         (47, 84.00),
         (48, 88.00),
         # Scores outside the range of what we already have
         (25, 0.00),
         (24, 0.00),
         (51, 100.00),
         (52, 100.00),
     ]
     for score, expected_percentile in lookups:
         self.assertEqual(expected_percentile, sim.get_percentile(score))
Example #2
0
def test_add_gt_conf_percentile_and_filters_to_vcf_file():
    """test _add_gt_conf_percentile_and_filters_to_vcf_file

    Copies the input VCF to a temporary file, lets the adjudicator rewrite
    that copy in place, and compares the result byte-for-byte with the
    expected VCF.
    """
    original_file = os.path.join(data_dir,
                                 "add_gt_conf_percentile_to_vcf_file.in.vcf")
    tmp_file = "tmp.adjudicator.add_gt_conf_percentile_to_vcf_file.vcf"
    expect_file = os.path.join(
        data_dir, "add_gt_conf_percentile_to_vcf_file.expect.vcf")
    shutil.copyfile(original_file, tmp_file)
    # Everything after the copy runs under try/finally so the temporary file
    # is removed even when the simulation, the rewrite or the comparison
    # fails; otherwise a failing run leaves a stale file behind.
    try:
        error_rate = 0.00026045894282438386
        simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
            60,
            100,
            error_rate,
            iterations=1000,
            call_hets=False,
        )
        simulations.run_simulations()
        adjudicator.Adjudicator._add_gt_conf_percentile_and_filters_to_vcf_file(
            tmp_file, simulations, min_dp=2, min_gcp=2.5, min_frs=0.9)
        assert filecmp.cmp(tmp_file, expect_file, shallow=False)
    finally:
        os.unlink(tmp_file)
Example #3
0
    def run_gt_conf(self):
        """Add GT_CONF_PERCENTILE and filters to the unfiltered and final VCFs.

        When ``self.mean_depth`` is positive, a GenotypeConfidenceSimulator is
        built and run (passing ``self.sim_conf_scores_file`` as its
        ``conf_scores_file``); otherwise ``None`` is handed on in place of the
        simulations. Both VCF files are then processed in turn. Only in debug
        mode, and only for the final VCF, a confidence-scores file is requested
        and the GT_CONF histograms are plotted from it.
        """
        simulations = None
        if self.mean_depth > 0:
            simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
                self.mean_depth,
                self.variance_depth,
                self.read_error_rate,
                allele_length=1,
                iterations=self.genotype_simulation_iterations,
                call_hets=self.call_hets,
            )
            simulations.run_simulations(
                conf_scores_file=self.sim_conf_scores_file)

        message = (
            f"Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf} & its debug counterpart, "
            f"using mean depth {self.mean_depth}, variance depth {self.variance_depth}, error rate {self.read_error_rate}, "
            f"and {self.genotype_simulation_iterations} simulation iterations")
        logging.info(message)

        for vcf_path in [self.unfiltered_vcf_file, self.final_vcf]:
            # Real confidence scores are only collected for the final VCF,
            # and only when debugging.
            collect_scores = self.debug and vcf_path == self.final_vcf
            scores_file = self.real_conf_scores_file if collect_scores else None

            Adjudicator._add_gt_conf_percentile_and_filters_to_vcf_file(
                vcf_path,
                simulations,
                min_dp=self.filter_min_dp,
                min_gcp=self.filter_min_gcp,
                min_frs=self.filter_min_frs,
                conf_scores_file=scores_file,
            )

            if scores_file is None:
                continue
            Adjudicator._plot_gt_conf_hists(
                scores_file,
                self.sim_conf_scores_file,
                self.genotype_hist_pdf,
            )
Example #4
0
    def _add_gt_conf_percentile_to_vcf_file(cls, vcf_file, mean_depth,
                                            depth_variance, error_rate,
                                            iterations):
        '''Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added.

        Runs a GenotypeConfidenceSimulator built from mean_depth,
        depth_variance, error_rate and iterations (with allele_length=1), then
        rewrites vcf_file in place. A GT_CONF_PERCENTILE FORMAT header line is
        inserted straight after the first header line starting with
        "##FORMAT=<ID=GT_CONF". Every record that has GT_CONF and a GT
        containing no "." gets a GT_CONF_PERCENTILE value looked up from the
        simulations; all other records are written back unchanged.

        Raises:
            Exception: if the header has no line starting with
                "##FORMAT=<ID=GT_CONF".
        '''
        simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
            mean_depth,
            depth_variance,
            error_rate,
            allele_length=1,
            iterations=iterations)
        simulations.run_simulations()
        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
        # for/else: the else branch only runs when no header line matched,
        # otherwise i is left pointing at the GT_CONF description line.
        for i, line in enumerate(vcf_header):
            if line.startswith('##FORMAT=<ID=GT_CONF'):
                break
        else:
            raise Exception(
                f'No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue'
            )

        # The meta-information line must be terminated by ">"; without it the
        # header written to the file is not a well-formed FORMAT declaration.
        vcf_header.insert(
            i + 1,
            r'''##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">'''
        )

        with open(vcf_file, 'w') as f:
            print(*vcf_header, sep='\n', file=f)

            for vcf_record in vcf_lines:
                if 'GT_CONF' in vcf_record.FORMAT:
                    # The simulator is queried with an integer confidence.
                    conf = int(round(float(vcf_record.FORMAT['GT_CONF'])))
                    # Only annotate records whose genotype has no "." in it.
                    if 'GT' in vcf_record.FORMAT and '.' not in vcf_record.FORMAT[
                            'GT']:
                        vcf_record.set_format_key_value(
                            'GT_CONF_PERCENTILE',
                            str(simulations.get_percentile(conf)))

                print(vcf_record, file=f)
0
def test_run_simulations_and_get_percentile_allele_length_2():
    """test run_simulations and get_percentile"""
    simulator = genotype_confidence_simulator.GenotypeConfidenceSimulator(
        50, 300, 0.01, allele_length=2, iterations=5)
    simulator.run_simulations()
    expected_confidence_scores_percentiles = {
        193: 20.0,
        221: 40.0,
        271: 60.0,
        278: 80.0,
        303: 100.0
    }
    assert (simulator.confidence_scores_percentiles ==
            expected_confidence_scores_percentiles)
    assert simulator.get_percentile(193) == 20.00
    assert simulator.get_percentile(221) == 40.00
    # Try getting number that is not in the dict and will have to be inferred
    assert simulator.get_percentile(207) == 30.0
    # Try values outside the range of what we already have.
    # Each of these must be an assert: a bare comparison is evaluated and
    # then discarded, so it can never fail the test.
    assert simulator.get_percentile(192) == 0.00
    assert simulator.get_percentile(191) == 0.00
    assert simulator.get_percentile(304) == 100.00
    assert simulator.get_percentile(305) == 100.00
Example #6
0
    def _add_gt_conf_percentile_and_filters_to_vcf_file(
            cls,
            vcf_file,
            mean_depth,
            depth_variance,
            error_rate,
            iterations,
            min_dp=2,
            min_gt_conf_percentile=2.5):
        '''Rewrite vcf_file in place with GT_CONF_PERCENTILE and filters added.

        A GenotypeConfidenceSimulator (allele_length=1) is run first and used
        to turn each record's GT_CONF into a percentile. Three header lines
        are inserted directly after the first line that starts with
        "##FORMAT=<ID=GT_CONF". Every record has its FILTER reset to an empty
        set; a record with GT_CONF and a GT containing no "." then gets
        GT_CONF_PERCENTILE, plus MIN_DP if its DP is below min_dp, MIN_GCP if
        its percentile is below min_gt_conf_percentile, or PASS if neither
        filter applied.

        Raises:
            Exception: if no header line starts with "##FORMAT=<ID=GT_CONF".
        '''
        simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
            mean_depth,
            depth_variance,
            error_rate,
            allele_length=1,
            iterations=iterations)
        simulations.run_simulations()
        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)

        # Index of the first GT_CONF description line, or None if absent.
        gt_conf_index = next(
            (index for index, header_line in enumerate(vcf_header)
             if header_line.startswith('##FORMAT=<ID=GT_CONF')),
            None,
        )
        if gt_conf_index is None:
            raise Exception(
                f'No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue'
            )

        # Each line goes in at the same index, so they end up in the header
        # in the reverse of the order listed here.
        insert_at = gt_conf_index + 1
        extra_header_lines = (
            r'''##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">''',
            f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">',
            f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gt_conf_percentile}">',
        )
        for extra_line in extra_header_lines:
            vcf_header.insert(insert_at, extra_line)

        with open(vcf_file, 'w') as out_handle:
            print(*vcf_header, sep='\n', file=out_handle)

            for record in vcf_lines:
                record.FILTER = set()

                if 'GT_CONF' in record.FORMAT:
                    conf = int(round(float(record.FORMAT['GT_CONF'])))
                    genotype_called = ('GT' in record.FORMAT
                                       and '.' not in record.FORMAT['GT'])
                    if genotype_called:
                        record.set_format_key_value(
                            'GT_CONF_PERCENTILE',
                            str(simulations.get_percentile(conf)))

                        depth_too_low = ('DP' in record.FORMAT and
                                         float(record.FORMAT['DP']) < min_dp)
                        if depth_too_low:
                            record.FILTER.add('MIN_DP')

                        # Read the percentile back from the record, as stored.
                        stored_percentile = float(
                            record.FORMAT['GT_CONF_PERCENTILE'])
                        if stored_percentile < min_gt_conf_percentile:
                            record.FILTER.add('MIN_GCP')

                        if not record.FILTER:
                            record.FILTER.add('PASS')

                print(record, file=out_handle)
Example #7
0
    def _add_gt_conf_percentile_and_filters_to_vcf_file(
        cls,
        vcf_file,
        mean_depth,
        depth_variance,
        error_rate,
        iterations,
        min_dp=5,
        min_gcp=5,
    ):
        """Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added,
        and filter for DP and GT_CONF_PERCENTILE.

        Simulations are only run when mean_depth is positive. Without
        simulations there is nothing to look percentiles up in, so called
        genotypes get the null percentile 0.0, the same default written for
        records whose GT contains ".".

        Three header lines (GT_CONF_PERCENTILE FORMAT, MIN_DP and MIN_GCP
        FILTER) are inserted directly after the first header line starting
        with "##FORMAT=<ID=GT_CONF". Every record has its FILTER reset to an
        empty set. Records with both GT and GT_CONF and no "." in GT get a
        percentile, MIN_DP if DP is below min_dp, MIN_GCP if the percentile
        is below min_gcp, or PASS if neither filter applied.

        Raises:
            Exception: if no header line starts with "##FORMAT=<ID=GT_CONF".
        """
        # Bind the name up front: otherwise a non-positive mean_depth leaves
        # it undefined and the first called genotype raises UnboundLocalError.
        simulations = None
        if mean_depth > 0:
            simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
                mean_depth,
                depth_variance,
                error_rate,
                allele_length=1,
                iterations=iterations,
            )
            simulations.run_simulations()
        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
        # for/else: the else branch only runs when no header line matched,
        # otherwise i is left pointing at the GT_CONF description line.
        for i, line in enumerate(vcf_header):
            if line.startswith("##FORMAT=<ID=GT_CONF"):
                break
        else:
            raise Exception(
                f"No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue"
            )

        # All three lines are inserted at the same index, so they appear in
        # the header in the reverse of the order they are inserted here.
        vcf_header.insert(
            i + 1,
            r"""##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">""",
        )
        vcf_header.insert(
            i + 1,
            f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">')
        vcf_header.insert(
            i + 1,
            f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gcp}">',
        )

        with open(vcf_file, "w") as f:
            print(*vcf_header, sep="\n", file=f)

            for vcf_record in vcf_lines:
                vcf_record.FILTER = set()

                if "GT" in vcf_record.FORMAT and "GT_CONF" in vcf_record.FORMAT:
                    if "." not in vcf_record.FORMAT["GT"]:
                        conf = int(round(float(vcf_record.FORMAT["GT_CONF"])))
                        if simulations is None:
                            # No simulations were run (mean_depth <= 0).
                            percentile = 0.0
                        else:
                            percentile = simulations.get_percentile(conf)
                        vcf_record.set_format_key_value(
                            "GT_CONF_PERCENTILE", str(percentile))
                        if ("DP" in vcf_record.FORMAT
                                and float(vcf_record.FORMAT["DP"]) < min_dp):
                            vcf_record.FILTER.add("MIN_DP")
                        if float(vcf_record.FORMAT["GT_CONF_PERCENTILE"]
                                 ) < min_gcp:
                            vcf_record.FILTER.add("MIN_GCP")
                        if not vcf_record.FILTER:
                            vcf_record.FILTER.add("PASS")
                    else:
                        # Add a default null percentile
                        vcf_record.set_format_key_value(
                            "GT_CONF_PERCENTILE", "0.0")

                print(vcf_record, file=f)