def generate_positions(vcf_reader, ref_reader, baseline_contig):
    """Builds a balanced list of INDEL, SNP and baseline positions.

    Every INDEL read from `vcf_reader` is kept. At most the same number of
    SNPs is drawn at random from the same reader, and at most the same number
    of positions is drawn at random from `baseline_contig`.

    Args:
      vcf_reader: a nucleus.io.VcfReader.
      ref_reader: a nucleus.io.IndexedFastaReader.
      baseline_contig: contig from which to sample baseline positions.

    Returns:
      A sorted list of PositionWrapper.
    """
    # Materialize the reader once; it is scanned twice below.
    all_variants = list(vcf_reader)

    indel_wrappers = []
    for candidate in all_variants:
        if variant_utils.is_indel(candidate):
            indel_wrappers.append(
                PositionWrapper(candidate.reference_name, candidate.start,
                                _INDEL_LABEL))
    target_count = len(indel_wrappers)

    snp_pool = []
    for candidate in all_variants:
        if variant_utils.is_snp(candidate):
            snp_pool.append(candidate)
    # Cap the sample size so random.sample never asks for more than exists.
    n_snps = min(len(snp_pool), target_count)
    snp_wrappers = []
    for chosen in random.sample(snp_pool, n_snps):
        snp_wrappers.append(
            PositionWrapper(chosen.reference_name, chosen.start, _SNP_LABEL))

    contig_size = ref_reader.contig(baseline_contig).n_bases
    n_baseline = min(contig_size, target_count)
    # NOTE: Though unlikely, these random positions can end up on actual
    # variants.
    baseline_wrappers = []
    for offset in random.sample(xrange(contig_size), n_baseline):
        baseline_wrappers.append(
            PositionWrapper(baseline_contig, offset, _REF_LABEL))

    # Sorting the combined positions is meant to give better data locality.
    combined = indel_wrappers + snp_wrappers + baseline_wrappers
    combined.sort()
    return combined
Beispiel #2
0
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

    Loads the pickled model, collects the true INDELs of `eval_region` from
    the truth VCF, computes precision and recall at every threshold in
    `_THRESHOLDS`, and writes one CSV row per threshold.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      input_model_pckl: path to read the LogisticRegression pickle from.
      eval_region: str, region to evaluate on in the 'chr:start-end',
        'chr:position' or 'chr' format.
      output_report_csv: path to the output report csv.

    Raises:
      ValueError: if eval_region cannot be parsed.
    """
    # Open the reads and reference readers as context managers so they are
    # closed even when evaluation raises.
    with sam.SamReader(reads) as sam_reader, \
            fasta.IndexedFastaReader(ref) as ref_reader:
        read_reqs = reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))
        allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=1, read_requirements=read_reqs)

        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code. Only point this at trusted model files.
        model = joblib.load(input_model_pckl)

        with vcf.VcfReader(truth_variants) as vcf_reader:
            region = ranges.parse_literal(
                eval_region,
                contig_map=ranges.contigs_dict(ref_reader.header.contigs))
            true_indels = [
                var for var in vcf_reader.query(region)
                if variant_utils.is_indel(var)
            ]

        precisions = compute_precision(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS, region)
        recalls = compute_effective_recall(model, true_indels, sam_reader,
                                           ref_reader, allele_counter_options,
                                           _THRESHOLDS)

    # The readers are no longer needed once the metrics are computed.
    with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
        fieldnames = ['threshold', 'precision', 'recall']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for threshold in _THRESHOLDS:
            writer.writerow({
                'threshold': threshold,
                'precision': precisions[threshold],
                'recall': recalls[threshold]
            })
Beispiel #3
0
def encoded_variant_type(variant):
    """Classifies `variant` as an EncodedVariantType.

    The variant is tested with variant_utils.is_snp first and
    variant_utils.is_indel second; the first check that succeeds decides the
    result. For example, a variant with `reference_bases = "A"` and
    `alternative_bases = ["C"]` would be reported as EncodedVariantType.SNP.

    Args:
      variant: nucleus.Variant proto. The variant whose EncodedVariantType we
        want to get.

    Returns:
      EncodedVariantType.SNP, EncodedVariantType.INDEL, or
      EncodedVariantType.UNKNOWN when neither check matches.
    """
    # SNP takes precedence; the INDEL check only runs for non-SNP variants.
    if variant_utils.is_snp(variant):
        return EncodedVariantType.SNP
    return (EncodedVariantType.INDEL if variant_utils.is_indel(variant)
            else EncodedVariantType.UNKNOWN)
Beispiel #4
0
 def test_is_indel(self, variant, expected):
   """Checks that variant_utils.is_indel(variant) equals `expected`."""
   actual = variant_utils.is_indel(variant)
   self.assertEqual(actual, expected)
 def test_is_indel_symbolic_allele(self, variant, exclude_alleles, expected):
   """Checks is_indel when `exclude_alleles` is passed through explicitly."""
   actual = variant_utils.is_indel(variant, exclude_alleles=exclude_alleles)
   self.assertEqual(actual, expected)
 def test_is_indel(self, variant, expected):
   """Checks that variant_utils.is_indel(variant) equals `expected`."""
   # NOTE(review): a method with this same name is defined earlier in this
   # excerpt; if both live in one class, this definition shadows the earlier
   # one. Confirm they belong to different test classes.
   observed = variant_utils.is_indel(variant)
   self.assertEqual(observed, expected)