Esempi in Python per SamReader, esempi in Python per third_party.nucleus.io.sam.SamReader

Esempio n. 1

0

Mostra file

File: sam_test.py Progetto: PhilPalmer/deepvariant-1

 def test_roundtrip_writer(self, filename):
   """Checks that records written by SamWriter read back unchanged.

   Args:
     filename: name passed to test_utils.genomics_core_testdata to locate the
       input file; also used as the name of the temporary output file.
   """
   output_path = test_utils.test_tmpfile(filename)
   # Scope the input reader with a context manager so it is closed once its
   # header and records have been captured, instead of being left open.
   with sam.SamReader(
       test_utils.genomics_core_testdata(filename)) as original_reader:
     header = original_reader.header
     original_records = list(original_reader.iterate())
   with sam.SamWriter(output_path, header=header) as writer:
     for record in original_records:
       writer.write(record)
   with sam.SamReader(output_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))

Esempio n. 2

0

Mostra file

File: sam_test.py Progetto: zorrodong/deepvariant

 def _make_reader(self, filename, has_embedded_ref):
     """Builds a SamReader for a testdata file.

     Args:
       filename: name passed to test_utils.genomics_core_testdata.
       has_embedded_ref: bool. When false, ref_path is set to the testdata
         'test.fasta'; when true, no ref_path is passed.

     Returns:
       The newly constructed sam.SamReader.
     """
     reads_path = test_utils.genomics_core_testdata(filename)
     if not has_embedded_ref:
         # Explicitly override the reference encoded in the UR of the CRAM
         # file so that the path to our test.fasta is used instead.
         fasta_path = test_utils.genomics_core_testdata('test.fasta')
         return sam.SamReader(reads_path, ref_path=fasta_path)
     # With an embedded reference, force the reader to use it by not
     # providing an argument for ref_path.
     return sam.SamReader(reads_path)

Esempio n. 3

0

Mostra file

File: make_examples.py Progetto: zuxfoucault/deepvariant

 def _make_sam_reader(self):
   """Returns a SamReader configured from self.options and FLAGS."""
   opts = self.options
   reader_kwargs = dict(
       read_requirements=opts.read_requirements,
       hts_block_size=FLAGS.hts_block_size,
       downsample_fraction=opts.downsample_fraction,
       random_seed=opts.random_seed)
   return sam.SamReader(opts.reads_filename, **reader_kwargs)

Esempio n. 4

0

Mostra file

    def test_call_from_allele_counter(self):
        """Checks that calls can be produced from a populated AlleleCounter."""
        ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
        size = 1000
        region = ranges.make_range('chr20', 10000000, 10000000 + size)
        allele_counter = _allelecounter.AlleleCounter(
            ref.c_reader, region,
            deepvariant_pb2.AlleleCounterOptions(partition_size=size))
        caller = variant_calling.VariantCaller(
            deepvariant_pb2.VariantCallerOptions(min_count_snps=2,
                                                 min_count_indels=2,
                                                 min_fraction_snps=0.12,
                                                 min_fraction_indels=0.12,
                                                 sample_name='sample_name',
                                                 p_error=0.001,
                                                 max_gq=50,
                                                 gq_resolution=1,
                                                 ploidy=2))

        # Grab all of the reads in our region. The reader is only needed for
        # this one query, so scope it with a context manager to make sure it
        # is closed instead of being left open for the rest of the test.
        with sam.SamReader(testdata.CHR20_BAM) as sam_reader:
            reads = list(sam_reader.query(region))
        self.assertNotEmpty(reads)

        # Add every read to the allele_counter.
        for read in reads:
            allele_counter.add(read)

        # Get the candidate records for this whole region.
        candidates = caller.calls_from_allele_counter(allele_counter)

        # We should have at least some candidates.
        self.assertNotEmpty(candidates)

        # Each candidate should be a DeepVariantCall.
        for candidate in candidates:
            self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)

Esempio n. 5

0

Mostra file

File: realigner_test.py Progetto: palc/deepvariant

  def test_realigner_end2end(self):
    """Realigns reads over 1000-base partitions and sanity-checks the output."""
    reference = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    realigner_config = realigner.realigner_config(FLAGS)
    reads_realigner = realigner.Realigner(realigner_config, reference)
    total_windows = 0

    all_regions = ranges.RangeSet.from_regions(['chr20:10,000,000-10,009,999'])
    for partition in all_regions.partition(1000):
      requirements = reads_pb2.ReadRequirements()
      with sam.SamReader(
          testdata.CHR20_BAM, read_requirements=requirements) as sam_reader:
        reads_before = list(sam_reader.query(partition))
      windows, reads_after = reads_realigner.realign_reads(
          reads_before, partition)

      # Every read sent in must come back. Compare the read names rather than
      # only the number of reads.
      names_before = [read.fragment_name for read in reads_before]
      names_after = [read.fragment_name for read in reads_after]
      self.assertCountEqual(names_before, names_after)

      # Each window must be reasonable: the reference sequence is always
      # expected to be one of its haplotypes.
      for window in windows:
        window_ref_seq = reference.query(window.span)
        self.assertIn(window_ref_seq, set(window.haplotypes))
      total_windows += len(windows)

    self.assertGreater(total_windows, 0)

Esempio n. 6

0

Mostra file

File: generate_trained_model.py Progetto: cgpu/deepvariant-1

def generate_trained_model_runner(truth_variants, reads, ref,
                                  output_model_proto, output_model_pckl,
                                  exclude_contig, from_contig, random_seed,
                                  indel_weight):
    """Runner for generate_trained_model.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      output_model_proto: path to write the AlleleCountLinearModel proto.
      output_model_pckl: path to write the LogisticRegression pickle. If
        falsy, no pickle is written.
      exclude_contig: string identifier of a contig to exclude from training.
      from_contig: string identifier of the contig from which we sample
        baseline.
      random_seed: int used as random seed for reproducibility.
      indel_weight: float of the weight of indels relative to the rest in
        the training.
    """
    # The VCF and reads readers are only needed while the training data is
    # generated, so scope them with context managers to make sure they are
    # closed instead of being left open until the process exits.
    # NOTE(review): assumes generate_data returns fully materialized data that
    # does not read from the readers lazily - confirm against generate_data.
    with vcf.VcfReader(truth_variants) as vcf_reader:
        ref_reader = fasta.IndexedFastaReader(ref)
        with sam.SamReader(reads) as sam_reader:
            random.seed(random_seed)

            dataframe = generate_data(vcf_reader, ref_reader, sam_reader,
                                      from_contig, exclude_contig)

    model = train_model(dataframe, indel_weight=indel_weight)

    if output_model_pckl:
        joblib.dump(model, output_model_pckl)

    # The proto is always written, as text format.
    model_proto = model_to_proto(model)
    with tf.gfile.GFile(output_model_proto, 'w') as f:
        f.write(text_format.MessageToString(model_proto))

Esempio n. 7

0

Mostra file

File: sam_test.py Progetto: zyxue/deepvariant

 def test_sam_query(self):
   """Checks the number of reads returned by query() for two intervals."""
   reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
   literals_and_counts = (
       ('chr20:10,000,000-10,000,100', 106),
       ('chr20:10,000,000-10,000,000', 45),
   )
   # Parse every interval up front, before the reader is entered.
   expected = [(ranges.parse_literal(literal), count)
               for literal, count in literals_and_counts]
   with reader:
     for interval, n_expected in expected:
       with reader.query(interval) as iterable:
         n_found = test_utils.iterable_len(iterable)
         self.assertEqual(n_found, n_expected)

Esempio n. 8

0

Mostra file

File: sam_test.py Progetto: PhilPalmer/deepvariant-1

 def test_roundtrip_cram_writer(self, filename, has_embedded_ref):
   """Checks that CRAM records written by SamWriter read back unchanged.

   Args:
     filename: name passed to test_utils.genomics_core_testdata to locate the
       input file; also used as the name of the temporary output file.
     has_embedded_ref: bool. When true, the readers get an empty ref_path and
       the writer is asked to embed the reference; otherwise the readers use
       the same test.fasta path as the writer.
   """
   output_path = test_utils.test_tmpfile(filename)
   writer_ref_path = test_utils.genomics_core_testdata('test.fasta')
   reader_ref_path = '' if has_embedded_ref else writer_ref_path
   # Scope the input reader with a context manager so it is closed once its
   # header and records have been captured, instead of being left open.
   with sam.SamReader(
       test_utils.genomics_core_testdata(filename),
       ref_path=reader_ref_path) as original_reader:
     header = original_reader.header
     original_records = list(original_reader.iterate())
   with sam.SamWriter(
       output_path,
       header=header,
       ref_path=writer_ref_path,
       embed_ref=has_embedded_ref) as writer:
     for record in original_records:
       writer.write(record)
   with sam.SamReader(output_path, ref_path=reader_ref_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))

Esempio n. 9

0

Mostra file

File: sam_test.py Progetto: zorrodong/deepvariant

 def _parse_read_with_aux_tags(self, tag_string):
     """Writes a single-read SAM text with the given tags and parses it back.

     Args:
       tag_string: text appended after the fixed fields of the stock read.

     Returns:
       A list of the records yielded by iterate() on a SamReader opened with
       parse_aux_fields=True.
     """
     # Minimal header needed to create a valid SAM file.
     sam_header = '@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n'
     # One stock read; the AUX fields under test are appended to it.
     sam_record = ('read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' +
                   tag_string)
     tmp_path = test_utils.test_tmpfile('aux_tags.bam')
     with gfile.Open(tmp_path, 'w') as fout:
         fout.write(sam_header)
         fout.write(sam_record + '\n')
     with sam.SamReader(tmp_path, parse_aux_fields=True) as reader:
         parsed_reads = list(reader.iterate())
     return parsed_reads

Esempio n. 10

0

Mostra file

File: debruijn_graph_wrap_test.py Progetto: cgpu/deepvariant-1

    def test_straightforward_region(self):
        """Checks that the k=30 graph yields only the reference haplotype."""
        ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
        region = ranges.parse_literal('chr20:10,000,000-10,000,100')
        ref_seq = ref_reader.query(region)

        # The BAM reader is only needed for this one query, so scope it with a
        # context manager to make sure it is closed.
        with sam.SamReader(testdata.CHR20_BAM) as bam_reader:
            all_reads = list(bam_reader.query(region))

        dbg30 = debruijn_graph.build(ref_seq, all_reads,
                                     self.single_k_dbg_options(30))
        self.assertIsNotNone(dbg30)
        self.assertEqual([ref_seq], dbg30.candidate_haplotypes())

Esempio n. 11

0

Mostra file

File: sam_test.py Progetto: zorrodong/deepvariant

 def test_bam_iterate_partially(self):
     """Verify that iteration provides results incrementally, not all at once."""
     reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
     # 106 records are expected in total: ten full slices of 10 records
     # followed by a final, partial slice of 6.
     expected_slice_sizes = [10] * 10 + [6]
     with reader:
         iterable = reader.iterate()
         for expected_size in expected_slice_sizes:
             chunk = list(itertools.islice(iterable, 10))
             self.assertEqual(len(chunk), expected_size)

Esempio n. 12

0

Mostra file

def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      input_model_pckl: path to read the LogisticRegression pickle from.
      eval_region: str, region to evaluate on in the 'chr:start-end',
        'chr:position' or 'chr' format.
      output_report_csv: path to the output report csv.

    Raises:
      ValueError: if eval_region cannot be parsed.
    """
    # Scope the reads reader with a context manager so it is closed once the
    # metrics have been computed, instead of being left open.
    with sam.SamReader(reads) as sam_reader:
        ref_reader = fasta.IndexedFastaReader(ref)

        read_reqs = reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))
        allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=1, read_requirements=read_reqs)

        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; input_model_pckl must come from a trusted source.
        model = joblib.load(input_model_pckl)

        with vcf.VcfReader(truth_variants) as vcf_reader:
            region = ranges.parse_literal(eval_region,
                                          contig_map=ranges.contigs_dict(
                                              ref_reader.header.contigs))
            # Only the indels among the truth variants in the region are kept.
            true_indels = [
                var for var in vcf_reader.query(region)
                if variant_utils.is_indel(var)
            ]

        precisions = compute_precision(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS, region)
        recalls = compute_effective_recall(model, true_indels, sam_reader,
                                           ref_reader, allele_counter_options,
                                           _THRESHOLDS)

    # One CSV row per threshold, with the precision and recall at it.
    with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
        fieldnames = ['threshold', 'precision', 'recall']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for threshold in _THRESHOLDS:
            writer.writerow({
                'threshold': threshold,
                'precision': precisions[threshold],
                'recall': recalls[threshold]
            })

Esempio n. 13

0

Mostra file

File: debruijn_graph_wrap_test.py Progetto: cgpu/deepvariant-1

 def test_complex_region(self):
     """Checks the graph built over a region with a heterozygous deletion."""
     # There is a heterozygous 9 bp deletion of tandem TGA repeat.
     # "chr20:10,095,379-10,095,500"
     ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
     region = ranges.parse_literal('chr20:10,095,379-10,095,500')
     ref_seq = ref_reader.query(region)
     # The BAM reader is only needed for this one query, so scope it with a
     # context manager to make sure it is closed.
     with sam.SamReader(testdata.CHR20_BAM) as bam_reader:
         reads = list(bam_reader.query(region))
     dbg = debruijn_graph.build(ref_seq, reads, self.dbg_options())
     self.assertIsNotNone(dbg)
     self.assertEqual(44, dbg.kmer_size)
     # Compute the haplotypes once and reuse them for both assertions.
     haplotypes = dbg.candidate_haplotypes()
     self.assertEqual(2, len(haplotypes))
     self.assertIn(ref_seq, haplotypes)

Esempio n. 14

0

Mostra file

File: sam_test.py Progetto: zyxue/deepvariant

 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   """Checks the read count produced by a downsampling reader.

   Args:
     method: 'iterate' or 'query'; any other value fails the test.
     maybe_range: range literal parsed and queried when method is 'query'.
     fraction: value passed as downsample_fraction to the reader.
     expected_n_reads: number of reads the chosen method should yield.
   """
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = sam.SamReader(
       bam_path, downsample_fraction=fraction, random_seed=12345)
   with reader:
     if method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     elif method == 'iterate':
       reads_iter = reader.iterate()
     else:
       self.fail('Unexpected method ' + str(method))
     n_reads = test_utils.iterable_len(reads_iter)
     self.assertEqual(n_reads, expected_n_reads)

Esempio n. 15

0

Mostra file

 def test_wrap(self):
   """Checks that an AlleleCounter fed with reads yields one count per base."""
   ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
   size = 100
   region = ranges.make_range('chr20', 10000000, 10000000 + size)
   options = deepvariant_pb2.AlleleCounterOptions(partition_size=size)
   allele_counter = _allelecounter.AlleleCounter(ref.c_reader, region, options)
   # The reads reader is only needed for this one query, so scope it with a
   # context manager to make sure it is closed.
   with sam.SamReader(testdata.CHR20_BAM) as sam_reader:
     reads = list(sam_reader.query(region))
   self.assertGreater(len(reads), 0)
   for read in reads:
     allele_counter.add(read)
   counts = allele_counter.counts()
   self.assertEqual(len(counts), size)

Esempio n. 16

0

Mostra file

File: make_examples.py Progetto: zyxue/deepvariant

def processing_regions_from_options(options):
    """Computes the calling regions from our options.

    This function reads the headers of our input files and the region
    specifications to determine the regions we should generate examples over.

    Args:
      options: deepvariant.DeepVariantOptions proto containing information
        about our input data sources.

    Raises:
      ValueError: if the regions to call is empty.

    Returns:
      The regions we should process, as produced by regions_to_process.
      NOTE(review): earlier documentation described this as a list of
      nucleus.genomics.v1.Range protos plus a second value holding confident
      regions; only the single regions value is returned here.
    """
    ref_contigs = fasta.RefFastaReader(
        options.reference_filename).header.contigs
    # The readers below are opened only to obtain the contigs from their
    # headers, so scope them with context managers to make sure they are
    # closed instead of being left open.
    with sam.SamReader(options.reads_filename) as sam_reader:
        sam_contigs = sam_reader.header.contigs

    # Add in vcf_contigs if in training mode.
    vcf_contigs = None
    if in_training_mode(options):
        with vcf.VcfReader(options.truth_variants_filename) as vcf_reader:
            vcf_contigs = vcf_reader.header.contigs

    contigs = _ensure_consistent_contigs(ref_contigs, sam_contigs, vcf_contigs,
                                         options.exclude_contigs,
                                         options.min_shared_contigs_basepairs)
    logging.info('Common contigs are %s', [c.name for c in contigs])
    calling_regions = build_calling_regions(ref_contigs,
                                            options.calling_regions,
                                            options.exclude_calling_regions)
    if not calling_regions:
        raise ValueError(
            'The regions to call is empty. Check your --regions and '
            '--exclude_regions flags to make sure they are not '
            'resulting in set of empty region to process. This also '
            'happens if you use "chr20" for a BAM where contig names '
            'don\'t have "chr"s (or vice versa).')
    regions = regions_to_process(
        contigs=contigs,
        partition_size=options.allele_counter_options.partition_size,
        calling_regions=calling_regions,
        task_id=options.task_id,
        num_shards=options.num_shards)

    return regions

Esempio n. 17

0

Mostra file

File: genomics_io_noplugin_test.py Progetto: zuxfoucault/deepvariant

 def test_tfbam_plugin_does_not_load(self):
     """Checks that constructing the reader raises the expected ImportError."""
     expected_message = (
         'tfbam_lib module not found, cannot read .tfbam files.')
     with self.assertRaisesRegexp(ImportError, expected_message):
         # Only construction is under test; the reader itself is not used.
         sam.SamReader('*****@*****.**', use_index=True)

Esempio n. 18

0

Mostra file

File: make_examples.py Progetto: zuxfoucault/deepvariant

def default_options(add_flags=True, flags_obj=None):
  """Creates a DeepVariantOptions proto populated with reasonable defaults.

  Note that some flag values (partition_size, sample_name, reads, the vsc_*
  thresholds and gvcf_gq_binsize) are read from the flags object even when
  add_flags is False; add_flags only controls the fields set in the final
  `if add_flags:` section.

  Args:
    add_flags: bool. defaults to True. If True, we will push the value of
      certain FLAGS into our options. If False, those option fields are left
      uninitialized.
    flags_obj: object.  If not None, use as the source of flags,
      else use global FLAGS.

  Returns:
    deepvariant_pb2.DeepVariantOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  # NOTE(review): this is a truthiness test, so any falsy flags_obj (not only
  # None) is replaced by the global FLAGS.
  if not flags_obj:
    flags_obj = FLAGS

  # The same read requirements are shared by the pileup image options, the
  # allele counter options and the top-level options below.
  read_reqs = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

  pic_options = pileup_image.default_options(read_requirements=read_reqs)

  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags_obj.partition_size, read_requirements=read_reqs)

  # Sample name precedence: the explicit flag, then a name extracted from the
  # reads file, then the _UNKNOWN_SAMPLE placeholder.
  if flags_obj.sample_name:
    sample_name = flags_obj.sample_name
  elif flags_obj.reads:
    with sam.SamReader(flags_obj.reads) as sam_reader:
      sample_name = extract_sample_name_from_sam_reader(sam_reader)
  else:
    sample_name = _UNKNOWN_SAMPLE

  variant_caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=flags_obj.vsc_min_count_snps,
      min_count_indels=flags_obj.vsc_min_count_indels,
      min_fraction_snps=flags_obj.vsc_min_fraction_snps,
      min_fraction_indels=flags_obj.vsc_min_fraction_indels,
      # Not specified by default: fraction_reference_sites_to_emit,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=1400605801,
      sample_name=sample_name,
      p_error=0.001,
      max_gq=50,
      gq_resolution=flags_obj.gvcf_gq_binsize,
      ploidy=2)

  # Base options; the flag-dependent fields are filled in below only when
  # add_flags is True.
  options = deepvariant_pb2.DeepVariantOptions(
      exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=read_reqs,
      allele_counter_options=allele_counter_options,
      variant_caller_options=variant_caller_options,
      pic_options=pic_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
  )

  if add_flags:
    # Enum-valued flags are upper-cased before being parsed into proto enums.
    options.mode = parse_proto_enum_flag(
        deepvariant_pb2.DeepVariantOptions.Mode, flags_obj.mode.upper())

    options.labeler_algorithm = parse_proto_enum_flag(
        deepvariant_pb2.DeepVariantOptions.LabelerAlgorithm,
        flags_obj.labeler_algorithm.upper())

    # Input file names are copied only when the corresponding flag is set.
    if flags_obj.ref:
      options.reference_filename = flags_obj.ref
    if flags_obj.reads:
      options.reads_filename = flags_obj.reads
    if flags_obj.confident_regions:
      options.confident_regions_filename = flags_obj.confident_regions
    if flags_obj.truth_variants:
      options.truth_variants_filename = flags_obj.truth_variants

    # NO_DOWNSAMPLING is the sentinel flag value for "leave the field unset".
    if flags_obj.downsample_fraction != NO_DOWNSAMPLING:
      options.downsample_fraction = flags_obj.downsample_fraction

    # A multi_allelic_mode value outside the two keys below raises KeyError
    # from the dictionary lookup.
    if flags_obj.multi_allelic_mode:
      multi_allelic_enum = {
          'include_het_alt_images':
              deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
          'exclude_het_alt_images':
              deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
      }[flags_obj.multi_allelic_mode]
      options.pic_options.multi_allelic_mode = multi_allelic_enum

    if flags_obj.pileup_image_height:
      options.pic_options.height = flags_obj.pileup_image_height
    if flags_obj.pileup_image_width:
      options.pic_options.width = flags_obj.pileup_image_width

    # Unset output flags are passed to resolve_filespecs as empty strings.
    num_shards, examples, candidates, gvcf = io_utils.resolve_filespecs(
        flags_obj.task, flags_obj.examples or '', flags_obj.candidates or '',
        flags_obj.gvcf or '')
    options.examples_filename = examples
    options.candidates_filename = candidates
    options.gvcf_filename = gvcf

    options.calling_regions.extend(parse_regions_flag(flags_obj.regions))
    options.exclude_calling_regions.extend(
        parse_regions_flag(flags_obj.exclude_regions))

    options.task_id = flags_obj.task
    # resolve_filespecs may report no shard count; store that as 0.
    options.num_shards = 0 if num_shards is None else num_shards

    options.realigner_enabled = flags_obj.realign_reads
    if options.realigner_enabled:
      options.realigner_options.CopyFrom(realigner.realigner_config(flags_obj))

    options.max_reads_per_partition = flags_obj.max_reads_per_partition

    # Random reference-site emission is applied only in TRAINING mode and only
    # when the flag differs from the NO_RANDOM_REF sentinel.
    if (options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING and
        flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
      options.variant_caller_options.fraction_reference_sites_to_emit = (
          flags_obj.training_random_emit_ref_sites)

  return options

Esempio n. 19

0

Mostra file

File: sam_test.py Progetto: zorrodong/deepvariant

 def test_sam_iterate(self):
     """Checks that iterating over test.sam yields 6 records."""
     sam_path = test_utils.genomics_core_testdata('test.sam')
     with sam.SamReader(sam_path) as reader:
         n_records = test_utils.iterable_len(reader.iterate())
         self.assertEqual(n_records, 6)

Esempio n. 20

0

Mostra file

 def test_tfbam_plugin_loads(self):
     """Checks that constructing a SamReader for this path yields a reader."""
     self.assertIsNotNone(sam.SamReader('*****@*****.**'))

Esempio n. 21

0

Mostra file

File: realigner_test.py Progetto: palc/deepvariant

def _get_reads(region):
  """Returns a list of the reads from testdata.CHR20_BAM that query(region) yields."""
  with sam.SamReader(testdata.CHR20_BAM) as in_sam_reader:
    reads_in_region = list(in_sam_reader.query(region))
  return reads_in_region

Esempio n. 22

0

Mostra file

 def test_bam_iterate(self):
     """Checks that iterating test.bam with use_index=False yields 106 records."""
     bam_path = test_utils.genomics_core_testdata('test.bam')
     with sam.SamReader(bam_path, use_index=False) as reader:
         n_records = test_utils.iterable_len(reader.iterate())
         self.assertEqual(n_records, 106)