Esempio n. 1
0
def validate_reference_contig_coverage(ref_contigs, shared_contigs,
                                       min_coverage_fraction):
  """Validates that shared_contigs spans a sufficient amount of ref_contigs.

  The covered fraction is the number of bases in shared_contigs divided by the
  number of bases in ref_contigs. A reference that spans zero bases is treated
  as 0% covered rather than triggering a division by zero.

  Args:
    ref_contigs: List of ContigInfo protos. All of the contigs from our
      reference genome.
    shared_contigs: The subset of ref_contigs that we found in common with
      ref_contigs and all other genomics data sources.
    min_coverage_fraction: The minimum fraction of basepairs of ref_contigs that
      should be found among the shared_contigs.

  Raises:
    ValueError: If shared_contigs is empty or the fraction of covered bases is
      less than min_coverage_fraction.
  """

  def format_contig_matches():
    """Returns a per-contig summary of size and matched/missing status."""
    common_map = ranges.contigs_dict(shared_contigs)
    pieces = []
    for ref_contig in ref_contigs:
      status = 'matched' if ref_contig.name in common_map else 'IS MISSING'
      pieces.append('"{}" is {} bp and {}'.format(ref_contig.name,
                                                  ref_contig.n_bases, status))
    return ', '.join(pieces)

  ref_bp = ranges.contigs_n_bases(ref_contigs)
  common_bp = ranges.contigs_n_bases(shared_contigs)
  # Guard the division: with no reference bases there is nothing to cover, so
  # report 0% instead of letting a ZeroDivisionError escape. float() keeps the
  # division a true division regardless of the interpreter's integer semantics.
  coverage = common_bp / float(ref_bp) if ref_bp else 0.0
  if not shared_contigs or coverage < min_coverage_fraction:
    raise ValueError('Reference contigs span {} bases but only {} bases '
                     '({:.2%}) were found in common among our input files. '
                     'Check that the sources were created on a common genome '
                     'reference build. Contig matches were: {}'.format(
                         ref_bp, common_bp, coverage, format_contig_matches()))
Esempio n. 2
0
 def test_contigs_n_bases(self):
   """Checks ranges.contigs_n_bases totals for several lists of contigs."""
   contig_c = reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0)
   contig_a = reference_pb2.ContigInfo(name='a', n_bases=50, pos_in_fasta=1)
   contig_b = reference_pb2.ContigInfo(name='b', n_bases=25, pos_in_fasta=2)
   # Each entry pairs the expected base total with the contigs passed in;
   # single contigs first, then combinations.
   expectations = [
       (100, [contig_c]),
       (50, [contig_a]),
       (25, [contig_b]),
       (150, [contig_c, contig_a]),
       (125, [contig_c, contig_b]),
       (175, [contig_c, contig_a, contig_b]),
   ]
   for expected_total, contigs in expectations:
     self.assertEqual(expected_total, ranges.contigs_n_bases(contigs))