def test_make_read(self):
    """Verifies each argument given to make_read shows up on the read.

    The CIGAR string '3M' is expected to come back as a single
    ALIGNMENT_MATCH unit of length 3.
    """
    sequence = 'ACG'
    expected = dict(
        quals=[30, 40, 50],
        cigar='3M',
        mapq=42,
        chrom='chr10',
        start=123,
        name='myname',
    )
    read = test_utils.make_read(sequence, **expected)

    self.assertEqual(read.aligned_sequence, sequence)
    self.assertEqual(read.aligned_quality, expected['quals'])

    alignment = read.alignment
    match_of_three = cigar_pb2.CigarUnit(
        operation=cigar_pb2.CigarUnit.ALIGNMENT_MATCH, operation_length=3)
    self.assertEqual(list(alignment.cigar), [match_of_three])
    self.assertEqual(alignment.mapping_quality, expected['mapq'])
    self.assertEqual(alignment.position.reference_name, expected['chrom'])
    self.assertEqual(alignment.position.position, expected['start'])
    self.assertEqual(read.fragment_name, expected['name'])
def _add_cigar_unit(self, cigar, cigar_op, cigar_len):
    """Adds `cigar_len` bases of `cigar_op` to the end of `cigar`, in place.

    Args:
      cigar: mutable sequence of CigarUnit-like objects; modified in place.
      cigar_op: operation value to record.
      cigar_len: number of bases for the operation. Values <= 0 are ignored.

    If the last unit already has the same operation, its length is increased
    instead of adding a new unit, so adjacent units never repeat an operation.
    """
    if cigar_len <= 0:
        # Nothing to record.
        return

    same_op_as_last = bool(cigar) and cigar[-1].operation == cigar_op
    if same_op_as_last:
        cigar[-1].operation_length += cigar_len
        return

    new_unit = cigar_pb2.CigarUnit(
        operation=cigar_op, operation_length=cigar_len)
    cigar.extend([new_unit])
def to_cigar_unit(source):
    """Creates a cigar_pb2 CigarUnit from source.

    This function attempts to convert source into a CigarUnit protobuf. If
    source is a string, it must be a single CIGAR string specification like
    '12M'. If source is a tuple or a list, it must have exactly two elements
    (operation_length, opstr). operation_length can be a string or int, and
    must be >= 1. opstr should be a single character CIGAR specification
    (e.g., 'M'). If source is already a CigarUnit, it is just passed through
    unmodified.

    Args:
      source: many types allowed. The object we want to convert to a CigarUnit
        proto.

    Returns:
      CigarUnit proto with operation_length and operation set to values from
        source.

    Raises:
      ValueError: if source cannot be converted or is malformed. This includes
        a length or operation of an unusable type (e.g. `(None, 'M')`), which
        would otherwise surface as a TypeError.
    """
    try:
        if isinstance(source, cigar_pb2.CigarUnit):
            return source
        elif isinstance(source, six.string_types):
            # Everything but the last character is the length, e.g. '12M'.
            length, op = source[:-1], source[-1]
        elif isinstance(source, (tuple, list)):
            # Wrong arity raises ValueError from the unpacking itself.
            length, op = source
        else:
            raise ValueError('Unexpected source', source)

        if isinstance(op, six.string_types):
            op = CHAR_TO_CIGAR_OPS[op]
        length = int(length)
        if length < 1:
            raise ValueError('Length must be >= 1', length)
        return cigar_pb2.CigarUnit(operation=op, operation_length=length)
    except (KeyError, IndexError, TypeError):
        # KeyError: unknown operation character. IndexError: empty string.
        # TypeError: length/operation of a type that cannot be converted.
        raise ValueError(
            'Failed to convert {} into a CigarUnit'.format(source))
from __future__ import absolute_import from __future__ import division from __future__ import print_function import itertools from absl.testing import absltest from absl.testing import parameterized from third_party.nucleus.protos import cigar_pb2 from third_party.nucleus.util import cigar _CIGAR_TUPLES_AND_CIGAR_UNITS = [ ((1, 'M'), cigar_pb2.CigarUnit( operation=cigar_pb2.CigarUnit.ALIGNMENT_MATCH, operation_length=1)), ((2, 'I'), cigar_pb2.CigarUnit( operation=cigar_pb2.CigarUnit.INSERT, operation_length=2)), ((3, 'D'), cigar_pb2.CigarUnit( operation=cigar_pb2.CigarUnit.DELETE, operation_length=3)), ((4, 'N'), cigar_pb2.CigarUnit( operation=cigar_pb2.CigarUnit.SKIP, operation_length=4)), ((5, 'S'), cigar_pb2.CigarUnit( operation=cigar_pb2.CigarUnit.CLIP_SOFT, operation_length=5)), ((6, 'H'), cigar_pb2.CigarUnit( operation=cigar_pb2.CigarUnit.CLIP_HARD, operation_length=6)),
def trim_cigar(cigar, ref_trim, ref_length):
    """Trim a cigar string to a certain reference length.

    Args:
      cigar: list of `nucleus.protos.CigarUnit`s of the original read
        alignment.
      ref_trim: integer. Number of reference bases to trim off the beginning
        of the read.
      ref_length: integer. Number of reference bases to cover with the read,
        the middle part that is not trimmed from the start or end of the read.

    Returns:
      new_cigar: list of `nucleus.protos.CigarUnit`s of the final read
        alignment, after the left and/or right have been trimmed off. No unit
        is emitted for an operation that would contribute zero bases because
        the window is already fully covered.
      read_trim: The number of bases of the read that are trimmed off.
      new_read_length: The number of bases of the read that remain after
        trimming. This is different from the final number of reference bases;
        for example, an insertion makes the read longer without affecting the
        reference.
    """
    # First consume the ref until the trim is covered.
    trim_remaining = ref_trim
    # Then consume the ref until the ref_length is covered.
    ref_to_cover_remaining = ref_length
    read_trim = 0
    new_cigar = []
    new_read_length = 0

    for cigar_unit in cigar:
        c_operation_length = cigar_unit.operation_length
        # Each operation moves forward in the ref, the read, or both.
        advances_ref = cigar_unit.operation in cigar_utils.REF_ADVANCING_OPS
        advances_read = cigar_unit.operation in cigar_utils.READ_ADVANCING_OPS
        ref_step = c_operation_length if advances_ref else 0

        # First, use up each operation until the trimmed area is covered.
        if trim_remaining > 0:
            if ref_step <= trim_remaining:
                # Fully apply to the trim.
                trim_remaining -= ref_step
                read_trim += c_operation_length if advances_read else 0
                continue
            else:
                # Partially apply to finish the trim.
                ref_step -= trim_remaining
                read_trim += trim_remaining if advances_read else 0
                # If trim finishes here, the rest of the ref_step can apply
                # to the next stage and count towards covering the given ref
                # window.
                c_operation_length = ref_step
                trim_remaining = 0

        # Once the trim is done, start applying cigar entries to covering the
        # ref window.
        if trim_remaining == 0:
            if ref_step <= ref_to_cover_remaining:
                # Fully apply to the window.
                new_cigar.append(
                    cigar_pb2.CigarUnit(
                        operation=cigar_unit.operation,
                        operation_length=c_operation_length))
                ref_to_cover_remaining -= ref_step
                new_read_length += c_operation_length if advances_read else 0
            else:
                # Partially apply to finish the window. When the window is
                # already fully covered there is nothing left to apply, so do
                # not emit an empty (zero-length) unit; just stop.
                if ref_to_cover_remaining > 0:
                    c_operation_length = ref_to_cover_remaining
                    new_cigar.append(
                        cigar_pb2.CigarUnit(
                            operation=cigar_unit.operation,
                            operation_length=c_operation_length))
                    new_read_length += (
                        c_operation_length if advances_read else 0)
                    ref_to_cover_remaining = 0
                break

    return new_cigar, read_trim, new_read_length