def _merge_records(vcf, cpx_records, cpx_record_ids):
    """
    Merge VCF records and complex-event records into one ordered stream.

    This is a single merge pass; it does not sort. Both inputs are expected
    to be ordered by chromosome and position already.

    Parameters
    ----------
    vcf : iterator of pysam.VariantRecord
        Advanced with next(), so it must be an iterator.
    cpx_records : deque-like of pysam.VariantRecord
        Must support popleft(); records are removed as they are emitted.
    cpx_record_ids : container of str
        IDs of VCF records that were included in a complex event. Records
        with these IDs are dropped from the output.

    Yields
    ------
    pysam.VariantRecord
        Records from both inputs. When a VCF record and a complex record
        share chromosome and position, the VCF record is emitted first.
        Nothing is yielded when both inputs are empty.
    """

    def _next_record():
        # Skip VCF records that were included in a complex event so the
        # merge below only ever sees records that must be emitted.
        record = next(vcf, None)
        while record is not None and record.id in cpx_record_ids:
            record = next(vcf, None)
        return record

    def _next_cpx():
        try:
            return cpx_records.popleft()
        except IndexError:
            return None

    # Initialize merge
    curr_record = _next_record()
    curr_cpx = _next_cpx()

    while curr_record is not None and curr_cpx is not None:
        if curr_record.chrom == curr_cpx.chrom:
            if curr_record.pos <= curr_cpx.pos:
                yield curr_record
                curr_record = _next_record()
            else:
                yield curr_cpx
                curr_cpx = _next_cpx()
        elif svu.is_smaller_chrom(curr_record.chrom, curr_cpx.chrom):
            yield curr_record
            curr_record = _next_record()
        else:
            yield curr_cpx
            curr_cpx = _next_cpx()

    # At least one input is exhausted. Each side is drained under its own
    # guard so that no None placeholder is emitted when both are exhausted
    # (e.g. when both inputs were empty to begin with).
    while curr_record is not None:
        yield curr_record
        curr_record = _next_record()

    while curr_cpx is not None:
        yield curr_cpx
        curr_cpx = _next_cpx()
def _merge_records(vcf, cpx_records, cpx_record_ids):
    """
    Interleave VCF records with complex-event records in a single pass.

    vcf : iterator of pysam.VariantRecord (advanced with next())
    cpx_records : deque-like of pysam.VariantRecord (consumed via popleft())
    cpx_record_ids : IDs of VCF records already represented by a complex
        event; those VCF records are never yielded

    Yields records from both inputs. Within a chromosome the record with
    the smaller position goes first, with the VCF record winning ties;
    across chromosomes the order is decided by svu.is_smaller_chrom.
    """

    def _pull_vcf():
        # Return the next VCF record that is not part of a complex event,
        # or None once the VCF iterator is exhausted.
        while True:
            candidate = next(vcf, None)
            if candidate is None or candidate.id not in cpx_record_ids:
                return candidate

    def _pull_cpx():
        # Return the leftmost complex record, or None when none remain.
        try:
            head = cpx_records.popleft()
        except IndexError:
            head = None
        return head

    vcf_rec = _pull_vcf()
    cpx_rec = _pull_cpx()

    # Standard two-way merge while both sides still have a record
    while vcf_rec is not None and cpx_rec is not None:
        if vcf_rec.chrom == cpx_rec.chrom:
            vcf_first = vcf_rec.pos <= cpx_rec.pos
        else:
            vcf_first = svu.is_smaller_chrom(vcf_rec.chrom, cpx_rec.chrom)

        if vcf_first:
            yield vcf_rec
            vcf_rec = _pull_vcf()
        else:
            yield cpx_rec
            cpx_rec = _pull_cpx()

    # At most one of these loops runs: flush whatever side is left over
    while vcf_rec is not None:
        yield vcf_rec
        vcf_rec = _pull_vcf()

    while cpx_rec is not None:
        yield cpx_rec
        cpx_rec = _pull_cpx()
def standardize_info(self, std_rec, raw_rec):
    """
    Standardize Lumpy record.

    1) Copy SVTYPE
    2) Reduce STRANDS to the strand pair (drop the per-strand counts)
    3) Add CHR2, END
    4) Add SVLEN (-1 when CHROM and CHR2 differ)
    5) Add ALGORITHMS

    std_rec is modified in place and returned; raw_rec is only read.
    """
    std_rec.info['SVTYPE'] = raw_rec.info['SVTYPE']

    # Keep only the text before the first ':' of the first STRANDS entry
    first_strands_entry = raw_rec.info['STRANDS'][0]
    std_rec.info['STRANDS'] = first_strands_entry.split(':')[0]

    if std_rec.info['SVTYPE'] != 'BND':
        # Non-breakend calls take their second coordinate from the raw record
        chr2 = raw_rec.chrom
        end = raw_rec.stop
    else:
        # Breakends carry their mate coordinate in the ALT string
        chr2, end = parse_bnd_pos(std_rec.alts[0])
        if not is_smaller_chrom(std_rec.chrom, chr2):
            # Exchange the two breakends and flip the strand pair to match
            original_pos = std_rec.pos
            std_rec.pos = end
            end = original_pos

            original_chrom = std_rec.chrom
            std_rec.chrom = chr2
            chr2 = original_chrom

            std_rec.info['STRANDS'] = std_rec.info['STRANDS'][::-1]

    std_rec.info['CHR2'] = chr2
    std_rec.stop = end

    # SVLEN is only a distance when both ends share a chromosome
    same_chrom = std_rec.chrom == std_rec.info['CHR2']
    std_rec.info['SVLEN'] = end - std_rec.pos if same_chrom else -1

    std_rec.info['ALGORITHMS'] = ['smoove']

    return std_rec
def standardize_info(self, std_rec, raw_rec):
    """
    Standardize Delly record.

    1) Rename 'TRA' to 'BND'.
    2) Standardize ALT to VCF spec (symbolic '<SVTYPE>' allele, REF 'N').
    3) Rename 'CT' to 'STRANDS' and convert notation.
    4) Swap CHROM and CHR2 in translocations when needed.
    5) Add END and CHR2.
    6) Compute SVLEN (-1 when CHROM and CHR2 differ).
    7) Add ALGORITHMS.

    std_rec is modified in place and returned; raw_rec is only read.
    Raises Exception when the CT value cannot be converted.
    """
    raw_type = raw_rec.info['SVTYPE']
    svtype = 'BND' if raw_type == 'TRA' else raw_type
    std_rec.info['SVTYPE'] = svtype

    # stop is captured before ALT/REF are rewritten and written back after;
    # presumably rewriting them can change the record's stop - confirm
    # against pysam
    saved_stop = std_rec.stop
    if "<" not in std_rec.alts[0]:
        std_rec.alts = ('<' + svtype + '>', )
        std_rec.ref = "N"
        std_rec.stop = saved_stop

    # Delly connection type -> strand pair
    ct_to_strands = {
        '5to3': '-+',
        '3to3': '++',
        '5to5': '--',
        '3to5': '+-',
    }
    raw_strands = raw_rec.info['CT']
    if raw_strands in ct_to_strands:
        strands = ct_to_strands[raw_strands]
    elif raw_strands == 'NtoN' and svtype == 'INS':
        strands = '+-'
    else:
        msg = 'Improper strands ({0}) in record {1}'
        raise Exception(msg.format(raw_strands, raw_rec.id))
    std_rec.info['STRANDS'] = strands

    pos = raw_rec.pos
    end = raw_rec.stop
    if pos == 0:
        pos = 1

    if svtype != 'BND':
        chr2 = raw_rec.chrom
    else:
        chrom = raw_rec.chrom
        chr2 = raw_rec.info['CHR2']
        if not is_smaller_chrom(chrom, chr2):
            # Exchange the two breakends and flip the strand pair to match
            if end == 0:
                end = 1
            std_rec.pos = end
            end = pos
            std_rec.chrom = chr2
            chr2 = chrom
            std_rec.info['STRANDS'] = strands[::-1]

    # Add CHR2 and END
    std_rec.stop = end
    std_rec.info['CHR2'] = chr2

    # SVLEN is only a distance when both ends share a chromosome
    same_chrom = std_rec.chrom == std_rec.info['CHR2']
    std_rec.info['SVLEN'] = std_rec.stop - std_rec.pos if same_chrom else -1

    std_rec.info['ALGORITHMS'] = ['delly']

    return std_rec
def __lt__(self, other):
    """
    Order by chromosome (tup[0]) first, then by integer position (tup[1]).

    Chromosome order is delegated to is_smaller_chrom; positions are
    converted with int() before being compared.
    """
    own_chrom, other_chrom = self.tup[0], other.tup[0]
    if own_chrom != other_chrom:
        return is_smaller_chrom(own_chrom, other_chrom)
    return int(self.tup[1]) < int(other.tup[1])
def __lt__(self, other):
    """
    Order wrapped records by chromosome first, then by position.

    Chromosome order is delegated to svu.is_smaller_chrom; positions are
    compared directly when both records share a chromosome.
    """
    mine, theirs = self.record, other.record
    if mine.chrom != theirs.chrom:
        return svu.is_smaller_chrom(mine.chrom, theirs.chrom)
    return mine.pos < theirs.pos
def standardize_info(self, std_rec, raw_rec):
    """
    Standardize Manta record.

    1) Replace colons in ID with underscores (otherwise breaks VCF parsing)
    2) Define CHR2 and END
    3) Add strandedness
    4) Add SVLEN
    5) Add ALGORITHMS

    std_rec is modified in place and returned; raw_rec is only read.

    Raises ValueError when SVTYPE is not one of INV, BND, DEL, DUP or INS,
    since no strandedness is defined for other types.
    """
    # Colons in the ID can break parsing
    std_rec.id = std_rec.id.replace(':', '_')

    svtype = raw_rec.info['SVTYPE']
    std_rec.info['SVTYPE'] = svtype

    # Define CHR2 and END
    if svtype == 'BND':
        # Breakends carry their mate coordinate in the ALT string
        chr2, end = parse_bnd_pos(raw_rec.alts[0])
        chrom, pos = raw_rec.chrom, raw_rec.pos
        if not is_smaller_chrom(chrom, chr2):
            # Exchange the two breakends on the standardized record
            pos, end = end, pos
            chrom, chr2 = chr2, chrom
            std_rec.pos = pos
            std_rec.chrom = chrom
    elif svtype == 'INS':
        chr2 = raw_rec.chrom
        end = raw_rec.pos + 1
    else:
        chr2 = raw_rec.chrom
        end = raw_rec.stop

    std_rec.info['CHR2'] = chr2
    std_rec.stop = end

    # Strand parsing
    if svtype == 'INV':
        if 'INV3' in raw_rec.info.keys():
            strands = '++'
        else:
            strands = '--'
    elif svtype == 'BND':
        strands = parse_bnd_strands(raw_rec.alts[0])
    elif svtype == 'DEL':
        strands = '+-'
    elif svtype == 'DUP':
        strands = '-+'
    elif svtype == 'INS':
        strands = '+-'
    else:
        # Without this branch `strands` would be unbound below and the
        # failure would surface as an UnboundLocalError with no context.
        msg = 'Unsupported SVTYPE ({0}) in record {1}'
        raise ValueError(msg.format(svtype, raw_rec.id))

    if not is_smaller_chrom(std_rec.chrom, std_rec.info['CHR2']):
        strands = strands[::-1]
    std_rec.info['STRANDS'] = strands

    # SVLEN: -1 for inter-chromosomal breakends; insertions use the raw
    # SVLEN if present (else -1); everything else is END - POS
    if svtype == 'BND' and std_rec.chrom != std_rec.info['CHR2']:
        std_rec.info['SVLEN'] = -1
    elif svtype == 'INS':
        std_rec.info['SVLEN'] = raw_rec.info.get('SVLEN', -1)
    else:
        std_rec.info['SVLEN'] = std_rec.stop - std_rec.pos

    std_rec.info['ALGORITHMS'] = ['manta']

    return std_rec