def as_indel(self, ref_fasta):
    """Format this adjacency as one tab-separated VCF indel record.

    For a deletion (``self.rearrangement == 'del'``) REF is the reference
    sequence fetched from ``breaks[0] - 1`` up to ``breaks[1] - 1`` and ALT
    is the single base fetched at ``breaks[0] - 1``. For every other
    rearrangement REF is the sequence fetched from ``breaks[0] - 1`` up to
    ``breaks[1]`` and ALT is REF followed by ``self.novel_seq``.

    Args:
        ref_fasta: object exposing ``fetch(chrom, start, end)`` that returns
            a string (presumably a pysam FastaFile; confirm with caller).

    Returns:
        str: CHROM, POS, ID, REF, ALT, QUAL, FILTER and INFO joined by tabs.
    """
    source_chrom = self.chroms[0]
    # Remove only a literal 'chr' prefix. str.lstrip('chr') strips any run of
    # the characters c/h/r and would turn e.g. 'hs37d5' into 's37d5'.
    chrom = source_chrom[3:] if source_chrom.startswith('chr') else source_chrom
    pos = self.breaks[0]
    start = self.breaks[0] - 1

    if self.rearrangement == 'del':
        ref = ref_fasta.fetch(source_chrom, start, self.breaks[1] - 1).upper()
        alt = ref_fasta.fetch(source_chrom, start, self.breaks[0]).upper()
    else:
        ref = ref_fasta.fetch(source_chrom, start, self.breaks[1]).upper()
        alt = ref + self.novel_seq.upper()

    info = {'BKPTID': ','.join(self.contigs)}

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # repeat contraction
    if self.rearrangement == 'del' and self.repeat_seq is not None:
        info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    # QUAL and FILTER are always reported as missing ('.').
    fields = [chrom, pos, self.id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def as_indel(self, ref_fasta):
    """Format this adjacency as one tab-separated VCF indel record.

    For a deletion (``self.rearrangement == 'del'``) REF is the reference
    sequence fetched from ``breaks[0] - 1`` up to ``breaks[1] - 1`` and ALT
    is the single base fetched at ``breaks[0] - 1``. For every other
    rearrangement REF is the sequence fetched from ``breaks[0] - 1`` up to
    ``breaks[1]`` and ALT is REF followed by ``self.novel_seq``.

    Args:
        ref_fasta: object exposing ``fetch(chrom, start, end)`` that returns
            a string (presumably a pysam FastaFile; confirm with caller).

    Returns:
        str: CHROM, POS, ID, REF, ALT, QUAL, FILTER and INFO joined by tabs.
    """
    source_chrom = self.chroms[0]
    # Remove only a literal 'chr' prefix. str.lstrip('chr') strips any run of
    # the characters c/h/r and would turn e.g. 'hs37d5' into 's37d5'.
    chrom = source_chrom[3:] if source_chrom.startswith('chr') else source_chrom
    pos = self.breaks[0]
    start = self.breaks[0] - 1

    if self.rearrangement == 'del':
        ref = ref_fasta.fetch(source_chrom, start, self.breaks[1] - 1).upper()
        alt = ref_fasta.fetch(source_chrom, start, self.breaks[0]).upper()
    else:
        ref = ref_fasta.fetch(source_chrom, start, self.breaks[1]).upper()
        alt = ref + self.novel_seq.upper()

    info = {'BKPTID': ','.join(self.contigs)}

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # repeat contraction
    if self.rearrangement == 'del' and self.repeat_seq is not None:
        info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    # QUAL and FILTER are always reported as missing ('.').
    fields = [chrom, pos, self.id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def as_sv(self, ref_fasta, id_ext=None, info_ext=None, chrom_ext=None, pos_ext=None):
    """Format this adjacency as one VCF record with a symbolic ALT allele.

    Supported values of ``self.rearrangement`` are 'del', 'dup', 'inv' and
    'ins'; they map to ``<DEL>``, ``<DUP:TANDEM>``, ``<INV>`` and ``<INS>``.

    Args:
        ref_fasta: object exposing ``fetch(chrom, start, end)`` that returns
            a string (presumably a pysam FastaFile; confirm with caller).
        id_ext: record ID to report instead of ``self.id``.
        info_ext: mapping of extra INFO entries; these override computed
            ones, except that ``SVLEN == 'NA'`` is ignored.
        chrom_ext: chromosome to report instead of ``self.chroms[0]``.
        pos_ext: position to report instead of ``self.breaks[0]``.

    Returns:
        str or None: the tab-separated record, or None when the
        rearrangement type has no symbolic allele.
    """
    symbolic_alleles = {
        'del': ('<DEL>', 'DEL'),
        'dup': ('<DUP:TANDEM>', 'DUP'),
        'inv': ('<INV>', 'INV'),
        'ins': ('<INS>', 'INS'),
    }
    # Unsupported event types yield no record. Bail out before sv_type is
    # needed instead of failing on an unbound local further down.
    if self.rearrangement not in symbolic_alleles:
        return None
    alt, sv_type = symbolic_alleles[self.rearrangement]

    chrom = self.chroms[0] if chrom_ext is None else chrom_ext
    # Remove only a literal 'chr' prefix. str.lstrip('chr') strips any run of
    # the characters c/h/r and would turn e.g. 'hs37d5' into 's37d5'.
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    pos = self.breaks[0] if pos_ext is None else pos_ext

    # NOTE(review): the REF base is always read at this adjacency's own first
    # breakpoint, even when chrom_ext/pos_ext override the reported locus.
    ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                          self.breaks[0]).upper()

    sv_len = self.get_size()
    has_len = isinstance(sv_len, int)
    end = pos + sv_len if has_len else None
    if sv_type == 'DEL':
        if has_len:
            # Deletions report a negative SVLEN; END is unaffected.
            sv_len = -sv_len
    elif sv_type == 'INS':
        end = pos

    info = {'SVTYPE': sv_type}
    if end is not None:
        info['END'] = end
    info['BKPTID'] = ','.join(self.contigs)
    if has_len:
        info['SVLEN'] = sv_len

    if sv_type == 'DUP':
        if self.repeat_seq is not None:
            info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']
        if self.support['flanking'] is not None:
            info['FLANKING_PAIRS'] = self.support['flanking']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # microhomology, cipos
    if self.homol_seq and self.homol_seq[0] != '-':
        homol_len = len(self.homol_seq[0])
        first_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (e.g. GMAP) get no confidence interval;
        # overlapping ones (e.g. BWA-mem) report the homology length.
        if first_breaks[0] >= first_breaks[1]:
            info['CIPOS'] = '0,%d' % homol_len
        info['HOMLEN'] = homol_len
        info['HOMSEQ'] = self.homol_seq[0].upper()

    # external info - overrides given info
    if info_ext:
        for key, value in info_ext.items():
            if key == 'SVLEN' and value == 'NA':
                continue
            info[key] = value

    record_id = self.id if id_ext is None else id_ext
    # QUAL and FILTER are always reported as missing ('.').
    fields = [chrom, pos, record_id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def as_breakends(self, ref_fasta, genomic=True, max_novel_seq_len=50, info_ext=None, parids=None, event=None):
    """Format this adjacency as a pair of mated VCF breakend records.

    Args:
        ref_fasta: object exposing ``fetch(chrom, start, end)`` that returns
            a string (presumably a pysam FastaFile; confirm with caller).
        genomic: SVTYPE is 'BND' when true, 'FND' otherwise.
        max_novel_seq_len: novel sequences longer than this are not inlined
            in ALT; the mate locus then refers to the first contig
            (``<contig>``) and its contig break coordinates.
        info_ext: mapping of INFO key to a two-item sequence, one value per
            breakend; entries whose value length is not 2 are ignored.
        parids: accepted for interface compatibility; not used here.
        event: accepted for interface compatibility; not used here.

    Returns:
        str: two tab-separated records joined by a newline.
    """
    # Remove only a literal 'chr' prefix. str.lstrip('chr') strips any run of
    # the characters c/h/r and would turn e.g. 'hs37d5' into 's37d5'.
    chroms = [c[3:] if c.startswith('chr') else c for c in self.chroms]
    alt_chroms = chroms[:]
    pos = list(self.breaks)
    alt_pos = pos[:]

    # inserted novel sequences
    inserted_seqs = ['', '']
    if self.novel_seq and self.novel_seq not in ('NA', '-'):
        if len(self.novel_seq) > max_novel_seq_len:
            alt_chroms[0] = alt_chroms[1] = '<%s>' % self.contigs[0]
            alt_pos[1] = self.contig_breaks[0][0] + 1
            alt_pos[0] = self.contig_breaks[0][1] - 1
        else:
            aligns = self.aligns[0]
            # With a single alignment both breakends follow its strand.
            second = aligns[0] if len(aligns) == 1 else aligns[1]
            for i, align in enumerate((aligns[0], second)):
                if align.strand == '+':
                    inserted_seqs[i] = self.novel_seq
                else:
                    inserted_seqs[i] = reverse_complement(self.novel_seq)

    # microhomology, cipos
    cipos = None
    homol_len = None
    homol_seq = None
    if self.homol_seq and self.homol_seq[0] != '-':
        homol_seq = self.homol_seq[0].upper()
        homol_len = len(self.homol_seq[0])
        first_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (e.g. GMAP) need no adjustment; overlapping
        # ones (e.g. BWA-mem) shift the positions by the homology length.
        if first_breaks[0] >= first_breaks[1]:
            pos[0] -= homol_len
            alt_pos[1] += homol_len
            cipos = '0,%d' % homol_len

    refs = (ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                            self.breaks[0]).upper(),
            ref_fasta.fetch(self.chroms[1], self.breaks[1] - 1,
                            self.breaks[1]).upper())

    ids = ('%s%s' % (self.id, 'a'), '%s%s' % (self.id, 'b'))
    svtype = 'BND' if genomic else 'FND'
    event_type = self.rearrangement.upper()
    infos = [{'SVTYPE': svtype, 'MATEID': mate_id, 'EVENTTYPE': event_type}
             for mate_id in (ids[1], ids[0])]

    # INFO entries that are identical for both breakends, in output order.
    shared = []
    if cipos is not None:
        shared.append(('CIPOS', cipos))
    if homol_len is not None:
        shared.append(('HOMLEN', homol_len))
    if homol_seq is not None:
        shared.append(('HOMSEQ', homol_seq))

    # read support
    if self.final_support is not None:
        shared.append(('SPANNING_READS', self.support['spanning']))
        if self.support['flanking'] is not None:
            shared.append(('FLANKING_PAIRS', self.support['flanking']))

    adj_size = self.get_size()
    if isinstance(adj_size, int):
        shared.append(('SVLEN', adj_size))

    # somatic
    if self.somatic:
        shared.append(('SOMATIC', 'SOMATIC'))

    # contig and contig breakpoints
    if self.contigs:
        shared.append(('BKPTID', ','.join(self.contigs)))
        if self.contig_breaks and len(self.contig_breaks) == len(self.contigs):
            break_labels = []
            for bk in self.contig_breaks:
                if len(bk) == 2:
                    break_labels.append('%s-%s' % (bk[0], bk[1]))
                else:
                    print('error')
            # Only report CTG_BKS when every contig has a well-formed pair.
            if len(break_labels) == len(self.contigs):
                shared.append(('CTG_BKS', ','.join(break_labels)))

    for info in infos:
        for key, value in shared:
            info[key] = value

    # external info - overrides given info
    if info_ext:
        for key, value in info_ext.items():
            if len(value) == 2:
                infos[0][key] = value[0]
                infos[1][key] = value[1]

    # Build the ALT strings. The bracket direction follows the mate's
    # orientation (']' for 'L', '[' otherwise); the REF base plus inserted
    # sequence precedes the mate locus when this breakend is 'L' and follows
    # it otherwise. The mate locus always pairs alt_chroms with alt_pos so a
    # contig coordinate is never attached to a genomic chromosome name.
    alts = []
    for i in range(2):
        mate = 1 - i
        bracket = ']' if self.orients[mate] == 'L' else '['
        locus = '%s%s:%s%s' % (bracket, alt_chroms[mate], alt_pos[mate],
                               bracket)
        if self.orients[i] == 'L':
            alts.append('%s%s%s' % (refs[i], inserted_seqs[i], locus))
        else:
            alts.append('%s%s%s' % (locus, inserted_seqs[i], refs[i]))

    # QUAL and FILTER are always reported as missing ('.').
    breakends = ['\t'.join([chroms[i], str(pos[i]), ids[i], refs[i], alts[i],
                            '.', '.', VCF.info_dict_to_str(infos[i])])
                 for i in range(2)]
    return '\n'.join(breakends)
def as_sv(self, ref_fasta, id_ext=None, info_ext=None, chrom_ext=None, pos_ext=None):
    """Format this adjacency as one VCF record with a symbolic ALT allele.

    Supported values of ``self.rearrangement`` are 'del', 'dup', 'inv' and
    'ins'; they map to ``<DEL>``, ``<DUP:TANDEM>``, ``<INV>`` and ``<INS>``.

    Args:
        ref_fasta: object exposing ``fetch(chrom, start, end)`` that returns
            a string (presumably a pysam FastaFile; confirm with caller).
        id_ext: record ID to report instead of ``self.id``.
        info_ext: mapping of extra INFO entries; these override computed
            ones, except that ``SVLEN == 'NA'`` is ignored.
        chrom_ext: chromosome to report instead of ``self.chroms[0]``.
        pos_ext: position to report instead of ``self.breaks[0]``.

    Returns:
        str or None: the tab-separated record, or None when the
        rearrangement type has no symbolic allele.
    """
    symbolic_alleles = {
        'del': ('<DEL>', 'DEL'),
        'dup': ('<DUP:TANDEM>', 'DUP'),
        'inv': ('<INV>', 'INV'),
        'ins': ('<INS>', 'INS'),
    }
    # Unsupported event types yield no record. Bail out before sv_type is
    # needed instead of failing on an unbound local further down.
    if self.rearrangement not in symbolic_alleles:
        return None
    alt, sv_type = symbolic_alleles[self.rearrangement]

    chrom = self.chroms[0] if chrom_ext is None else chrom_ext
    # Remove only a literal 'chr' prefix. str.lstrip('chr') strips any run of
    # the characters c/h/r and would turn e.g. 'hs37d5' into 's37d5'.
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    pos = self.breaks[0] if pos_ext is None else pos_ext

    # NOTE(review): the REF base is always read at this adjacency's own first
    # breakpoint, even when chrom_ext/pos_ext override the reported locus.
    ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                          self.breaks[0]).upper()

    sv_len = self.get_size()
    has_len = isinstance(sv_len, int)
    end = pos + sv_len if has_len else None
    if sv_type == 'DEL':
        if has_len:
            # Deletions report a negative SVLEN; END is unaffected.
            sv_len = -sv_len
    elif sv_type == 'INS':
        end = pos

    info = {'SVTYPE': sv_type}
    if end is not None:
        info['END'] = end
    info['BKPTID'] = ','.join(self.contigs)
    if has_len:
        info['SVLEN'] = sv_len

    if sv_type == 'DUP':
        if self.repeat_seq is not None:
            info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']
        if self.support['flanking'] is not None:
            info['FLANKING_PAIRS'] = self.support['flanking']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # microhomology, cipos
    if self.homol_seq and self.homol_seq[0] != '-':
        homol_len = len(self.homol_seq[0])
        first_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (e.g. GMAP) get no confidence interval;
        # overlapping ones (e.g. BWA-mem) report the homology length.
        if first_breaks[0] >= first_breaks[1]:
            info['CIPOS'] = '0,%d' % homol_len
        info['HOMLEN'] = homol_len
        info['HOMSEQ'] = self.homol_seq[0].upper()

    # external info - overrides given info
    if info_ext:
        for key, value in info_ext.items():
            if key == 'SVLEN' and value == 'NA':
                continue
            info[key] = value

    record_id = self.id if id_ext is None else id_ext
    # QUAL and FILTER are always reported as missing ('.').
    fields = [chrom, pos, record_id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def as_breakends(self, ref_fasta, genomic=True, max_novel_seq_len=50, info_ext=None, parids=None, event=None):
    """Format this adjacency as a pair of mated VCF breakend records.

    Args:
        ref_fasta: object exposing ``fetch(chrom, start, end)`` that returns
            a string (presumably a pysam FastaFile; confirm with caller).
        genomic: SVTYPE is 'BND' when true, 'FND' otherwise.
        max_novel_seq_len: novel sequences longer than this are not inlined
            in ALT; the mate locus then refers to the first contig
            (``<contig>``) and its contig break coordinates.
        info_ext: mapping of INFO key to a two-item sequence, one value per
            breakend; entries whose value length is not 2 are ignored.
        parids: accepted for interface compatibility; not used here.
        event: accepted for interface compatibility; not used here.

    Returns:
        str: two tab-separated records joined by a newline.
    """
    # Remove only a literal 'chr' prefix. str.lstrip('chr') strips any run of
    # the characters c/h/r and would turn e.g. 'hs37d5' into 's37d5'.
    chroms = [c[3:] if c.startswith('chr') else c for c in self.chroms]
    alt_chroms = chroms[:]
    pos = list(self.breaks)
    alt_pos = pos[:]

    # inserted novel sequences
    inserted_seqs = ['', '']
    if self.novel_seq and self.novel_seq not in ('NA', '-'):
        if len(self.novel_seq) > max_novel_seq_len:
            alt_chroms[0] = alt_chroms[1] = '<%s>' % self.contigs[0]
            alt_pos[1] = self.contig_breaks[0][0] + 1
            alt_pos[0] = self.contig_breaks[0][1] - 1
        else:
            aligns = self.aligns[0]
            # With a single alignment both breakends follow its strand.
            second = aligns[0] if len(aligns) == 1 else aligns[1]
            for i, align in enumerate((aligns[0], second)):
                if align.strand == '+':
                    inserted_seqs[i] = self.novel_seq
                else:
                    inserted_seqs[i] = reverse_complement(self.novel_seq)

    # microhomology, cipos
    cipos = None
    homol_len = None
    homol_seq = None
    if self.homol_seq and self.homol_seq[0] != '-':
        homol_seq = self.homol_seq[0].upper()
        homol_len = len(self.homol_seq[0])
        first_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (e.g. GMAP) need no adjustment; overlapping
        # ones (e.g. BWA-mem) shift the positions by the homology length.
        if first_breaks[0] >= first_breaks[1]:
            pos[0] -= homol_len
            alt_pos[1] += homol_len
            cipos = '0,%d' % homol_len

    refs = (ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                            self.breaks[0]).upper(),
            ref_fasta.fetch(self.chroms[1], self.breaks[1] - 1,
                            self.breaks[1]).upper())

    ids = ('%s%s' % (self.id, 'a'), '%s%s' % (self.id, 'b'))
    svtype = 'BND' if genomic else 'FND'
    event_type = self.rearrangement.upper()
    infos = [{'SVTYPE': svtype, 'MATEID': mate_id, 'EVENTTYPE': event_type}
             for mate_id in (ids[1], ids[0])]

    # INFO entries that are identical for both breakends, in output order.
    shared = []
    if cipos is not None:
        shared.append(('CIPOS', cipos))
    if homol_len is not None:
        shared.append(('HOMLEN', homol_len))
    if homol_seq is not None:
        shared.append(('HOMSEQ', homol_seq))

    # read support
    if self.final_support is not None:
        shared.append(('SPANNING_READS', self.support['spanning']))
        if self.support['flanking'] is not None:
            shared.append(('FLANKING_PAIRS', self.support['flanking']))

    adj_size = self.get_size()
    if isinstance(adj_size, int):
        shared.append(('SVLEN', adj_size))

    # somatic
    if self.somatic:
        shared.append(('SOMATIC', 'SOMATIC'))

    # contig and contig breakpoints
    if self.contigs:
        shared.append(('BKPTID', ','.join(self.contigs)))
        if self.contig_breaks and len(self.contig_breaks) == len(self.contigs):
            break_labels = []
            for bk in self.contig_breaks:
                if len(bk) == 2:
                    break_labels.append('%s-%s' % (bk[0], bk[1]))
                else:
                    print('error')
            # Only report CTG_BKS when every contig has a well-formed pair.
            if len(break_labels) == len(self.contigs):
                shared.append(('CTG_BKS', ','.join(break_labels)))

    for info in infos:
        for key, value in shared:
            info[key] = value

    # external info - overrides given info
    if info_ext:
        for key, value in info_ext.items():
            if len(value) == 2:
                infos[0][key] = value[0]
                infos[1][key] = value[1]

    # Build the ALT strings. The bracket direction follows the mate's
    # orientation (']' for 'L', '[' otherwise); the REF base plus inserted
    # sequence precedes the mate locus when this breakend is 'L' and follows
    # it otherwise. The mate locus always pairs alt_chroms with alt_pos so a
    # contig coordinate is never attached to a genomic chromosome name.
    alts = []
    for i in range(2):
        mate = 1 - i
        bracket = ']' if self.orients[mate] == 'L' else '['
        locus = '%s%s:%s%s' % (bracket, alt_chroms[mate], alt_pos[mate],
                               bracket)
        if self.orients[i] == 'L':
            alts.append('%s%s%s' % (refs[i], inserted_seqs[i], locus))
        else:
            alts.append('%s%s%s' % (locus, inserted_seqs[i], refs[i]))

    # QUAL and FILTER are always reported as missing ('.').
    breakends = ['\t'.join([chroms[i], str(pos[i]), ids[i], refs[i], alts[i],
                            '.', '.', VCF.info_dict_to_str(infos[i])])
                 for i in range(2)]
    return '\n'.join(breakends)