def get_position_in_ref(self, pos, ref):
    """Locate read position ``pos`` on the reference ``ref`` via a local bwa alignment.

    The 100 bases ``self.seq[pos - 50:pos + 50]`` (with the matching slice of
    ``self.quality``) are written to ``temp_files/temp_fastq.fastq`` and
    aligned with ``bwa mem -M -x ont2d -t 7``; the SAM output is written to
    ``temp_files/temp.sam``.  The CIGAR of the first alignment record is then
    walked until 50 query bases have been consumed.

    NOTE(review): the slice assumes ``pos >= 50``; a smaller ``pos`` yields a
    negative slice start - confirm callers never pass one.

    Args:
        pos: 0-based index into ``self.seq``.
        ref: path of the reference handed to bwa.

    Returns:
        ``ref_pos + 49 + shift`` where ``ref_pos`` is the POS column of the
        alignment and ``shift`` is (#deleted - #inserted) bases seen before
        the 50th query base, or ``None`` when there is no alignment record,
        the read is unmapped (RNAME ``*``), or the CIGAR ends before 50 query
        bases are consumed.
    """
    around_seq = self.seq[pos - 50:pos + 50]
    around_qual = self.quality[pos - 50:pos + 50]
    with open('temp_files/temp_fastq.fastq', 'w') as fw:
        fw.write('@temp\n' + around_seq + '\n+\n' + around_qual)

    # Run bwa without a shell: the reference path is passed verbatim as one
    # argument, so spaces or shell metacharacters in it cannot break (or
    # inject into) the command line.  stderr is discarded as before.
    with open('temp_files/temp.sam', 'w') as sam_out:
        subprocess.run(
            ['bwa', 'mem', '-M', '-x', 'ont2d', '-t', '7', ref,
             'temp_files/temp_fastq.fastq'],
            stdout=sam_out,
            stderr=subprocess.DEVNULL)

    # Take the first non-header line instead of a fixed line index, so the
    # number of '@' header lines (one @SQ per contig) does not matter.
    row = None
    with open('temp_files/temp.sam') as f:
        for line in f:
            if not line.startswith('@'):
                row = line.strip().split()
                break
    if not row or row[2] == '*':
        return None

    ref_pos = int(row[3])
    shift = 0
    current = 0
    for length, op in Cigar(row[5]).items():
        for _ in range(length):
            current += 1
            if op == 'I':
                shift -= 1
            elif op == 'D':
                # A deletion consumes no query base.
                shift += 1
                current -= 1
            if current == 50:
                return ref_pos + 49 + shift
    # The alignment ended before the 50th query base was reached.
    return None
def getGeneLocation(hit, gene):
    """Summarise where the alignments in ``hit`` place ``gene``.

    Args:
        hit: iterable of dicts with the keys ``'cigar'``, ``'seq'`` and
            ``'ss'`` (alignment start).
        gene: the gene sequence; its length is the coverage denominator and
            it is forwarded to ``makeConsensus``.

    Returns:
        dict with ``'note'``, ``'hit'`` and ``'consensus'``; when at least one
        hit exists it also carries ``'pos'`` and ``'coverage'``.
    """
    parsed_hits = []
    for record in hit:
        ops = list(Cigar(record['cigar']).items())
        first_op, last_op = ops[0], ops[-1]
        # Only a read soft-clipped on BOTH ends is trimmed; every other
        # shape keeps the full sequence.
        if first_op[1] == 'S' and last_op[1] == 'S':
            sequence = record['seq'][first_op[0]:-last_op[0]]
        else:
            sequence = record['seq']

        end = int(record['ss'])
        for length, op in ops:
            if op == 'M' or op == 'D':
                end = end + int(length)
            elif op == 'I':
                end = end - int(length)

        parsed_hits.append({
            'seq': sequence,
            'pos': int(first_op[0]),
            'ss': int(record['ss']),
            'send': end,
        })
    parsed_hits.sort(key=lambda entry: entry['ss'])

    loc = {'note': ''}
    if not parsed_hits:
        loc.update(consensus='', hit=[], note='Not found')
        return loc

    # Chain hits that start beyond the end of the previously chained one.
    scaffold = [parsed_hits[0]]
    for candidate in parsed_hits:
        tail = scaffold[-1]
        if candidate['ss'] > tail['ss'] + len(tail['seq']):
            scaffold.append(candidate)
    if len(scaffold) < len(parsed_hits):
        loc['note'] = "Multiple sequences found"

    loc['hit'] = parsed_hits
    loc['pos'] = parsed_hits[0]['pos']
    covered = sum(len(member['seq']) for member in scaffold)
    # Build the consensus sequence from the scaffold.
    loc['consensus'] = makeConsensus(scaffold, gene)
    loc['coverage'] = covered / len(gene)
    return loc
def __init__(self, ss, alignmentDataset, sample=1.0, bin_size=10000000):
    """
    Initializes a IndelDistribution class.
    Computes the insertion and deletion distribution of alignmentDataset.

    Args:
        :param ss: the global SparkSession (its ``sparkContext`` is stored)
        :param alignmentDataset: A bdgenomics.adam.dataset.AlignmentDataset object
        :param sample: fraction passed to ``sample(False, ...)`` on the dataset
        :param bin_size: Division size per bin

    The collected result, stored in ``self.alignments``, is a list of
    ``((referenceName, bin_start), Counter)`` pairs where the Counter maps
    each CIGAR operation to the summed length seen in that bin.
    """
    bin_size = int(bin_size)
    self.bin_size = bin_size
    self.sc = ss.sparkContext
    self.sample = sample

    def cigar_counts(cigar_string):
        # Accumulate per operation.  Building a dict from (op, length)
        # pairs would keep only the LAST run of every repeated operation
        # (e.g. "10M2I5M" would count M as 5 instead of 15).
        counts = Counter()
        for length, op in Cigar(cigar_string).items():
            counts[op] += length
        return counts

    # filter alignments without a position
    filteredAlignments = alignmentDataset.transform(lambda x: x.sample(False, self.sample)) \
        .transform(lambda x: x.filter(x["start"] >= 0))

    # Assign alignments with counter for contigs. Reduce and collect.
    mappedDistributions = filteredAlignments.toDF().rdd \
        .map(lambda r: ((r["referenceName"], r["start"] - r["start"] % bin_size),
                        cigar_counts(r["cigar"]))) \
        .reduceByKey(lambda x, y: x + y)

    self.alignments = mappedDistributions.collect()
def get_max_clip_len(read):
    """Return the length of the longer terminal clip of ``read``.

    Only the first and the last CIGAR operation are inspected; each counts as
    a clip when its operation is not in ``Cigar.ref_consuming_ops``.

    Raises:
        ValueError: if the read has no CIGAR string.
    """
    cigar_string = read.cigarstring
    if not cigar_string:
        raise ValueError("Missing Cigar string")

    operations = list(Cigar(cigar_string).items())
    edge_clips = []
    for length, op in (operations[0], operations[-1]):
        if op not in Cigar.ref_consuming_ops:
            edge_clips.append(length)

    if not edge_clips:
        return 0
    return max(edge_clips)
def get_sa_attributes(sa_tag):
    '''
    Parse one comma-separated SA-tag entry
    (rname, POS, strand, CIGAR, mapQ, NM).

    Returns a tuple (start, end, strand) where start is POS as an int and
    end is start plus the reference length reported by the CIGAR.
    '''
    fields = sa_tag.split(',')
    start = int(fields[1])
    strand = fields[2]
    aligned_length = Cigar(fields[3]).reference_length()
    return start, start + aligned_length, strand
def cigar_to_list(cigar):
    """Convenience function that normalises a CIGAR to a list of items.

    A list is returned unchanged (same object), a ``Cigar`` instance is
    converted with ``list(...)``, and anything else is parsed with
    ``Cigar(...)`` and its ``items()`` are listed.
    """
    if isinstance(cigar, list):
        return cigar
    if isinstance(cigar, Cigar):
        return list(cigar)
    return list(Cigar(cigar).items())
def get_splices(lines):
    """Yield one ``(chrom, start, end, direction)`` tuple per ``N`` operation.

    ``lines`` are whitespace-separated SAM records.  Records whose CIGAR
    (column 6) contains no ``N`` are skipped.  ``start`` is the 0-based
    position reached when the ``N`` operation begins and ``end`` is
    ``start`` plus its length; ``direction`` is ``"-"`` when flag bit 16 is
    set, else ``"+"``.
    """
    for line in lines:
        fields = line.split()
        if "N" not in fields[5]:
            continue

        chrom = fields[2]
        direction = "-" if int(fields[1]) & 16 else "+"
        operations = list(Cigar(fields[5]).items())
        position = int(fields[3]) - 1  # SAM POS is 1-based
        for length, op in operations:
            if op == "N":
                yield chrom, position, position + length, direction
            if op in consuming:
                position += length
def make_split_read(read, breakpoint, clip_left, hard_clip_threshold=1.0, sequence=None):
    """
    Create a split read (a continuous soft-clip from one end of a read until a breakpoint).
    Modifies both the CIGAR string and the actual sequence of a deep copy of the read;
    the input read is left untouched.

    For example, if the read sequence is `ACACACAC` with a CIGAR of 8M, the breakpoint is
    in position 3, and the sequence provided is `GTGTGT`, then if the clipping is to the
    left of the breakpoint the modified read will have a CIGAR of 3S5M and its sequence
    will be `TGTCACAC` (the clipped bases are overridden by the LAST `breakpoint` bases of
    the provided sequence), and if the clipping is to the right of the breakpoint, the
    modified read will have a CIGAR of 3M5S and its sequence will be `ACAGTGTG`.

    Args:
        read: The read to modify.
        breakpoint: The breakpoint of the read.
        clip_left: Whether to clip every base to the left of the breakpoint or to the right of it.
        hard_clip_threshold: By default bases are soft-clipped. If more than `hard_clip_threshold`
                             of the read is clipped, hard-clip instead.
        sequence: An optional sequence to use for overriding bases in the clipped region.

    Returns:
        The split read; its name is the original `qname` with `-split` appended.
    """
    split_read = copy.deepcopy(read)
    split_read.qname = split_read.query_name = read.qname + '-' + 'split'

    # CIGAR clipping.
    if read.cigarstring:
        cigar = Cigar(read.cigarstring)
        split_read.cigarstring = str(cigar.mask_left(breakpoint) if clip_left
                                     else cigar.mask_right(read.rlen - breakpoint))

        # Convert to hard-clipping, if needed.
        clip_len = breakpoint if clip_left else read.rlen - breakpoint
        if float(clip_len) / read.rlen > hard_clip_threshold:
            soft_clipped_cigar = '{}S'.format(clip_len)
            hard_clipped_cigar = '{}H'.format(clip_len)
            cigar = split_read.cigarstring
            if clip_left and cigar.startswith(soft_clipped_cigar):
                cigar = cigar.replace(soft_clipped_cigar, hard_clipped_cigar, 1)
            elif not clip_left and cigar.endswith(soft_clipped_cigar):
                # Both suffixes have the same length, so this swaps S for H.
                cigar = cigar[:-len(hard_clipped_cigar)] + hard_clipped_cigar
            split_read.cigarstring = cigar

    if clip_left:  # adjust reference match.
        split_read.reference_start += breakpoint

    # Sequence replacement.
    if sequence:
        split_seq = list(split_read.seq)
        if clip_left:
            # Guard breakpoint == 0: `sequence[-0:]` is the WHOLE sequence and
            # would be prepended to the read although nothing is clipped.
            if breakpoint:
                split_seq[:breakpoint] = sequence[-breakpoint:]
        else:
            split_seq[breakpoint:] = sequence[:read.rlen - breakpoint]
        # Read the qualities before assigning `seq`, then pad/truncate them to
        # the new sequence length.
        qual = split_read.qual
        split_read.seq = ''.join(split_seq)
        qual += DEFAULT_QUAL * (len(split_seq) - len(qual))
        split_read.qual = qual[:len(split_seq)]

    return split_read
def alignment_length_cigar(CIGAR):
    '''
    Compute the length of an alignment on the reference from its CIGAR string

    Input:
        1. CIGAR: CIGAR string

    Output:
        1. alignmentLen: alignment length on the reference
    '''
    # Operations that advance along the reference:
    #   M (match/mismatch), = (sequence match), X (sequence mismatch)
    #       consume both query and reference
    #   D (deletion), N (skipped region)
    #       consume the reference only
    reference_consuming = ('M', '=', 'X', 'D', 'N')

    alignmentLen = 0
    for length, operation in list(Cigar(CIGAR).items()):
        length = int(length)
        if operation in reference_consuming:
            alignmentLen += length

    return alignmentLen
def rna_bam_to_bed(lines):
    """Yield BED-like tuples for every segment of each SAM record.

    Each record is split at its ``N`` operations.  A tuple is
    ``(chrom, start, end, direction, flag)`` with 0-based ``start``; ``flag``
    is 1 for a segment that is followed by an ``N`` gap and 0 for the final
    segment of the record.
    """
    for line in lines:
        fields = line.split()
        chrom = fields[2]
        direction = "-" if 16 & int(fields[1]) else "+"
        operations = list(Cigar(fields[5]).items())
        segment_start = int(fields[3]) - 1  # SAM POS is 1-based
        position = segment_start
        for length, op in operations:
            if op == "N":
                yield (chrom, segment_start, position, direction, 1)
                # The next segment begins right after the skipped region.
                segment_start = position + length
            if op in consuming:
                position += length
        yield (chrom, segment_start, position, direction, 0)
def parseCIGARForIntrons(cigar):
    """
    Parses a CIGAR string and returns the values needed to locate the 5' and
    3' splice sites of its first intron.

    Everything after the first 'N' operation is discarded before parsing.

    Args:
        cigar, a CIGAR string with an intron in it
            E.x. cigar='3M1D40M20N'

    Returns:
        offset, summed 'D' lengths minus summed 'I' lengths before the
            intron; adjusts alignment positions back to the reference
        matchedExon, summed 'M' lengths before the intron; added to the
            alignment start it gives the 5' end of the splice site
        intronLength, the length of the intron ('N'); added to matchedExon
            and the alignment start it gives the 3' end of the splice site

    Raises:
        Exception: if the CIGAR string contains no 'N' operation
    """
    if 'N' not in cigar:
        raise Exception('No intron detected')

    # remove all information after intron
    up_to_intron = cigar.split('N')[0] + 'N'

    # Soft and hard clipping (and any other operation) are ignored.
    totals = {'N': 0, 'D': 0, 'I': 0, 'M': 0}
    for length, op in list(Cigar(up_to_intron).items()):
        if op in totals:
            totals[op] += int(length)

    offset = totals['D'] - totals['I']
    return offset, totals['M'], totals['N']
def modify_sequences(main_dict, bed_file):
    """Overwrite the second interval of every BED record using its CIGAR.

    For each tab-separated record (header rows whose first field is ``#chr1``
    and records touching ``chrM`` are skipped) the CIGAR in column 13 is
    replayed over the first interval (columns 1-3) and the second interval
    (columns 4-6, reverse-complemented unless column 10 is ``+``): ``M`` runs
    are copied from the first interval, ``I`` runs from the second, ``D`` runs
    are skipped.  The result replaces the second interval in ``main_dict``.

    Args:
        main_dict: dict mapping chromosome name to its sequence string;
            modified in place.
        bed_file: path of the BED-like file.

    Raises:
        AssertionError: if the rebuilt sequence does not have the length of
            the interval it replaced.
    """
    from cigar import Cigar
    from Bio.Seq import Seq

    # `with` guarantees the file handle is closed, also on error.
    with open(bed_file, 'r') as bed:
        for record in bed:
            line = record.split('\t')
            chr1 = line[0]
            if chr1 == '#chr1':
                continue
            chr2 = line[3]
            if chr1 == 'chrM' or chr2 == 'chrM':
                continue
            s1 = int(line[1])
            e1 = int(line[2])
            s2 = int(line[4])
            e2 = int(line[5])
            cigar = line[12]
            seq1 = main_dict[chr1][s1:e1]
            seq2 = main_dict[chr2][s2:e2] if line[9] == '+' else str(
                Seq(main_dict[chr2][s2:e2]).reverse_complement())

            pieces = []
            counter_1 = 0
            counter_2 = 0
            for num, let in Cigar(cigar).items():
                if let == 'M':
                    pieces.append(seq1[counter_1:counter_1 + num])
                    counter_1 += num
                    counter_2 += num
                elif let == 'D':
                    counter_1 += num
                elif let == 'I':
                    pieces.append(seq2[counter_2:counter_2 + num])
                    counter_2 += num
            seq_new = ''.join(pieces)

            # Put the sequence back into the orientation stored in main_dict.
            if line[9] == '-':
                seq_new = str(Seq(seq_new).reverse_complement())
            main_dict[chr2] = main_dict[chr2][:s2] + seq_new + main_dict[chr2][e2:]
            assert len(seq_new) == len(seq2)
def __init__(self, ss, alignmentRDD, sample=1.0, bin_size=10000000):
    """
    Initializes a AlignmentDistribution class.
    Computes the alignment distribution of an alignmentRDD.

    :param ss: Spark Object (its ``sparkContext`` is stored)
    :param alignmentRDD: A bdgenomics.adam.rdd.AlignmentRDD object
    :param sample: fraction passed to ``sample(False, ...)`` on the RDD
    :param int bin_size: Division size per bin

    The collected result, stored in ``self.alignments``, is a list of
    ``((contigName, bin_start), Counter)`` pairs where the Counter maps each
    CIGAR operation to the summed length seen in that bin.
    """
    # Use the int value for the binning too, so that the bin keys agree
    # with self.bin_size.
    bin_size = int(bin_size)
    self.bin_size = bin_size
    self.sc = ss.sparkContext

    def cigar_counts(cigar_string):
        # Accumulate per operation.  Building a dict from (op, length)
        # pairs would keep only the LAST run of every repeated operation
        # (e.g. "10M2I5M" would count M as 5 instead of 15).
        counts = Counter()
        for length, op in Cigar(cigar_string).items():
            counts[op] += length
        return counts

    # filter alignments without a position
    filteredAlignments = alignmentRDD.transform(lambda x: x.sample(False, sample)) \
        .toDF().rdd.filter(lambda r: r["start"] is not None)

    # Assign alignments with counter for contigs. Reduce and collect.
    mappedDistributions = filteredAlignments \
        .map(lambda r: ((r["contigName"], r["start"] - r["start"] % bin_size),
                        cigar_counts(r["cigar"]))) \
        .reduceByKey(lambda x, y: x + y)

    self.alignments = mappedDistributions.collect()
def test_make_split_read_bam_file(self):
    """Split every read of ``sorted.bam`` at several breakpoints and check the clip.

    For each read with a CIGAR, each breakpoint in (10, 50, 100) shorter than
    the read and both split directions, the outermost CIGAR item on the
    clipped side must be an ``S`` or ``H`` run at least as long as the
    requested clip.
    """
    sorted_bam = path.join(TEST_DATA_DIR, 'sorted.bam')
    with pysam.Samfile(sorted_bam, 'rb') as samfile:
        for read in samfile:
            if not read.cigarstring:
                continue
            usable_breakpoints = [bp for bp in (10, 50, 100) if bp < read.rlen]
            for bp in usable_breakpoints:
                for is_left_split in (True, False):
                    split_read = make_split_read(read, bp, is_left_split)
                    cigar_items = list(Cigar(split_read.cigarstring).items())
                    if is_left_split:
                        clipped_item = cigar_items[0]
                        min_clip_len = bp
                    else:
                        clipped_item = cigar_items[-1]
                        min_clip_len = read.rlen - bp
                    # Can be longer if adjacent to another clip.
                    self.assertGreaterEqual(clipped_item[0], min_clip_len)
                    # Will be soft-clipped unless already hard-clipped.
                    self.assertIn(clipped_item[1], ('S', 'H'))
def parseCIGARForIntrons(cigar):
    """Return ``(offset, matchedExon, intronLength)`` for the first intron of ``cigar``.

    Only the operations up to and including the first ``N`` are parsed.
    ``offset`` is the summed ``D`` lengths minus the summed ``I`` lengths,
    ``matchedExon`` the summed ``M`` lengths and ``intronLength`` the length
    of the ``N`` operation.  Soft and hard clipping are ignored.

    Raises:
        Exception: if the CIGAR string contains no ``N`` operation.
    """
    if 'N' not in cigar:
        raise Exception('no intron detected')

    # remove all information after intron
    head = cigar.split('N')[0] + 'N'

    offset = matchedExon = intronLength = 0
    # items() gives (length, operation) tuples, e.g. [(20, 'N')]
    for length, operation in list(Cigar(head).items()):
        amount = int(length)
        if operation == 'N':
            intronLength += amount
        elif operation == 'D':
            offset += amount
        elif operation == 'I':
            offset -= amount
        elif operation == 'M':
            matchedExon += amount
    return offset, matchedExon, intronLength
l = line.split("\t") readID = l[0] chrName = l[2] filterDict[readID] = [] filterDict[readID].append(chrName) filterDict[readID] = set(filterDict[readID]) samFile.close() samFile2 = open("QH046cDNA.sam", "r") for line2 in samFile2: if line2.startswith("@"): print line2 else: l2 = line2.split("\t") readID2 = l[0] if len(filterDict[readID2]) > 2: continue else: seqLength = len(l2[9]) cigar = Cigar(l2[5]) cigarList = list(cigar.items()) mapp = 0 for i in cigarList: if i[1] == "M": mapp += i[0] else: continue if seqLength - mapp <= 1: print line2 samFile2.close()
def alignment_interval_query(CIGAR, orientation):
    '''
    Compute the interval of the query (read) covered by an alignment from
    its CIGAR string

    Input:
        1. CIGAR: CIGAR string
        2. orientation: alignment orientation (+ or -)

    Output:
        1. beg: begin position in query
        2. end: end position in query

    An empty CIGAR (no operations) yields a start offset and an aligned
    length of 0 instead of failing on an unassigned start position.
    '''
    ## 1. Read CIGAR string using proper module
    cigar = Cigar(CIGAR)
    operations = list(cigar.items())

    ## 2. Start position in query: length of a leading soft/hard clip, else 0
    startPos = 0
    if operations and operations[0][1] in ('S', 'H'):
        startPos = int(operations[0][0])

    ## 3. Query alignment length: operations that consume the query
    # - Op M, alignment match (can be a sequence match or mismatch)
    # - Op =, sequence match
    # - Op X, sequence mismatch
    # - Op I, insertion to the reference
    alignmentLen = sum(int(length) for length, operation in operations
                       if operation in ('M', '=', 'X', 'I'))

    ## 4. Compute alignment interval in raw query
    # NOTE(review): len(cigar) is the read length as defined by the cigar
    # package; confirm whether it includes hard-clipped bases, since
    # startPos does count a leading hard clip.
    readLen = len(cigar)

    # a) Query aligned in +
    if orientation == '+':
        beg = startPos
        end = startPos + alignmentLen
    # b) Query aligned in - (reversed complemented to align)
    else:
        beg = readLen - startPos - alignmentLen
        end = readLen - startPos

    return beg, end
filter_df['CIGAR_str'] = CIGAR_list filter_df['CIGAR_list'] = C_list filter_df['read_len'] = len_list filter_df['start_coo'] = ref_st_list filter_df['loc'] = filter_df.index filter_df = filter_df.sort_values(['Score'], ascending=[False]) filter_df = filter_df.drop_duplicates(subset=['Read_Name'], keep='first') CIG_dict = dict(zip(filter_df['CIGAR_list'], filter_df['Read_Name'])) COO_dict = dict(zip(filter_df['CIGAR_list'], filter_df['start_coo'])) read_len_dict = dict(zip(filter_df['CIGAR_list'], filter_df['read_len'])) for c_str in filter_df['CIGAR_list']: CIGAR_edit = Cigar(c_str) l = len(CIGAR_edit) CIG_list = list(CIGAR_edit.items()) for index, tup in enumerate(CIG_list): if CIG_list[0][1] is 'S': CIG_list.remove(CIG_list[0]) else: pass ch = CIG_list[index][1] if ch in String_list: slice_l = CIG_list[:index + 1] slice_m = CIG_list[:index]
def invert_read(read, start, end, sequence, snp_rate, indel_rate, max_clip_len=None):
    """
    Invert (a portion of) a read.

    Four overlap cases are handled, in this order:
      1. the read spans the entire inversion: the covered bases are reversed
         in place and the shorter flank plus the inversion is soft-clipped;
      2. the inversion spans the entire read: the whole read is replaced by
         the matching slice of `sequence` and its strand flag is flipped;
      3. the inversion starts mid-read: the read is split at the breakpoint
         and clipped to the right of it;
      4. the inversion ends mid-read: the read is split at the breakpoint and
         clipped to the left of it.

    Args:
        read: The read to modify.
        start: The start of the inversion.
        end: The end of the inversion.
        sequence: The full sequence that is inverted in the sample that the read belong to.
                  This sequence should be provided in its reverse-complement form (e.g. as returned by
                  `get_inverse_sequence`).
        snp_rate: The fraction of bases that will be randomly modified in reads that were modified.
        indel_rate: The fraction of bases that will be randomly inserted or deleted in in reads that were modified.
                    Half of it is passed to `modify_read` for each of its last two arguments.
        max_clip_len: If more than "max_clip_len" of the read would be clipped on either end, return None, since this
                      read would not have been captured.

    Returns:
        `(read, 0)` with the unmodified input read when the inversion does not overlap the read
        or is shorter than 2 bases; `(None, 0)` when the clip exceeds `max_clip_len`; otherwise
        whatever `modify_read` returns for the inverted copy of the read (presumably a
        `(read, count)` pair like the early returns - TODO confirm against `modify_read`).
    """
    inv_len = end - start
    if start >= read.reference_end or end <= read.reference_start or inv_len < 2:
        return read, 0

    # Work on a copy; the input read is never modified.
    read_with_inversion = copy.deepcopy(read)
    read_with_inversion.qname = read_with_inversion.query_name = read.qname + '-' + 'inv'

    if read.reference_start <= start < end <= read.reference_end:
        # Read spans the entire inversion.
        left_breakpoint = start - read.reference_start
        right_breakpoint = left_breakpoint + inv_len
        # NOTE(review): the bases are reversed here, not complemented.
        read_with_inversion.seq = "{left}{inv}{right}".format(
            left=read.seq[:left_breakpoint],
            inv="".join(reversed(read.seq[left_breakpoint:right_breakpoint])),
            right=read.seq[right_breakpoint:])

        # Clipped bases in reads must start at a read boundary; choose the closest one.
        # TODO: add a supplemental/secondary read where the shorter region is matched, and the longer one clipped.
        cigar_tuples = unpack_cigar(read.cigarstring)
        if left_breakpoint < read.rlen - right_breakpoint:
            start_clip, end_clip = 0, right_breakpoint
        else:
            start_clip, end_clip = left_breakpoint, read.rlen
        # Overwrite each position in the clipped range with a one-base
        # soft clip, then merge adjacent identical operations.
        for i in range(start_clip, end_clip):
            cigar_tuples[i] = '1S'

        read_with_inversion.cigarstring = str(Cigar("".join(cigar_tuples)).merge_like_ops())
    elif start <= read.reference_start < read.reference_end <= end:
        # Inversion spans the entire read.
        pos_in_inversion = read.reference_start - start
        inv_seq = sequence[pos_in_inversion:pos_in_inversion + read.rlen]
        read_with_inversion = make_split_read(read_with_inversion, 0, clip_left=False, sequence=inv_seq)

        # If a read was reversed, modify its strand.
        read_with_inversion.is_reverse = not read.is_reverse
    elif start > read.reference_start:
        # Inversion starts mid-read, continuing to the end of it (or past it).
        breakpoint = start - read.reference_start
        read_with_inversion = make_split_read(read_with_inversion, breakpoint, clip_left=False, sequence=sequence)
    elif end < read.reference_end:
        # Inversion starts before the read, continuing into it.
        breakpoint = end - read.reference_start
        read_with_inversion = make_split_read(read_with_inversion, breakpoint, clip_left=True, sequence=sequence)

    # Drop reads whose longest terminal clip exceeds the allowed maximum.
    if max_clip_len and int(max_clip_len) < get_max_clip_len(read_with_inversion):
        return None, 0

    # Add noise.
    return modify_read(read_with_inversion, snp_rate, indel_rate / 2, indel_rate / 2)
def parse_cigar(aln, qryseq, refseq, cutoff=500):
    """Split a pairwise alignment into blocks along its CIGAR string.

    The CIGAR is walked operation by operation while a block sequence is
    accumulated.  Clips (``S``/``H``), deletions and insertions of at least
    ``cutoff`` bases close the current block and are emitted as a block of
    their own, present on one side only.  Shorter indels are recorded inside
    the current block as ``'-'`` entries, and mismatches inside ``M`` runs
    are recorded as query-side differences.

    Args:
        aln: CIGAR string of the alignment.
        qryseq: query sequence.
        refseq: reference sequence.
        cutoff: minimum length for a clip/indel to become its own block.

    Returns:
        ``(qrys, refs, blks)``, three lists of equal length.  ``qrys[i]`` and
        ``refs[i]`` are the ``(left, right)`` interval of block ``i`` on the
        query / reference, or ``None`` when the block is absent from that
        side.  ``blks[i]`` is ``(seq, (Q, qrymap), (R, refmap))`` where
        ``seq`` is an array of the block's characters, ``Q`` / ``R`` map
        block positions to the differing query base or ``'-'``, and the maps
        are transposed arrays of ``(coordinate, block_pos - coordinate)``
        breakpoints.
    """
    from cigar import Cigar

    aln = Cigar(aln)

    lq, rq = 0, 0
    lr, rr = 0, 0

    refs = []
    qrys = []
    blks = []

    R, Q = {}, {}
    blkseq = ""
    blkpos = 0
    refmap = [(rr, blkpos - rr)]
    qrymap = [(rq, blkpos - rq)]

    def push(qval=None, rval=None):
        # Close the current block if at least one side is recorded, then
        # reset the per-block state.
        nonlocal R, Q, blkseq, blkpos, refmap, qrymap

        assert not (qval is None and rval is None)

        def f(xs, x):
            # Record interval x (or None for "absent") in xs; empty
            # intervals are not recorded.
            if x is None:
                xs.append(None)
                return True
            else:
                l, r = zip(x)
                if l < r:
                    xs.append(x)
                    return True
            return False

        hasq = f(qrys, qval)
        hasr = f(refs, rval)

        if hasq or hasr:
            assert len(qrys) == len(refs)
            assert len(blkseq) > 0, "empty seq"

            blks.append((np.array(list(blkseq)), (Q, np.array(qrymap).T), (R, np.array(refmap).T)))

            R, Q = {}, {}
            blkseq = ""
            blkpos = 0
            refmap = [(rr, blkpos - rr)]
            qrymap = [(rq, blkpos - rq)]

    def recordbp():
        # Record a breakpoint at the current end of the block sequence.
        nonlocal blkpos, refmap, qrymap

        blkpos = len(blkseq)
        refmap.append((rr, blkpos - rr))
        qrymap.append((rq, blkpos - rq))

    for l, t in aln.items():
        if t in ['S', 'H']:
            if l >= cutoff:
                # The leftover debugging aid that used to sit here
                # (print + ipdb.set_trace()) halted the program on every
                # long clip and required the optional ipdb package.
                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                # TODO: Think through soft/hard clips
                rq += l
                recordbp()

                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                rq += l
                recordbp()

        elif t == 'M':
            rs = np.array(list(refseq[rr:rr + l]))
            qs = np.array(list(qryseq[rq:rq + l]))
            diff = np.where(np.array(rs != qs))[0]
            for i in diff:
                Q[i + blkpos] = qs[i]

            blkseq += refseq[rr:rr + l]
            rq += l
            rr += l
            recordbp()

        elif t == 'D':
            if l >= cutoff:
                push((lq, rq), (lr, rr))

                blkseq = refseq[rr:rr + l]
                rr += l
                recordbp()

                push(None, (rr - l, rr))
                lr = rr
                lq = rq
            else:
                for i in range(l):
                    Q[i + blkpos] = '-'

                blkseq += refseq[rr:rr + l]
                rr += l
                recordbp()

        elif t == 'I':
            if l >= cutoff:
                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                rq += l
                recordbp()

                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                for i in range(l):
                    R[i + blkpos] = '-'

                blkseq += qryseq[rq:rq + l]
                rq += l
                recordbp()

    push((lq, rq), (lr, rr))
    assert len(qrys) == len(refs) and len(qrys) == len(blks)

    return qrys, refs, blks
def _str2bool(value):
    """Interpret a command-line string as a boolean.

    argparse's ``type=bool`` turns every non-empty string (including
    "False") into True; here the usual negative spellings are False and
    anything else stays True.
    """
    return str(value).strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')


def main():
    """Extract soft-clipped split reads from a BAM file.

    Writes ``<output>_clipped.bam`` (every record of a read that has a
    selected alignment with a terminal soft clip of at least the threshold),
    ``<output>_split_reads.bedpe`` (coordinates of reads with exactly
    ``--splits`` records) and, when ``--meth`` is given,
    ``<output>_clipped_meth.tsv`` (methylation calls restricted to those
    reads).
    """
    parser = argparse.ArgumentParser(
        description=
        'Parse BAM file for multi-alignment and soft-clipped reads. Will return a BAM file containing split reads and a BEDPE file containing the coordinates of the split reads (useful for circos plots, etc.). Can additionally return a filtered methylation TSV file if one is provided (optional)'
    )
    required = parser.add_argument_group(
        'Required',
        'Bam, clip size to filter on, output location, flags to filter on, splits to filter on, and True/False if alternative chromosomes were used'
    )
    required.add_argument('-b',
                          '--bam',
                          type=str,
                          help='bam file - must be created with NGMLR')
    required.add_argument('-c',
                          '--clip_size_thresh',
                          type=int,
                          help='soft clip size threshold to filter on [1000]',
                          default=1000)
    required.add_argument('-o',
                          '--output',
                          type=str,
                          help='output location and prefix')
    required.add_argument(
        '-f',
        '--flag',
        type=str,
        help=
        'flag(s) to filter bam file on. delimited list, default 256,2048,2304',
        default='256,2048,2304')
    required.add_argument(
        '-s',
        '--splits',
        type=int,
        help=
        'Number of splits read aligns to filter on for bedpe file (2 only option right now, hope to change in the future)[2]',
        default=2)
    required.add_argument(
        '-a',
        '--alt_chroms',
        type=_str2bool,
        help=
        'Does BAM file use alternative chromosome names? (i.e. NC_000001.11, etc.) [False]',
        default=False)
    optional = parser.add_argument_group(
        'Optional', 'methylation call tsv file (from f5c)')
    optional.add_argument(
        '-m',
        '--meth',
        type=str,
        help='Methylation calls tsv file to filter (from f5c)')
    args = parser.parse_args()

    # 1. Read in BAM file & extract read ID, flag, and CIGAR string, then
    #    filter based on flags.  Records without a CIGAR cannot be clipped
    #    and are skipped.
    codes = [int(item) for item in args.flag.split(',')]
    reads = dict()
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        for read in inbam:
            if read.flag in codes and read.cigarstring:
                reads[read.query_name] = read.cigarstring

    # 2. Parsing CIGAR string for left and right soft-clipping
    clips = defaultdict(dict)
    for key, value in reads.items():
        items = list(Cigar(value).items())
        clips[key]["LC"] = int(items[0][0]) if items[0][1] == "S" else 0
        clips[key]["RC"] = int(items[-1][0]) if items[-1][1] == "S" else 0

    # 3. Converting clips nested dict into pd dataframe, filtering on
    #    clipping criteria
    clips_df = pd.DataFrame.from_dict(clips, orient='index')
    clips_df = clips_df[(clips_df['LC'] >= args.clip_size_thresh) |
                        (clips_df['RC'] >= args.clip_size_thresh)]

    # 4. Extracting read ids from the table above and creating the new BAM
    #    file.  A set gives O(1) membership tests per record.  The file is
    #    opened in 'wb' mode so that the *.bam output really is BAM, and the
    #    context managers guarantee both handles are closed (flushing the
    #    output).  The BAM has to be re-opened because iterating a handle
    #    consumes it.
    big_clip = set(clips_df.index)
    counts = Counter()
    with pysam.AlignmentFile(args.bam, "rb") as inbam, \
            pysam.AlignmentFile(args.output + '_clipped.bam', 'wb',
                                template=inbam) as outfile:
        for read in inbam:
            if read.query_name in big_clip:
                outfile.write(read)
                # Number of records per read, needed for step 5.
                counts[read.query_name] += 1

    # 5. Creating bedpe file
    #    BEDPE format: chrom1, start1, end1, chrom2, start2, end2
    #    Right now this will only return reads that map to 2 places in the
    #    genome, in the future having it enabled for multi-mapping reads
    #    would be preferable
    N = args.splits
    unique_reads = set(name for name, value in counts.items() if value == N)

    splits = defaultdict(dict)
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        for read in inbam:
            if read.query_name not in unique_reads:
                continue
            if read.query_name not in splits:
                splits[read.query_name]["chromosome"] = read.reference_name
                splits[read.query_name]["start"] = str(read.reference_start)
                splits[read.query_name]["end"] = str(read.reference_end)
            else:
                splits[read.query_name]["chromosome"] += "," + read.reference_name
                splits[read.query_name]["start"] += "," + str(read.reference_start)
                splits[read.query_name]["end"] += "," + str(read.reference_end)

    bedpe = pd.DataFrame.from_dict(splits, orient='index')
    bedpe[['chrom1', 'chrom2']] = bedpe['chromosome'].str.split(
        ',',
        expand=True,
    )
    bedpe[['start1', 'start2']] = bedpe['start'].str.split(
        ',',
        expand=True,
    )
    bedpe[['end1', 'end2']] = bedpe['end'].str.split(
        ',',
        expand=True,
    )
    bedpe = bedpe[["chrom1", "start1", "end1", "chrom2", "start2", "end2"]]
    chr_dict = {
        "NC_000001.11": "chr1",
        "NC_000002.12": "chr2",
        "NC_000003.12": "chr3",
        "NC_000004.12": "chr4",
        "NC_000005.10": "chr5",
        "NC_000006.12": "chr6",
        "NC_000007.14": "chr7",
        "NC_000008.11": "chr8",
        "NC_000009.12": "chr9",
        "NC_000010.11": "chr10",
        "NC_000011.10": "chr11",
        "NC_000012.12": "chr12",
        "NC_000013.11": "chr13",
        "NC_000014.9": "chr14",
        "NC_000015.10": "chr15",
        "NC_000016.10": "chr16",
        "NC_000017.11": "chr17",
        "NC_000018.10": "chr18",
        "NC_000019.10": "chr19",
        "NC_000020.11": "chr20",
        "NC_000021.9": "chr21",
        "NC_000022.11": "chr22",
        "NC_000023.11": "chrX",
        "NC_000024.10": "chrY"
    }
    if args.alt_chroms:
        bedpe['chrom1'] = bedpe['chrom1'].map(chr_dict)
        bedpe['chrom2'] = bedpe['chrom2'].map(chr_dict)
    bedpe.to_csv(args.output + "_split_reads.bedpe", index=False, sep='\t')

    # 6. Optional Methylation filtering
    if args.meth is not None:
        meth = pd.read_csv(args.meth, sep='\t')
        meth = meth[meth.read_name.isin(big_clip)]
        meth.to_csv(args.output + '_clipped_meth.tsv', index=False, sep='\t')
def find_sa_chimeras(
        bam,
        fh_out,
        sa_singletons_only=False,  # anything with dist below min_chim_dist but >0
        min_chim_dist=MIN_CHIM_DIST,  # anything below min_chim_dist is called sa_singleton (split alignment singleton)
        max_nm_perc=100,
        ign_string=None,
        ign_max_err=0,
        add_nm=False):
    """Find chimeras from BWA-MEM split mappings (defined as chimeras here)

    Every mapped, non-QC-fail, non-duplicate primary alignment that carries
    an SA tag is paired with each of its SA entries.  A pair is written to
    ``fh_out`` (one ``str(chimera)`` per line) unless its halves cannot be
    ordered, overlap (``dist2d() < 0``) or fall on the wrong side of
    ``min_chim_dist``.

    Args:
        bam: path of the BAM file to read.
        fh_out: writable file handle for the chimeras.
        sa_singletons_only: if true keep only pairs with
            ``0 <= dist2d() < min_chim_dist``; otherwise keep only pairs
            with ``dist2d() >= min_chim_dist``.
        min_chim_dist: distance threshold between the two halves.
        max_nm_perc: skip alignments whose NM count exceeds this percentage
            of the aligned query length.
        ign_string: optional pattern; reads whose sequence matches it with
            at most ``ign_max_err`` errors (fuzzy ``regex`` match) are
            ignored.
        ign_max_err: error budget for ``ign_string``.
        add_nm: also store the NM value on both halves of the chimera.
    """
    debug = False
    if debug:
        num_chimeras = 0

    fh_in = pysam.Samfile(bam)
    try:
        for (num_reads, read) in enumerate(fh_in):

            # Stop early in debug mode after 100 chimeras or 10000 reads
            # (the closing parenthesis used to be misplaced, comparing the
            # result of the `or` against 10000).
            if debug and (num_chimeras > 100 or num_reads > 10000):
                sys.stderr.write("DEBUG break\n")
                break

            if read.is_unmapped or read.is_qcfail or read.is_duplicate:
                continue
            if not read_is_primary(read):
                continue

            # pysam 0.7.7 uses seq and not query_sequence, qname and not
            # query_name
            qname = read.qname
            qseq = read.seq

            if ign_string:
                if regex.search("({}){{e<={}}}".format(ign_string, ign_max_err),
                                qseq, regex.BESTMATCH):
                    continue

            # MQ0 reads are deliberately kept: filtering on mapping quality
            # should be decided downstream (still valuable in large genomes).

            # finding split alignments via SA tag in primary alignment
            tags = dict(read.tags)
            if 'SA' not in tags:  # dict.has_key() is Python 2 only
                continue
            assert 'NM' in tags

            perc_nm = tags['NM'] * 100.0 / float(
                len(query_aln_seq(read.seq, read.cigarstring)))
            if perc_nm > max_nm_perc:
                continue

            ori_cigar = list(Cigar(read.cigarstring).items())
            # No need to determine the clip site here: overlap/proximity of
            # the mappings determines a chimera already.

            # SA == supplementary alignment
            # for definition of SA tag see:
            # https://sourceforge.net/p/samtools/mailman/message/30853577/
            # chr,strandPos,CIGAR,mapQ,NM;
            num_valid_sa = 0
            for (_sa_num, sa_tag) in enumerate(tags['SA'].rstrip(";").split(';')):
                sa_tag = dict(
                    zip(['chrom', 'pos', 'strand', 'cigar', 'mq', 'nm'],
                        sa_tag.split(",")))
                assert sa_tag['strand'] in ['+', '-']
                for k in ['pos', 'mq', 'nm']:
                    sa_tag[k] = int(sa_tag[k])

                # MQ0 supplementary alignments are kept as well (see above).
                sa_cigar = list(Cigar(sa_tag['cigar']).items())

                perc_nm = sa_tag['nm'] * 100.0 / float(
                    len(query_aln_seq(read.seq, sa_tag['cigar'])))
                if perc_nm > max_nm_perc:
                    continue

                # indirect testing of cigar2reflen()
                # pysam 0.7.7 has aend and not reference_end
                assert read.pos + cigar2reflen(ori_cigar) == read.aend

                chim = chimeras2.Chimera()
                chim.qname = qname

                # default is to assign primary to left and SA to right and
                # then sort
                chim.left.flag = read.flag
                # pysam 0.7.7 uses tid not reference_id
                chim.left.rname = fh_in.getrname(read.tid)
                chim.left.pos = read.pos
                chim.left.aend = read.aend
                chim.left.mapq = read.mapq
                chim.left.cigar = read.cigarstring
                chim.left.seq = query_aln_seq(qseq, chim.left.cigar)
                if add_nm:
                    chim.left.nm = tags['NM']

                chim.right.flag = 0 if sa_tag['strand'] == '+' else 16
                chim.right.rname = sa_tag['chrom']
                chim.right.pos = sa_tag['pos']
                chim.right.aend = sa_tag['pos'] + cigar2reflen(sa_cigar)
                chim.right.mapq = sa_tag['mq']
                chim.right.cigar = sa_tag['cigar']
                chim.right.seq = query_aln_seq(qseq, chim.right.cigar)
                if add_nm:
                    chim.right.nm = sa_tag['nm']

                chim.sort_halves()
                if chim.order == "invalid":
                    # likely revcomp vs non-revcomp
                    continue

                # no overlap allowed ever
                if chim.dist2d() < 0:
                    continue

                # sa_singleton and wanted?
                if chim.dist2d() < min_chim_dist:
                    if not sa_singletons_only:
                        continue
                else:
                    if sa_singletons_only:
                        continue

                chim.sanity_check()

                num_valid_sa += 1
                if num_valid_sa > 1:
                    sys.stderr.write(
                        "WARN: More than one valid SA found for {}\n".format(
                            qname))

                fh_out.write("{}\n".format(chim))
                if debug:
                    num_chimeras += 1
    finally:
        # Release the BAM handle even when an assertion or parse error
        # aborts the scan.
        fh_in.close()
idList = list(set(idList)) targetFile1 = open( '%s/%sintermediateFiles/mappedReadPairs_all_bwa.csv_%s' % (sys.argv[1], sys.argv[5], sys.argv[2]), 'a') targetFile2 = open( '%s/%sintermediateFiles/chimericReadPairs_all_bwa.csv_%s' % (sys.argv[1], sys.argv[5], sys.argv[2]), 'a') targetFile2.write( 'readId,R1Tx,R1start,R1end,R1Gene,R1Cigar,R2Tx,R2start,R2end,R2Gene,R2Cigar\n' ) for readId in idList: geneList1 = ';'.join(list(dicReadIdGene1[readId])) geneList2 = ';'.join(list(dicReadIdGene2[readId])) cigar1, cigar2 = Cigar(dicIdtoCigar1[readId][0]), Cigar( dicIdtoCigar2[readId][0]) [txId1, start1, end1] = dicReadIdPos1[readId][0] [txId2, start2, end2] = dicReadIdPos2[readId][0] gene1, gene2 = dicIdGeneName[txId1], dicIdGeneName[txId2] type1, type2 = dicIdGeneType[txId1], dicIdGeneType[txId2] targetFile1.write( '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (readId, gene1, gene2, txId1, start1, end1, str(cigar1), type1, geneList1, txId2, start2, end2, str(cigar2), type2, geneList2)) if len(set(geneList1) & set(geneList2)) == 0: if type1 == 'mRNA' and type2 == 'mRNA': #check cigar string cigar1, cigar2 = Cigar(dicIdtoCigar1[readId][0]), Cigar( dicIdtoCigar2[readId][0])
def adjust_mapping(mapping, overlap_region):
    """Clip a copy of ``mapping`` so its target span lies inside ``overlap_region``.

    Column layout used by this function: ``mapping[2]``/``mapping[3]`` are the
    query start/end, ``mapping[7]``/``mapping[8]`` the target start/end,
    ``mapping[9]`` the number of M bases and ``mapping[10]`` the number of
    M+I+D bases. The cigar is read from the first ``cg:Z:`` field found at
    index 12 or later.

    Args:
        mapping: indexable mapping record (presumably a parsed PAF-like row
            with integer coordinates -- confirm against the caller). It is
            deep-copied; the caller's object is never modified.
        overlap_region: pair ``(start, end)`` of target coordinates to clip to.

    Returns:
        The adjusted copy after it has been passed through
        ``drop_unadjusted_fields``, or ``None`` when clipping leaves no cigar
        operation at all (e.g. the overlap region only covers a deletion).

    Raises:
        ValueError: if the record has no ``cg:Z:`` field, or the cigar
            contains an operation other than M, I, D, H or S where clipping
            has to consume it.
    """
    # Don't adjust the original mapping, adjust a copy.
    mapping = copy.deepcopy(mapping)

    # Find the cigar and its field among the SAM-like typed key-value pairs
    # at the end of the record.
    cig = None
    cig_field = None
    for field in range(12, len(mapping)):
        if mapping[field][:5] == "cg:Z:":
            cig = col.deque(Cigar(mapping[field][5:]).items())
            cig_field = field
            break
    if cig is None:
        raise ValueError('No cg:Z: cigar field found in mapping')

    # Kept for error messages: the field itself is only rewritten at the end.
    original_cigar = mapping[cig_field]

    # Clip the start of the mapping up to the start of the overlap region.
    if mapping[7] < overlap_region[0]:
        amount = overlap_region[0] - mapping[7]
        if not _trim_cigar_end(cig, mapping, amount, True, original_cigar):
            return None

    # Clip the end of the mapping back to the end of the overlap region.
    if mapping[8] > overlap_region[1]:
        amount = mapping[8] - overlap_region[1]
        if not _trim_cigar_end(cig, mapping, amount, False, original_cigar):
            return None

    # A cigar that starts or ends on an insertion/deletion doesn't make
    # biological sense, so strip those (end first, then start).
    _strip_terminal_indels(cig, mapping, False)
    _strip_terminal_indels(cig, mapping, True)

    # The "overlap region" may turn out to cover only unmapped reference
    # (i.e. a deletion); then nothing is left to report.
    if not cig:
        return None

    # Recompute the "match count" field from what is left of the cigar.
    mapping[9] = sum(length for length, op in cig if op == "M")

    adjusted_cigar = "".join(str(length) + op for length, op in cig)
    mapping[cig_field] = "cg:Z:" + adjusted_cigar

    mapping = drop_unadjusted_fields(mapping)
    return mapping


def _trim_cigar_end(cig, mapping, adjust_amount, from_start, cigar_text):
    """Consume ``adjust_amount`` target bases from one end of ``cig`` in place.

    Updates the query/target coordinates and ``mapping[10]`` accordingly.
    Returns ``False`` if the cigar runs out before the amount is consumed,
    ``True`` otherwise.
    """
    if from_start:
        end, query_col, target_col, step = 0, 2, 7, 1
        drop = cig.popleft
    else:
        end, query_col, target_col, step = -1, 3, 8, -1
        drop = cig.pop

    while adjust_amount > 0:
        if not cig:
            return False
        length, op = cig[end]
        if op == "M" or op == "D":
            # Only M and D advance along the target; M also advances the query.
            used = min(length, adjust_amount)
            if op == "M":
                mapping[query_col] += step * used
            mapping[target_col] += step * used
            mapping[10] -= used
            if used == length:
                drop()
            else:
                # The operation is longer than what is left to trim: shorten it.
                cig[end] = (length - used, op)
            adjust_amount -= used
        elif op == "I":
            # An insertion uses query bases but no target bases, so it is
            # dropped whole without reducing the amount left to trim.
            mapping[query_col] += step * length
            mapping[10] -= length
            drop()
        elif op in "HS":
            drop()
        else:
            raise ValueError('Inappropiate Cigar value ' + op + ' in cigar ' +
                             cigar_text)
    return True


def _strip_terminal_indels(cig, mapping, from_start):
    """Remove I/D operations sitting at one end of ``cig``, fixing coordinates."""
    if from_start:
        end, query_col, target_col, step = 0, 2, 7, 1
        drop = cig.popleft
    else:
        end, query_col, target_col, step = -1, 3, 8, -1
        drop = cig.pop

    while cig and cig[end][1] in "ID":
        length, op = cig[end]
        if op == "D":
            mapping[target_col] += step * length
        elif op == "I":
            mapping[query_col] += step * length
        else:
            # Not a real indel code (cannot be stripped); avoid looping forever.
            break
        mapping[10] -= length
        drop()
set(ctrl_df_dict[samp].iloc[:, 1])) u2 = set.intersection(set(ctrl_df_dict[samp].iloc[:, 0]), set(ctrl_df_dict[samp].iloc[:, 2])) u3 = set.intersection(set(ctrl_df_dict[samp].iloc[:, 1]), set(ctrl_df_dict[samp].iloc[:, 2])) print("Number of Controls:", len(ctrl_df_dict[samp].columns)) u = u1 | u2 | u3 print("Length of Artifacts:", len(u1), len(u2), len(u3)) elif len(ctrl_df_dict[samp].columns) == 1: pos_list = [] changes = ctrl_df_dict[samp].iloc[:, 0].tolist() for cig in changes: CIGAR_edit = Cigar(cig) CIG_list = list(CIGAR_edit.items()) for index, tup in enumerate(CIG_list): ch = CIG_list[index][1] if ch == 'X': slice_l = CIG_list[:index + 1] pos = str(sum([t[0] for t in slice_l])) variant = slice_l[-1][1] pos_list.append(pos) else: pass artifact_pos = [
def map_pos(dna_pos, cigar_val, rna_query):
    """Return the genomic position of a transcript position.

    A transcript-to-genome lookup table is built by walking the cigar:

    * ``M``/``=``/``X`` advance both coordinates.
    * ``S`` gets extrapolated genomic coordinates: before ``dna_pos`` for a
      leading clip, after the last aligned base for a trailing clip.
    * ``D``/``N`` advance only the genome; the transcript side is labelled
      ``'<rna_pos>D'`` / ``'<rna_pos>N'``.
    * ``I`` advances only the transcript; the genome side is labelled
      ``'<dna_pos>I'``.
    * ``H`` and ``P`` consume neither sequence and add no rows.

    Args:
        dna_pos: read mapping start position on a chromosome
        cigar_val: cigar string
        rna_query: transcript position

    Returns:
        Genomic position of the first table row whose transcript entry
        equals ``rna_query`` (a label string such as ``'105I'`` when the
        position falls inside an insertion).

    Raises:
        IndexError: if ``rna_query`` is not present in the table.
    """
    # Running transcript coordinate and the two columns of the lookup table.
    rna_pos = 0
    rna_map = []
    dna_map = []

    # Using the cigar string, build the transcript to genomic position table.
    for c_len, c_type in Cigar(cigar_val).items():
        # Cigar type: match, mismatch
        if c_type in ("M", "=", "X"):
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            rna_pos += c_len
            dna_pos += c_len
        # Cigar type: soft clip
        elif c_type == "S":
            # Only a leading clip lies before the mapping start; rewinding on
            # a trailing clip would reuse already-aligned genomic positions.
            if not dna_map:
                dna_pos -= c_len
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            rna_pos += c_len
            dna_pos += c_len
        # Cigar type: deletion / skipped region in the read
        elif c_type in ("D", "N"):
            rna_map.extend([str(rna_pos) + c_type] * c_len)
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            dna_pos += c_len
        # Cigar type: insertion in the read
        elif c_type == "I":
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend([str(dna_pos) + 'I'] * c_len)
            rna_pos += c_len
        # Cigar types hard clip (H) and padding (P) consume neither the read
        # nor the reference, so they leave both coordinates untouched.

    # Convert the table to a data frame
    pos_map_df = pd.DataFrame(list(zip(rna_map, dna_map)),
                              columns=['rna', 'dna'])

    # Get genomic position for the transcript position query
    dna_val = pos_map_df[pos_map_df['rna'] == rna_query]
    if dna_val.empty:
        raise IndexError(
            'transcript position %r is not covered by cigar %r' %
            (rna_query, cigar_val))

    # Return genomic position only
    return dna_val['dna'].values[0]