def get_position_in_ref(self, pos, ref):
    """Translate read position ``pos`` to a reference coordinate via bwa.

    The slice ``[pos - 50:pos + 50]`` of ``self.seq`` and ``self.quality`` is
    written to ``temp_files/temp_fastq.fastq`` as one FASTQ record, aligned
    against ``ref`` with ``bwa mem`` and the first alignment record found in
    ``temp_files/temp.sam`` is used for the translation.

    Args:
        pos: index into ``self.seq`` / ``self.quality``.
        ref: reference path handed to ``bwa mem``. It is interpolated into a
            shell command line as-is, so it must come from a trusted source.

    Returns:
        ``POS + 49 + shift`` of the first alignment record, where ``shift``
        is +1 for every deleted and -1 for every inserted base met before the
        50th counted base of the window. ``None`` is returned when the SAM
        file has no alignment record, when the record is unmapped (RNAME is
        ``*``), or when the CIGAR is exhausted before 50 bases were counted.
    """
    # NOTE(review): for pos < 50 the slice start is negative and wraps around
    # to the end of the sequence -- confirm callers never pass such positions.
    window_seq = self.seq[pos - 50:pos + 50]
    window_qual = self.quality[pos - 50:pos + 50]
    with open('temp_files/temp_fastq.fastq', 'w') as fw:
        fw.write('@temp\n' + window_seq + '\n+\n' + window_qual)

    subprocess.run('bwa mem -M -x ont2d -t 7 ' + ref +
                   ' temp_files/temp_fastq.fastq > temp_files/temp.sam',
                   shell=True,
                   stdout=FNULL,
                   stderr=subprocess.STDOUT)

    # Take the first non-header record instead of assuming that the header is
    # exactly two lines long (there is one @SQ line per reference sequence).
    row = None
    with open('temp_files/temp.sam') as f:
        for line in f:
            if line.startswith('@') or not line.strip():
                continue
            row = line.strip().split()
            break
    if row is None or len(row) < 6 or row[2] == '*':
        return None

    ref_pos = int(row[3])
    shift = 0
    counted = 0
    # NOTE(review): every operation other than 'D' is counted as one window
    # base and only 'I'/'D' adjust the shift; clipping operations are not
    # treated specially -- confirm this is intended.
    for length, op in Cigar(row[5]).items():
        for _ in range(length):
            if op == 'D':
                shift += 1
                continue
            counted += 1
            if op == 'I':
                shift -= 1
            if counted == 50:
                return ref_pos + 49 + shift
    # Fewer than 50 bases were walked: there is no coordinate to report.
    return None
def getGeneLocation(hit, gene):
    """Summarise where the alignments in ``hit`` place ``gene``.

    Args:
        hit: iterable of alignment records; each is indexed with the keys
            'cigar', 'seq' and 'ss'.
        gene: gene sequence; its length is the denominator of the coverage.

    Returns:
        dict with the keys 'note', 'hit' and 'consensus'; when at least one
        alignment was given it also carries 'pos' and 'coverage'.
    """
    loc = {'note': ''}

    # Parse every alignment into a small record.
    records = []
    for aln in hit:
        ops = list(Cigar(aln['cigar']).items())
        head, tail = ops[0], ops[-1]
        # The sequence is trimmed only when BOTH ends are soft-clipped;
        # every other combination keeps it untouched.
        if head[1] == 'S' and tail[1] == 'S':
            seq = aln['seq'][head[0]:-tail[0]]
        else:
            seq = aln['seq']
        start = int(aln['ss'])
        end = start
        for length, op in ops:
            if op in ('M', 'D'):
                end += int(length)
            elif op == 'I':
                end -= int(length)
        records.append({
            'seq': seq,
            'pos': int(head[0]),
            'ss': start,
            'send': end,
        })

    by_start = sorted(records, key=lambda rec: rec['ss'])
    if not by_start:
        loc['consensus'] = ''
        loc['hit'] = []
        loc['note'] = 'Not found'
        return loc

    # Chain the records that start beyond the end of the previous chained one.
    chain = [by_start[0]]
    for rec in by_start:
        last = chain[-1]
        if rec['ss'] > last['ss'] + len(last['seq']):
            chain.append(rec)
    if len(chain) < len(by_start):
        loc['note'] = "Multiple sequences found"

    loc['hit'] = by_start
    loc['pos'] = by_start[0]['pos']
    # The covered length is taken before the consensus is built.
    covered = sum(len(rec['seq']) for rec in chain)
    loc['consensus'] = makeConsensus(chain, gene)
    loc['coverage'] = covered / len(gene)
    return loc
def alignment_length_cigar(CIGAR):
    '''
    Compute the length an alignment spans on the reference from its CIGAR string

    Input:
        1. CIGAR: CIGAR string

    Output:
        1. alignmentLen: alignment length on the reference
    '''
    ## Operations that add to the reference length:
    # - M (alignment match), = (sequence match), X (sequence mismatch)
    # - D (deletion from the reference), N (skipped region from the reference)
    refOperations = ('M', '=', 'X', 'D', 'N')

    ## Sum the lengths of those operations; every other operation is ignored
    return sum(
        int(length) for length, operation in Cigar(CIGAR).items()
        if operation in refOperations)
def parse_cigar(aln, qryseq, refseq, cutoff=500):
    """Split a pairwise alignment into blocks by walking its CIGAR string.

    Matches and short insertions/deletions are accumulated into the current
    block; a clip, deletion or insertion of at least ``cutoff`` closes the
    current block and is emitted as a block of its own.

    Args:
        aln: CIGAR string (parsed with ``cigar.Cigar``).
        qryseq: query sequence, sliced with the running query coordinate.
        refseq: reference sequence, sliced with the running reference
            coordinate.
        cutoff: minimum operation length that forces a block boundary.

    Returns:
        Tuple ``(qrys, refs, blks)`` of three lists of equal length (asserted
        before returning). ``qrys[i]`` / ``refs[i]`` is the ``(start, end)``
        interval block ``i`` covers on the query / reference, or ``None``.
        ``blks[i]`` is ``(sequence, (Q, qrymap), (R, refmap))`` where
        ``sequence`` is a numpy array of characters, ``Q`` / ``R`` are dicts
        keyed by block position and ``qrymap`` / ``refmap`` are the transposed
        arrays of the breakpoints recorded by ``recordbp``.
    """
    from cigar import Cigar

    aln = Cigar(aln)

    # Left/right ends of the interval currently accumulated on the query
    # (lq, rq) and on the reference (lr, rr).
    lq, rq = 0, 0
    lr, rr = 0, 0

    refs = []
    qrys = []
    blks = []

    # State of the block under construction:
    #   Q: block position -> query base where it differs from the stored
    #      sequence, or '-' for a short deletion
    #   R: block position -> '-' for a short insertion
    R, Q = {}, {}
    blkseq = ""
    blkpos = 0
    refmap = [(rr, blkpos - rr)]
    qrymap = [(rq, blkpos - rq)]

    def push(qval=None, rval=None):
        # Close the current block: record its query/reference intervals and,
        # if at least one of them was recorded, store the block and reset the
        # per-block state.
        nonlocal R, Q, blkseq, blkpos, refmap, qrymap
        assert not (qval is None and rval is None)

        def f(xs, x):
            # Append x to xs when it is None or a non-empty interval and
            # report whether something was appended.
            if x is None:
                xs.append(None)
                return True
            else:
                # zip() over the 2-tuple yields two 1-tuples, so l and r are
                # 1-tuples and `l < r` compares them as tuples.
                l, r = zip(x)
                if l < r:
                    xs.append(x)
                    return True
            return False

        hasq = f(qrys, qval)
        hasr = f(refs, rval)

        if hasq or hasr:
            assert len(qrys) == len(refs)
            assert len(blkseq) > 0, "empty seq"
            blks.append((np.array(list(blkseq)), (Q, np.array(qrymap).T),
                         (R, np.array(refmap).T)))
            R, Q = {}, {}
            blkseq = ""
            blkpos = 0
            refmap = [(rr, blkpos - rr)]
            qrymap = [(rq, blkpos - rq)]

    def recordbp():
        # Record a breakpoint: the current coordinates together with their
        # offset to the current end of the block sequence.
        nonlocal blkpos, refmap, qrymap
        blkpos = len(blkseq)
        refmap.append((rr, blkpos - rr))
        qrymap.append((rq, blkpos - rq))

    for l, t in aln.items():
        if t in ['S', 'H']:
            if l >= cutoff:
                # NOTE(review): debugging leftovers -- this prints the CIGAR
                # and drops into the ipdb debugger for every clip of at least
                # `cutoff` bases.
                print(aln)
                import ipdb
                ipdb.set_trace()
                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                # TODO: Think through soft/hard clips
                # if t == 'S':
                rq += l
                recordbp()

                # The clipped stretch becomes a query-only block.
                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                # Short clip: advance the query without adding sequence.
                rq += l
                recordbp()
        elif t == 'M':
            rs = np.array(list(refseq[rr:rr + l]))
            qs = np.array(list(qryseq[rq:rq + l]))
            diff = np.where(np.array(rs != qs))[0]
            # Remember the query base at every mismatching position; the
            # block itself stores the reference sequence.
            for i in diff:
                Q[i + blkpos] = qs[i]
            blkseq += refseq[rr:rr + l]

            rq += l
            rr += l
            recordbp()
        elif t == 'D':
            if l >= cutoff:
                push((lq, rq), (lr, rr))

                blkseq = refseq[rr:rr + l]
                rr += l
                recordbp()

                # The deleted stretch becomes a reference-only block.
                push(None, (rr - l, rr))
                lr = rr
                lq = rq
            else:
                # Short deletion: keep the reference bases and mark them as
                # gaps on the query side.
                for i in range(l):
                    Q[i + blkpos] = '-'
                blkseq += refseq[rr:rr + l]
                rr += l
                recordbp()
        elif t == 'I':
            if l >= cutoff:
                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                rq += l
                recordbp()

                # The inserted stretch becomes a query-only block.
                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                # Short insertion: keep the query bases and mark them as gaps
                # on the reference side.
                for i in range(l):
                    R[i + blkpos] = '-'
                blkseq += qryseq[rq:rq + l]
                rq += l
                recordbp()

    # Flush whatever is left in the block under construction.
    push((lq, rq), (lr, rr))
    assert len(qrys) == len(refs) and len(qrys) == len(blks)

    return qrys, refs, blks
dicIdtoCigar2[readId][0]) [txId1, start1, end1] = dicReadIdPos1[readId][0] [txId2, start2, end2] = dicReadIdPos2[readId][0] gene1, gene2 = dicIdGeneName[txId1], dicIdGeneName[txId2] type1, type2 = dicIdGeneType[txId1], dicIdGeneType[txId2] targetFile1.write( '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (readId, gene1, gene2, txId1, start1, end1, str(cigar1), type1, geneList1, txId2, start2, end2, str(cigar2), type2, geneList2)) if len(set(geneList1) & set(geneList2)) == 0: if type1 == 'mRNA' and type2 == 'mRNA': #check cigar string cigar1, cigar2 = Cigar(dicIdtoCigar1[readId][0]), Cigar( dicIdtoCigar2[readId][0]) cigar1List = list(cigar1.items()) cigar2List = list(cigar2.items()) flag1 = False flag2 = False totalLength1 = float(sum([x[0] for x in cigar1List])) totalLength2 = float(sum([x[0] for x in cigar2List])) if cigar1List[0][ 1] == 'M' and cigar1List[0][0] / totalLength1 >= 0.5: flag1 = True if (cigar1List[0][1] == 'S' or cigar1List[0][1] == 'H') and cigar1List[0][0] / totalLength1 <= 0.2: if cigar1List[1][ 1] == 'M' and cigar1List[1][0] / totalLength1 >= 0.5: flag1 = True if cigar2List[0][ 1] == 'M' and cigar2List[0][0] / totalLength2 >= 0.5:
def alignment_interval_query(CIGAR, orientation):
    '''
    Compute the interval of the raw query covered by an alignment from its CIGAR string

    Input:
        1. CIGAR: CIGAR string
        2. orientation: alignment orientation (+ or -)

    Output:
        1. beg: begin position in query
        2. end: end position in query
    '''
    cigar = Cigar(CIGAR)

    ## Operations that add to the aligned query length:
    # M (alignment match), = (sequence match), X (sequence mismatch),
    # I (insertion to the reference)
    queryOperations = ('M', '=', 'X', 'I')
    clippingOperations = ('S', 'H')

    alignedLen = 0

    for index, cigarTuple in enumerate(list(cigar.items())):
        nbBases = int(cigarTuple[0])
        operation = cigarTuple[1]

        ## A leading soft or hard clip sets the start offset; otherwise it is 0
        if index == 0:
            offset = nbBases if operation in clippingOperations else 0

        if operation in queryOperations:
            alignedLen += nbBases

    ## Read length as reported by the Cigar object
    # NOTE(review): whether len(cigar) includes hard-clipped bases depends on
    # the Cigar implementation; the '-' interval relies on it -- confirm
    totalLen = len(cigar)

    # a) Query aligned in +
    if orientation == '+':
        return offset, offset + alignedLen

    # b) Query aligned in - (reverse complemented to align): mirror the interval
    return totalLen - offset - alignedLen, totalLen - offset
def map_pos(dna_pos, cigar_val, rna_query):
    """
    Return genomic position of a transcript position.

    Args:
        dna_pos: read mapping start position on a chromosome
        cigar_val: cigar string
        rna_query: transcript position

    Returns:
        Genomic position paired with the first table entry equal to
        rna_query
    """
    operations = list(Cigar(cigar_val).items())

    rna_cursor = 0
    dna_cursor = dna_pos

    #Parallel columns of the transcript -> genome mapping table
    rna_track = []
    dna_track = []

    for op_len, op in operations:
        #Match, mismatch and soft clip advance both coordinates
        if op in ("M", "=", "X", "S"):
            #A soft clip first moves the genomic cursor back by its length
            if op == "S":
                dna_cursor = dna_cursor - op_len
            rna_track.extend(range(rna_cursor, rna_cursor + op_len))
            dna_track.extend(range(dna_cursor, dna_cursor + op_len))
            rna_cursor = rna_track[-1] + 1
            dna_cursor = dna_track[-1] + 1
        #Deletion and skipped region advance the genome only; the transcript
        #column gets a tagged placeholder such as '12D' or '12N'
        elif op in ("D", "N"):
            rna_track.extend([str(rna_cursor) + op] * op_len)
            dna_track.extend(range(dna_cursor, dna_cursor + op_len))
            dna_cursor = dna_track[-1] + 1
        #Insertion and padding advance the transcript only; the genome
        #column gets a tagged placeholder such as '100I' or '100P'
        elif op in ("I", "P"):
            rna_track.extend(range(rna_cursor, rna_cursor + op_len))
            dna_track.extend([str(dna_cursor) + op] * op_len)
            rna_cursor = rna_track[-1] + 1
        #Hard clip (and anything else) changes nothing

    pos_map_df = pd.DataFrame(list(zip(rna_track, dna_track)),
                              columns=['rna', 'dna'])

    #Look up the queried transcript position and return its genomic partner
    matches = pos_map_df.loc[pos_map_df['rna'] == rna_query, 'dna']
    return (matches.values[0])
filter_df['read_len'] = len_list filter_df['start_coo'] = ref_st_list filter_df['loc'] = filter_df.index filter_df = filter_df.sort_values(['Score'], ascending=[False]) filter_df = filter_df.drop_duplicates(subset=['Read_Name'], keep='first') CIG_dict = dict(zip(filter_df['CIGAR_list'], filter_df['Read_Name'])) COO_dict = dict(zip(filter_df['CIGAR_list'], filter_df['start_coo'])) read_len_dict = dict(zip(filter_df['CIGAR_list'], filter_df['read_len'])) for c_str in filter_df['CIGAR_list']: CIGAR_edit = Cigar(c_str) l = len(CIGAR_edit) CIG_list = list(CIGAR_edit.items()) for index, tup in enumerate(CIG_list): if CIG_list[0][1] is 'S': CIG_list.remove(CIG_list[0]) else: pass ch = CIG_list[index][1] if ch in String_list: slice_l = CIG_list[:index + 1] slice_m = CIG_list[:index] pos2 = str(sum([t[0] for t in slice_m if t[1] != 'I']) + 1)
l = line.split("\t") readID = l[0] chrName = l[2] filterDict[readID] = [] filterDict[readID].append(chrName) filterDict[readID] = set(filterDict[readID]) samFile.close() samFile2 = open("QH046cDNA.sam", "r") for line2 in samFile2: if line2.startswith("@"): print line2 else: l2 = line2.split("\t") readID2 = l[0] if len(filterDict[readID2]) > 2: continue else: seqLength = len(l2[9]) cigar = Cigar(l2[5]) cigarList = list(cigar.items()) mapp = 0 for i in cigarList: if i[1] == "M": mapp += i[0] else: continue if seqLength - mapp <= 1: print line2 samFile2.close()
def _parse_bool_flag(value):
    """Convert a command-line string such as 'True' or 'no' into a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected True or False, got %r' %
                                     (value, ))


def main():
    """Extract heavily soft-clipped split reads from a BAM file.

    Command-line entry point. Written outputs, all prefixed with --output:
      * ``_clipped.bam``: every alignment of the reads whose selected
        alignment carries a left or right soft clip >= --clip_size_thresh;
      * ``_split_reads.bedpe``: coordinates of those reads that have exactly
        --splits alignments in the BAM file;
      * ``_clipped_meth.tsv``: (only with --meth) the rows of the methylation
        table belonging to those reads.
    """
    parser = argparse.ArgumentParser(
        description=
        'Parse BAM file for multi-alignment and soft-clipped reads. Will return a BAM file containing split reads and a BEDPE file containing the coordinates of the split reads (useful for circos plots, etc.). Can additionally return a filtered methylation TSV file if one is provided (optional)'
    )
    required = parser.add_argument_group(
        'Required',
        'Bam, clip size to filter on, output location, flags to filter on, splits to filter on, and True/False if alternative chromosomes were used'
    )
    # --bam and --output have no usable default, so a missing value is
    # reported by argparse instead of crashing later.
    required.add_argument('-b',
                          '--bam',
                          type=str,
                          required=True,
                          help='bam file - must be created with NGMLR')
    required.add_argument('-c',
                          '--clip_size_thresh',
                          type=int,
                          help='soft clip size threshold to filter on [1000]',
                          default=1000)
    required.add_argument('-o',
                          '--output',
                          type=str,
                          required=True,
                          help='output location and prefix')
    required.add_argument(
        '-f',
        '--flag',
        type=str,
        help=
        'flag(s) to filter bam file on. delimited list, default 256,2048,2304',
        default='256,2048,2304')
    required.add_argument(
        '-s',
        '--splits',
        type=int,
        help=
        'Number of splits read aligns to filter on for bedpe file (2 only option right now, hope to change in the future)[2]',
        default=2)
    # type=bool would turn every non-empty string (including "False") into
    # True, hence the explicit converter.
    required.add_argument(
        '-a',
        '--alt_chroms',
        type=_parse_bool_flag,
        help=
        'Does BAM file use alternative chromosome names? (i.e. NC_000001.11, etc.) [False]',
        default=False)
    optional = parser.add_argument_group(
        'Optional', 'methylation call tsv file (from f5c)')
    optional.add_argument(
        '-m',
        '--meth',
        type=str,
        help='Methylation calls tsv file to filter (from f5c)')
    args = parser.parse_args()

    # 1. Read the BAM file and keep the CIGAR string of every alignment whose
    #    flag is one of the requested values (the last one wins per read).
    codes = {int(item) for item in args.flag.split(',')}
    reads = dict()
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        for read in inbam:
            if read.flag in codes and read.cigarstring is not None:
                reads[read.query_name] = read.cigarstring

    # 2./3. Measure the left and right soft clip of each kept CIGAR string
    #    and retain the reads where either one reaches the threshold. A set
    #    keeps the membership tests below O(1).
    big_clip = set()
    for name, cigarstring in reads.items():
        items = list(Cigar(cigarstring).items())
        left_clip = int(items[0][0]) if items[0][1] == "S" else 0
        right_clip = int(items[-1][0]) if items[-1][1] == "S" else 0
        if (left_clip >= args.clip_size_thresh
                or right_clip >= args.clip_size_thresh):
            big_clip.add(name)

    # 4./5. One more pass over the BAM file: write every alignment of the
    #    retained reads to the new BAM file and collect their coordinates.
    #    The iterator of an AlignmentFile is exhausted after one pass, hence
    #    the file is opened again. Mode 'wb' writes BAM ('w' would write SAM
    #    text), and the context manager makes sure the output is flushed.
    alignments = defaultdict(list)
    with pysam.AlignmentFile(args.bam, "rb") as inbam, \
            pysam.AlignmentFile(args.output + '_clipped.bam', 'wb',
                                template=inbam) as outfile:
        for read in inbam:
            if read.query_name in big_clip:
                outfile.write(read)
                alignments[read.query_name].append(
                    (read.reference_name, str(read.reference_start),
                     str(read.reference_end)))

    # BEDPE format: chrom1, start1, end1, chrom2, start2, end2
    # Only reads with exactly N alignments are reported. Right now N will
    # always be 2; support for multi-mapping reads would be preferable.
    N = args.splits
    columns = []
    for i in range(1, N + 1):
        columns.extend(['chrom%d' % i, 'start%d' % i, 'end%d' % i])
    rows = [[field for record in records for field in record]
            for records in alignments.values() if len(records) == N]
    # Passing the columns explicitly keeps the table well-formed (header
    # only) when no read qualifies.
    bedpe = pd.DataFrame(rows, columns=columns)

    chr_dict = {
        "NC_000001.11": "chr1",
        "NC_000002.12": "chr2",
        "NC_000003.12": "chr3",
        "NC_000004.12": "chr4",
        "NC_000005.10": "chr5",
        "NC_000006.12": "chr6",
        "NC_000007.14": "chr7",
        "NC_000008.11": "chr8",
        "NC_000009.12": "chr9",
        "NC_000010.11": "chr10",
        "NC_000011.10": "chr11",
        "NC_000012.12": "chr12",
        "NC_000013.11": "chr13",
        "NC_000014.9": "chr14",
        "NC_000015.10": "chr15",
        "NC_000016.10": "chr16",
        "NC_000017.11": "chr17",
        "NC_000018.10": "chr18",
        "NC_000019.10": "chr19",
        "NC_000020.11": "chr20",
        "NC_000021.9": "chr21",
        "NC_000022.11": "chr22",
        "NC_000023.11": "chrX",
        "NC_000024.10": "chrY"
    }
    if args.alt_chroms:
        # Names missing from chr_dict become NaN (empty fields in the file).
        for i in range(1, N + 1):
            bedpe['chrom%d' % i] = bedpe['chrom%d' % i].map(chr_dict)
    bedpe.to_csv(args.output + "_split_reads.bedpe", index=False, sep='\t')

    # 6. Optional methylation filtering
    if args.meth is not None:
        meth = pd.read_csv(args.meth, sep='\t')
        meth = meth[meth.read_name.isin(big_clip)]
        meth.to_csv(args.output + '_clipped_meth.tsv', index=False, sep='\t')