def get_alternative_alignments(in_sam_line):
    """Expand the XA:Z tag of one SAM line into minimal SAM lines.

    Each ';'-separated segment of the XA:Z value is parsed with one of
    three comma-separated layouts:

      * chr,pos,strand,cigar,mapQ,nm
      * chr,[+-]pos,cigar,mapQ,nm
      * chr,[+-]pos,cigar,mapQ

    Args:
        in_sam_line: one tab-delimited SAM alignment line.

    Returns:
        A list of tab-delimited SAM lines (11 columns, no newline), one
        per alternative alignment.  FLAG is '16' for '-' strand hits and
        '0' otherwise; RNEXT/PNEXT/TLEN/SEQ/QUAL are '*', '0', '0', '*',
        '*'.  The list is empty when the line has no optional tags or no
        XA:Z tag.

    Raises:
        SystemExit: after writing a message to stderr, when a non-empty
            segment matches none of the layouts.
    """
    f = in_sam_line.rstrip().split("\t")
    # A SAM record has 11 mandatory columns, so optional tags start at
    # index 11; a record with a single optional tag has 12 columns.
    if len(f) < 12:
        return []  # no optional tags at all
    enstring = "\t".join(f[11:])
    m = re.search(r'XA:Z:(\S+)', enstring)
    if not m:
        return []  # no XA:Z tag on this line
    bwalike = re.compile(r'^([^,]+),(\d+),([+-]),([^,]+),(\d+),(\d+)$')
    otherlike = re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+),(\d+)$')
    otherlike2 = re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+)$')
    output = []
    for align in m.group(1).split(';'):
        if align == '':
            continue  # a trailing ';' leaves an empty segment
        m1 = bwalike.match(align)
        m2 = otherlike.match(align)
        m3 = otherlike2.match(align)
        if m1:
            chrom = m1.group(1)
            pos = m1.group(2)
            strand = m1.group(3)
            cigar = m1.group(4)
            mapQ = m1.group(5)
        elif m2 or m3:
            # Both signed-position layouts share the first five groups.
            hit = m2 or m3
            chrom = hit.group(1)
            pos = hit.group(3)
            strand = hit.group(2)
            cigar = hit.group(4)
            mapQ = hit.group(5)
        else:
            sys.stderr.write("WARNING: unable to parse secondary alignment\n"
                             + align + "\n")
            sys.exit()
        flag = '16' if strand == '-' else '0'
        # SEQ and QUAL are emitted as '*', so the read sequence and
        # qualities are not needed here.
        samline = "\t".join(
            [f[0], flag, chrom, pos, mapQ, cigar, "*", "0", "0", "*", "*"])
        output.append(samline)
    return output
def get_alternative_alignments(in_sam_line):
    """Turn the XA:Z alternative hits of a SAM line into SAM lines.

    The XA:Z value is split on ';' and every non-empty segment is
    matched against three comma-separated layouts, tried in this order:

      1. chr,pos,strand,cigar,mapQ,nm
      2. chr,[+-]pos,cigar,mapQ,nm
      3. chr,[+-]pos,cigar,mapQ

    Args:
        in_sam_line: one tab-delimited SAM alignment line.

    Returns:
        A list with one 11-column, tab-delimited SAM line per parsed
        segment (query name taken from the input line, FLAG '16' for
        '-' strand and '0' otherwise, SEQ and QUAL set to '*').  Empty
        when there are no optional tags or no XA:Z tag.

    Raises:
        SystemExit: after a message on stderr, if a segment matches none
            of the layouts.
    """
    fields = in_sam_line.rstrip().split("\t")
    # Columns 0-10 are mandatory in SAM, so index 11 is the first
    # optional tag; fewer than 12 columns means there are no tags.
    if len(fields) < 12:
        return []
    tag_text = "\t".join(fields[11:])
    found = re.search(r'XA:Z:(\S+)', tag_text)
    if not found:
        return []  # no XA:Z tag present

    # (pattern, position group, strand group); group 1 is always the
    # reference name, group 4 the CIGAR and group 5 the mapQ value.
    layouts = (
        (re.compile(r'^([^,]+),(\d+),([+-]),([^,]+),(\d+),(\d+)$'), 2, 3),
        (re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+),(\d+)$'), 3, 2),
        (re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+)$'), 3, 2),
    )

    output = []
    for align in found.group(1).split(';'):
        if not align:
            continue  # empty segment, e.g. after the trailing ';'
        for pattern, pos_group, strand_group in layouts:
            hit = pattern.match(align)
            if hit:
                break
        else:
            sys.stderr.write("WARNING: unable to parse secondary alignment\n"
                             + align + "\n")
            sys.exit()
        flag = '16' if hit.group(strand_group) == '-' else '0'
        # SEQ/QUAL are written as '*', so the read itself is not touched.
        output.append("\t".join([
            fields[0],
            flag,
            hit.group(1),
            hit.group(pos_group),
            hit.group(5),
            hit.group(4),
            "*", "0", "0", "*", "*",
        ]))
    return output
def add_genepred_line(self, inline):
    """Build a transcript sequence from a genePred line and store it.

    The exon slices of the entry's chromosome are taken from
    self.ref_hash, upper-cased, concatenated, reverse-complemented for
    '-' strand entries, and saved in self.transcripts under the entry
    name.  If the name is already present a warning is written to
    stderr and the stored sequence is overwritten.

    Args:
        inline: one genePred-formatted line.

    Raises:
        SystemExit: after an error message on stderr, when self.ref_hash
            is empty or unset.
    """
    if not self.ref_hash:
        sys.stderr.write("ERROR: Must assign a reference genome dictionary first\n")
        sys.exit()
    entry = GenePredBasics.GenePredEntry(inline)
    if entry.value('name') in self.transcripts:
        sys.stderr.write("WARNING: "+inline+" transcript was already set\n")
    exon_pieces = [
        self.ref_hash[entry.value('chrom')][
            entry.value('exonStarts')[idx]:entry.value('exonEnds')[idx]
        ].upper()
        for idx in range(0, entry.value('exonCount'))
    ]
    transcript = ''.join(exon_pieces)
    if entry.value('strand') == '-':
        transcript = SequenceBasics.rc(transcript)
    self.transcripts[entry.value('name')] = transcript
def write_genepred_to_fasta(gpd_filename, ref_fasta, out_fasta):
    """Write the spliced sequence of every genePred entry as FASTA.

    Lines of the genePred file starting with '#' are skipped, as are
    entries whose chromosome is not present in the reference.  For the
    remaining entries the exon slices are concatenated, reverse-
    complemented for '-' strand entries, upper-cased, and written as
    a '>name' header line followed by the sequence line.

    Args:
        gpd_filename: path of the genePred file to read.
        ref_fasta: path of the reference FASTA, loaded with
            SequenceBasics.read_fasta_into_hash.
        out_fasta: path of the FASTA file to create or overwrite.
    """
    # The context manager closes the output even if loading the
    # reference or parsing a line raises part-way through.
    with open(out_fasta, 'w') as ofile:
        ref = SequenceBasics.read_fasta_into_hash(ref_fasta)
        with open(gpd_filename) as f:
            for line in f:
                if line.startswith('#'):
                    continue  # header / comment line
                d = line_to_entry(line)
                if d['chrom'] not in ref:
                    continue  # no reference sequence for this entry
                chrom_seq = ref[d['chrom']]
                seq = ''.join(
                    chrom_seq[d['exonStarts'][i]:d['exonEnds'][i]]
                    for i in range(d['exonCount']))
                if d['strand'] == '-':
                    seq = SequenceBasics.rc(seq)
                ofile.write(">"+str(d['name'])+"\n"+seq.upper()+"\n")
def add_genepred_line(self, inline):
    """Register the transcript described by one genePred line.

    Exon slices are read from self.ref_hash for the entry's chromosome,
    upper-cased and joined in order; '-' strand entries are then
    reverse-complemented.  The result is stored in self.transcripts
    keyed by the entry name.  A name that is already stored triggers a
    warning on stderr and is replaced.

    Args:
        inline: one genePred-formatted line.

    Raises:
        SystemExit: after an error message on stderr, when no reference
            dictionary has been assigned (self.ref_hash is falsy).
    """
    if not self.ref_hash:
        sys.stderr.write(
            "ERROR: Must assign a reference genome dictionary first\n")
        sys.exit()

    record = GenePredBasics.GenePredEntry(inline)
    if record.value('name') in self.transcripts:
        sys.stderr.write("WARNING: " + inline +
                         " transcript was already set\n")

    parts = []
    for exon in range(0, record.value('exonCount')):
        chrom_seq = self.ref_hash[record.value('chrom')]
        begin = record.value('exonStarts')[exon]
        end = record.value('exonEnds')[exon]
        parts.append(chrom_seq[begin:end].upper())
    spliced = ''.join(parts)

    if record.value('strand') == '-':
        spliced = SequenceBasics.rc(spliced)
    self.transcripts[record.value('name')] = spliced
def emit_paired_short_read(self, read_length):
    """Emit one sequence and derive a pair of fixed-length reads from it.

    The emitted sequence is passed through random_flip and, when
    self.gaussian_fragmentation is set, through random_fragment with a
    length drawn from random.gauss (truncated to int and floored at the
    configured 'minimum').  The left read is the first read_length bases
    of the fragment; the right read is the first read_length bases of
    its reverse complement.  Reads shorter than read_length are padded
    on the right with 'N'.

    Args:
        read_length: length of each returned read.

    Returns:
        [name, left_read, right_read].  Both reads are all 'N' when the
        drawn fragment length is 0.
    """
    name, seq = self.emit()
    oriented = random_flip(seq)

    fragment = oriented
    frag_cfg = self.gaussian_fragmentation
    if frag_cfg:
        floor = frag_cfg['minimum']
        drawn = int(random.gauss(frag_cfg['mu'], frag_cfg['sigma']))
        frag_len = max(floor, drawn)
        if frag_len == 0:
            return [name, 'N'*read_length, 'N'*read_length]
        fragment = random_fragment(oriented, frag_len)

    def leading_read(template):
        # Take the first read_length bases, padding with N when short.
        read = template[:read_length]
        shortfall = read_length - len(read)
        if shortfall > 0:
            read = read + 'N'*shortfall
        return read

    left = leading_read(fragment)
    right = leading_read(SequenceBasics.rc(fragment))
    return [name, left, right]
def get_sequence(self, ref_fasta_hash):
    """Return this entry's spliced sequence in upper case.

    Args:
        ref_fasta_hash: mapping of chromosome name to sequence.

    Returns:
        The exon slices of the entry's chromosome joined in order,
        reverse-complemented when the strand is '-', and upper-cased.
    """
    exon_pieces = []
    for idx in range(0, self.value('exonCount')):
        chrom_seq = ref_fasta_hash[self.value('chrom')]
        begin = self.value('exonStarts')[idx]
        end = self.value('exonEnds')[idx]
        exon_pieces.append(chrom_seq[begin:end])
    spliced = ''.join(exon_pieces)
    if self.value('strand') == '-':
        spliced = SequenceBasics.rc(spliced)
    return spliced.upper()
def random_flip(seq):
    """Return seq unchanged or reverse-complemented, chosen at random.

    A single random.random() draw below 0.5 keeps the input as is;
    otherwise SequenceBasics.rc(seq) is returned.
    """
    keep_orientation = random.random() < 0.5
    return seq if keep_orientation else SequenceBasics.rc(seq)
def convert_line(self, psl_line, query_sequence=None, quality_sequence=None):
    """Convert one PSL alignment line into a SAM-formatted line.

    The query sequence and qualities are taken from the arguments when
    given, otherwise from self.reads / self.qualities when those have
    been loaded; for '-' strand alignments the sequence is
    reverse-complemented and the qualities reversed.  Both are clipped
    to the aligned span of the query.  The CIGAR is built from the
    block layout: query gaps become 'I', target gaps larger than
    self.min_intron_size become 'N' (and are recorded as junctions),
    other target gaps become 'D', and blocks become 'M'.

    When self.ref_genome_set is true, the two intron bases on each
    side of every junction are read from self.ref_genome and compared
    with self.canonical / self.revcanonical to choose the XS:A tag.

    Args:
        psl_line: one PSL-formatted line.
        query_sequence: optional query sequence overriding self.reads.
        quality_sequence: optional quality string overriding
            self.qualities.

    Returns:
        The tab-delimited SAM line (no trailing newline), or False when
        the line cannot be parsed, has mismatched tStarts/blockSizes
        lengths, or has junctions of undetermined direction while
        self.skip_directionless_splice is set.
    """
    try:
        pe = PSLBasics.line_to_entry(psl_line)
    except Exception:
        # Deliberately best-effort: report the bad line and move on, but
        # no longer swallow KeyboardInterrupt / SystemExit.
        sys.stderr.write("Problem parsing line:\n"+psl_line.rstrip()+"\n")
        return False
    if len(pe['tStarts']) != len(pe['blockSizes']):
        sys.stderr.write("Warning invalid psl entry: "+pe['qName']+"\n")
        return False

    block_sizes = pe['blockSizes']
    blocks = len(block_sizes)
    starts = pe['qStarts']
    q_coord_start = starts[0]+1  # 1-based first aligned query base
    q_coord_end = starts[blocks-1]+block_sizes[blocks-1]  # 1-based last
    t_coord_start = pe['tStarts'][0]+1  # 1-based leftmost target base

    if pe['qName'] not in self.reads and self.reads_set is True:
        sys.stderr.write("Warning: qName "+pe['qName']+" was not found in reads\n")

    # 1. Query sequence and qualities, clipped to the aligned region.
    q_seq_trimmed = '*'
    if self.reads_set or query_sequence:
        q_seq_trimmed = query_sequence
        if not query_sequence:
            # Fall back to the loaded reads when none was passed in.
            q_seq_trimmed = self.reads[pe['qName']]
        if pe['strand'] == '-':
            q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
        q_seq_trimmed = q_seq_trimmed[q_coord_start-1:q_coord_end]
    qual_trimmed = '*'
    if self.qualities_set or quality_sequence:
        qual_trimmed = quality_sequence
        if not quality_sequence:
            qual_trimmed = self.qualities[pe['qName']]
        if pe['strand'] == '-':
            qual_trimmed = qual_trimmed[::-1]
        qual_trimmed = qual_trimmed[q_coord_start-1:q_coord_end]

    # 2. CIGAR string, from block offsets relative to the first block.
    qstarts = [x-pe['qStarts'][0] for x in pe['qStarts']]
    tstarts = [x-pe['tStarts'][0] for x in pe['tStarts']]
    query_index = 0
    target_index = 0
    junctions = []  # indexes of blocks that directly follow an intron
    cigar_ops = []
    for i in range(blocks):
        qdif = qstarts[i] - query_index
        tdif = tstarts[i] - target_index
        if qdif > 0:  # query bases absent from the target: insertion
            cigar_ops.append(str(qdif) + 'I')
        if tdif > self.min_intron_size:  # long target gap: intron
            cigar_ops.append(str(tdif) + 'N')
            junctions.append(i)
        elif tdif > 0:  # short target gap: deletion
            cigar_ops.append(str(tdif) + 'D')
        cigar_ops.append(str(block_sizes[i]) + 'M')
        query_index = qstarts[i]+block_sizes[i]
        target_index = tstarts[i]+block_sizes[i]
    cigar = ''.join(cigar_ops)

    # 3. Splice direction from the junction dinucleotides.
    spliceflag_set = False
    spliceflag = None
    if self.ref_genome_set:
        canon = 0
        revcanon = 0
        for i in junctions:
            # First two intron bases: right after the previous block.
            left_num = pe['tStarts'][i-1]+block_sizes[i-1]
            left_val = self.ref_genome[pe['tName']][left_num:left_num+2].upper()
            # Last two intron bases: right before block i, the block
            # that follows the junction.
            right_num = pe['tStarts'][i]-2
            right_val = self.ref_genome[pe['tName']][right_num:right_num+2].upper()
            junc = left_val + '-' + right_val
            if junc in self.canonical:
                canon += 1
            if junc in self.revcanonical:
                revcanon += 1
        if canon > revcanon:
            spliceflag_set = True
            spliceflag = '+'
        elif revcanon > canon:
            spliceflag_set = True
            spliceflag = '-'
    # Junctions whose direction cannot be determined are skipped when
    # the converter is configured to do so.
    if len(junctions) > 0 and self.skip_directionless_splice and not spliceflag_set:
        return False

    # 4. Assemble the SAM line.
    sam_fields = [
        pe['qName'],                              # 1. QNAME
        '16' if pe['strand'] == '-' else '0',     # 2. FLAG
        pe['tName'],                              # 3. RNAME
        str(t_coord_start),                       # 4. POS
        '0',                                      # 5. MAPQ
        cigar,                                    # 6. CIGAR
        '*',                                      # 7. RNEXT
        '0',                                      # 8. PNEXT
        '0',                                      # 9. TLEN
        q_seq_trimmed,                            # 10. SEQ
        qual_trimmed,                             # 11. QUAL
    ]
    if spliceflag_set:
        sam_fields.append('XS:A:'+spliceflag)
    if self.ref_genome_set:
        sam_fields.append('NH:i:'+str(self.mapping_counts[pe['qName']]))
    sam_fields.append('XC:i:'+str(len(junctions)))
    sam_fields.append('NM:i:0')
    return "\t".join(sam_fields)
def convert_line(self, psl_line, query_sequence=None, quality_sequence=None):
    """Translate a single PSL alignment line into a SAM line.

    Sequence and qualities come from the arguments when supplied,
    otherwise from self.reads / self.qualities when loaded.  On the '-'
    strand the sequence is reverse-complemented and the qualities are
    reversed; both are then clipped to the aligned query span.  CIGAR
    operations are derived from the block layout: 'I' for query gaps,
    'N' for target gaps larger than self.min_intron_size (recorded as
    junctions), 'D' for other target gaps, and 'M' for each block.

    With self.ref_genome_set true, the intron's first and last two
    bases at each junction are looked up in self.ref_genome and matched
    against self.canonical / self.revcanonical to pick the XS:A strand.

    Args:
        psl_line: one PSL-formatted line.
        query_sequence: optional query sequence overriding self.reads.
        quality_sequence: optional quality string overriding
            self.qualities.

    Returns:
        The tab-delimited SAM line without a newline, or False when the
        line fails to parse, when tStarts and blockSizes differ in
        length, or when junctions exist with no determinable direction
        and self.skip_directionless_splice is set.
    """
    try:
        pe = PSLBasics.line_to_entry(psl_line)
    except Exception:
        # Best-effort on malformed input; interpreter-exit exceptions
        # are intentionally not caught.
        sys.stderr.write("Problem parsing line:\n" + psl_line.rstrip() + "\n")
        return False
    if len(pe['tStarts']) != len(pe['blockSizes']):
        sys.stderr.write("Warning invalid psl entry: " + pe['qName'] + "\n")
        return False

    sizes = pe['blockSizes']
    blocks = len(sizes)
    starts = pe['qStarts']
    q_coord_start = starts[0] + 1  # 1-based first aligned query base
    q_coord_end = starts[blocks - 1] + sizes[blocks - 1]  # 1-based last
    t_coord_start = pe['tStarts'][0] + 1  # 1-based leftmost target base

    if pe['qName'] not in self.reads and self.reads_set is True:
        sys.stderr.write("Warning: qName " + pe['qName'] +
                         " was not found in reads\n")

    # 1. Query sequence and qualities, clipped to the aligned region.
    q_seq_trimmed = '*'
    if self.reads_set or query_sequence:
        q_seq_trimmed = query_sequence
        if not query_sequence:
            # Use the loaded reads when no sequence was passed in.
            q_seq_trimmed = self.reads[pe['qName']]
        if pe['strand'] == '-':
            q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
        q_seq_trimmed = q_seq_trimmed[q_coord_start - 1:q_coord_end]
    qual_trimmed = '*'
    if self.qualities_set or quality_sequence:
        qual_trimmed = quality_sequence
        if not quality_sequence:
            qual_trimmed = self.qualities[pe['qName']]
        if pe['strand'] == '-':
            qual_trimmed = qual_trimmed[::-1]
        qual_trimmed = qual_trimmed[q_coord_start - 1:q_coord_end]

    # 2. CIGAR string, from block offsets relative to the first block.
    qstarts = [x - pe['qStarts'][0] for x in pe['qStarts']]
    tstarts = [x - pe['tStarts'][0] for x in pe['tStarts']]
    query_index = 0
    target_index = 0
    junctions = []  # indexes of blocks that directly follow an intron
    ops = []
    for i in range(blocks):
        qdif = qstarts[i] - query_index
        tdif = tstarts[i] - target_index
        if qdif > 0:  # insertion relative to the target
            ops.append(str(qdif) + 'I')
        if tdif > self.min_intron_size:  # intron-sized target gap
            ops.append(str(tdif) + 'N')
            junctions.append(i)
        elif tdif > 0:  # deletion relative to the target
            ops.append(str(tdif) + 'D')
        ops.append(str(sizes[i]) + 'M')
        query_index = qstarts[i] + sizes[i]
        target_index = tstarts[i] + sizes[i]
    cigar = ''.join(ops)

    # 3. Splice direction from the junction dinucleotides.
    spliceflag_set = False
    spliceflag = None
    if self.ref_genome_set:
        canon = 0
        revcanon = 0
        for i in junctions:
            # Intron start: the two bases after the preceding block.
            left_num = pe['tStarts'][i - 1] + sizes[i - 1]
            left_val = self.ref_genome[pe['tName']][
                left_num:left_num + 2].upper()
            # Intron end: the two bases before block i, which is the
            # block that follows the junction.
            right_num = pe['tStarts'][i] - 2
            right_val = self.ref_genome[pe['tName']][
                right_num:right_num + 2].upper()
            junc = left_val + '-' + right_val
            if junc in self.canonical:
                canon += 1
            if junc in self.revcanonical:
                revcanon += 1
        if canon > revcanon:
            spliceflag_set = True
            spliceflag = '+'
        elif revcanon > canon:
            spliceflag_set = True
            spliceflag = '-'
    # Optionally drop spliced alignments whose direction is ambiguous.
    if (len(junctions) > 0 and self.skip_directionless_splice
            and not spliceflag_set):
        return False

    # 4. Assemble the SAM line.
    flag = '16' if pe['strand'] == '-' else '0'
    columns = [
        pe['qName'],         # 1. QNAME
        flag,                # 2. FLAG
        pe['tName'],         # 3. RNAME
        str(t_coord_start),  # 4. POS
        '0',                 # 5. MAPQ
        cigar,               # 6. CIGAR
        '*',                 # 7. RNEXT
        '0',                 # 8. PNEXT
        '0',                 # 9. TLEN
        q_seq_trimmed,       # 10. SEQ
        qual_trimmed,        # 11. QUAL
    ]
    if spliceflag_set:
        columns.append('XS:A:' + spliceflag)
    if self.ref_genome_set:
        columns.append('NH:i:' + str(self.mapping_counts[pe['qName']]))
    columns.append('XC:i:' + str(len(junctions)))
    columns.append('NM:i:0')
    return "\t".join(columns)