def node_is_similar(seq1, seq2):
    """Return True when the two node sequences are considered similar.

    Decision rules, in order:
      * if either sequence is empty            -> False
      * if both sequences have length <= 2     -> True
      * otherwise the shorter sequence is aligned against the longer one
        and the result is whether ``Aligner.align`` returned something
        other than None.

    The alignment thresholds depend on the module-level EXPECTED_ERR_RATE:
      * == 0 : min_score = 2 * len(shorter), min_len = len(shorter)
      * <  2 : min_score = int(0.80 * 2 * len(longer)),
               min_len   = int(0.9 * len(shorter))

    Raises:
        Exception: if EXPECTED_ERR_RATE is 2 or above (not implemented).
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0:
        return False
    if l1 <= 2 and l2 <= 2:
        return True
    if l1 < l2:
        # always make seq1 the longer one
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1

    o1 = Aligner(seq1, match=2, mismatch=5, gap_open=3, gap_extend=1,
                 report_secondary=False, report_cigar=False)

    if EXPECTED_ERR_RATE == 0:
        # error-free data: the whole shorter sequence must match perfectly
        res = o1.align(seq2, min_score=l2 * 2 * 1.0, min_len=l2 * 1.0)
    elif EXPECTED_ERR_RATE < 2:
        # score floor is relative to the longer sequence,
        # length floor is relative to the shorter one
        res = o1.align(seq2, min_score=int(l1 * 2 * .80), min_len=int(l2 * .9))
    else:
        # call form of raise: valid on both Python 2 and Python 3
        raise Exception(
            "Expected error rate not implemented for {0}% and above".format(
                EXPECTED_ERR_RATE))
    return res is not None
def node_is_similar(seq1, seq2):
    """Tell whether two node sequences are close enough to be called similar.

    An empty sequence on either side gives False; two sequences that are
    both at most 2 long give True. In every other case the shorter
    sequence is aligned against the longer one and the answer is whether
    ``Aligner.align`` returned something other than None.
    """
    len_a, len_b = len(seq1), len(seq2)
    if not len_a or not len_b:
        return False
    if max(len_a, len_b) <= 2:
        return True

    # The longer sequence acts as the reference; on a tie seq1 keeps that role.
    if len_a >= len_b:
        reference, query = seq1, seq2
    else:
        reference, query = seq2, seq1
    ref_len = max(len_a, len_b)
    query_len = min(len_a, len_b)

    aligner = Aligner(
        reference,
        match=2,
        mismatch=5,
        gap_open=3,
        gap_extend=1,
        report_secondary=False,
        report_cigar=False,
    )
    # Score floor: 80% of a perfect score over the longer sequence.
    # Length floor: 90% of the shorter sequence.
    score_floor = int(ref_len * 2 * .80)
    length_floor = int(query_len * .9)
    hit = aligner.align(query, min_score=score_floor, min_len=length_floor)
    return hit is not None
def filter_reads(readfile):
    """Keep the part of each FASTQ read that follows the tn sequence.

    Every record of *readfile* is aligned against the module-level
    ``tn_seq``. When an alignment is found and at least
    ``min_remaining_length`` characters remain after its query end, the
    remainder of the read (sequence and quality) is written as a FASTQ
    record to ``filtered_filename``. A summary line is printed at the end.

    Relies on the module-level names tn_seq, filtered_filename, min_score,
    min_match_length and min_remaining_length.
    """
    print("Filtering reads\n")
    ssw = Aligner(tn_seq)
    total = 0
    matched = 0
    # The input handle is managed by the same `with` as the output so it is
    # closed deterministically; the output is still opened first.
    with open(filtered_filename, 'w') as f, open(readfile) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            total += 1
            res = ssw.align(seq, min_score, min_match_length)
            if not res:
                continue
            # first position of the read after the aligned region
            end = res.query_end + 1
            if len(seq) - end >= min_remaining_length:
                matched += 1
                f.write('@%s\n%s\n+\n%s\n' % (title, seq[end:], qual[end:]))
    print("%s of %s read had the tn seq\n" % (matched, total))
def validate_reconstructed_seq(seq, orig):
    """
    seq --- the sequence that is reconstructed
    orig --- the original sequence

    Because the reconstructed seq can be longer, we don't care about
    deletions (they could just be exon skipping or minor base errors).
    We only care that there are NOT a lot of insertions, which would
    indicate an error in the bubble solution.

    Returns a tuple (is_valid, cigar_string):
      * (False, None)   if no alignment satisfies the thresholds
      * (False, cigar)  if any single insertion run is longer than 5
      * (True, cigar)   otherwise
    """
    o1 = Aligner(seq, match=2, mismatch=5, gap_open=3, gap_extend=1,
                 report_secondary=False, report_cigar=True)
    l2 = len(orig)
    # the whole original must align, with at least 90% of the perfect score
    res = o1.align(orig, min_score=l2 * 2 * .90, min_len=l2)
    if res is None:
        # no acceptable alignment: report failure instead of crashing on
        # the attribute access below
        return False, None
    for num, op in iter_cigar_string(res.cigar_string):
        if op == 'I' and num > 5:
            return False, res.cigar_string
    return True, res.cigar_string
def node_is_similar(seq1, seq2):
    """Return True when the two node sequences are considered similar.

    Empty input on either side yields False; two sequences that are both
    at most 2 long yield True. Otherwise the shorter sequence is aligned
    against the longer one, with thresholds chosen from the module-level
    EXPECTED_ERR_RATE, and the result is whether ``Aligner.align``
    returned something other than None.

    Thresholds:
      * EXPECTED_ERR_RATE == 0 : min_score = 2 * len(shorter),
                                 min_len   = len(shorter)
      * EXPECTED_ERR_RATE <  2 : min_score = int(0.80 * 2 * len(longer)),
                                 min_len   = int(0.9 * len(shorter))

    Raises:
        Exception: if EXPECTED_ERR_RATE is 2 or above (not implemented).
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0:
        return False
    if l1 <= 2 and l2 <= 2:
        return True
    if l1 < l2:
        # always make seq1 the longer one
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1

    o1 = Aligner(seq1, match=2, mismatch=5, gap_open=3, gap_extend=1,
                 report_secondary=False, report_cigar=False)

    if EXPECTED_ERR_RATE == 0:
        # the aligner object is `o1`; the original referenced `os` here
        res = o1.align(seq2, min_score=l2*2*1.0, min_len=l2*1.0)
    elif EXPECTED_ERR_RATE < 2:
        res = o1.align(seq2, min_score=int(l1*2*.80), min_len=int(l2*.9))
    else:
        # call form of raise: valid on both Python 2 and Python 3
        raise Exception(
            "Expected error rate not implemented for {0}% and above".format(
                EXPECTED_ERR_RATE))
    return res is not None
def align(opt):
    """Align every query sequence against one subject and write a SAM file.

    Steps:
      1. read the single fasta record in ``opt.subject`` (gzip is used when
         the file extension is "gz");
      2. count and lazily parse the records of ``opt.query`` (format
         ``opt.qtype``);
      3. build an ``Aligner`` on the subject with the scoring values taken
         from ``opt.match``, ``opt.mismatch``, ``opt.gap_open`` and
         ``opt.gap_extend``;
      4. write a SAM header followed by one line per reported query to
         "result.sam" in the current directory, printing progress at
         5% steps.

    A query with a valid alignment is written with flag 0, or 16 when the
    reported orientation is falsy. Queries without a valid alignment are
    written with flag 4 only when ``opt.unaligned`` is set; otherwise they
    are skipped. When ``opt.reverse`` is set, ``find_best_align`` chooses
    the alignment and the orientation.
    """
    print("Inport subject sequence")

    # Import fasta subject
    subject_is_gz = opt.subject.rpartition(".")[2].lower() == "gz"
    subject_open = gzip.open if subject_is_gz else open
    # NOTE(review): on Python 3, gzip.open(..., "r") is binary mode while the
    # SeqIO parsers expect text -- confirm whether "rt" is needed here.
    # SeqIO.read returns the whole record, so the handle can be closed at once.
    with subject_open(opt.subject, "r") as subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print("Inport query sequences and count the number of sequences")

    # Import query sequences
    query_is_gz = opt.query.rpartition(".")[2].lower() == "gz"
    nseq = count_seq(opt.query, opt.qtype, query_is_gz)
    query_open = gzip.open if query_is_gz else open

    # SeqIO.parse is lazy: the handle has to stay open until the alignment
    # loop is finished, hence everything below lives inside this `with`.
    with query_open(opt.query, "r") as query_handle:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Calculate a step list for the progress bar
        nseq_list = [int(nseq*i/100.0) for i in range(5, 101, 5)]

        print("Initialize ssw aligner with the subject sequence")
        # Init the an Aligner object with the reference value
        ssw = Aligner(
            str(subject.seq),
            match=int(opt.match),
            mismatch=int(opt.mismatch),
            gap_open=int(opt.gap_open),
            gap_extend=int(opt.gap_extend),
            report_secondary=False,
            report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
                opt.match, opt.mismatch, opt.gap_open, opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score, opt.min_len))

            print("Starting alignment of queries against the subject sequence")
            start = time()

            # Align each query along the subject an write result in a SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align(
                        ssw, query, float(opt.min_score), int(opt.min_len))
                else:
                    al, orient = ssw.align(
                        str(query.seq), float(opt.min_score), int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(sam_line(
                        qname=query.id,
                        flag=0 if orient else 16,
                        rname=subject.id,
                        pos=al.ref_begin+1,  # SAM positions are 1-based
                        cigar=al.cigar_string,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
                        tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(sam_line(
                        qname=query.id,
                        flag=4,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i += 1
                if i in nseq_list:
                    frac = i/float(nseq)
                    t = time()-start
                    # t/frac is the projected total time, so t/frac - t is what is left
                    print("{} sequences \t{}% \tRemaining time = {}s".format(
                        i, int(frac*100), round(t/frac-t, 2)))

            print("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
def _open_maybe_gz(path, gzipped):
    """Open *path* for reading, through gzip when *gzipped* is true."""
    # NOTE(review): on Python 3, gzip.open(..., "r") is binary mode while the
    # SeqIO parsers expect text -- confirm whether "rt" is needed here.
    if gzipped:
        return gzip.open(path, "r")
    return open(path, "r")


def align(opt):
    """Align every query sequence against one subject and write a SAM file.

    The subject is the single fasta record of ``opt.subject``; the queries
    are the records of ``opt.query`` in format ``opt.qtype``. Either file
    is read through gzip when its extension is "gz". Scoring values come
    from ``opt.match``, ``opt.mismatch``, ``opt.gap_open`` and
    ``opt.gap_extend``; ``opt.min_score`` and ``opt.min_len`` are passed to
    the aligner as filters.

    Output goes to "result.sam" in the current directory: a header, then
    one line per aligned query (flag 0, or 16 when the reported orientation
    is falsy) and, only if ``opt.unaligned`` is set, one line with flag 4
    per query without a valid alignment. Progress is printed at 5% steps.
    """
    print("Inport subject sequence")

    # Import fasta subject; the record is read entirely inside the `with`,
    # so the handle does not outlive this statement.
    subject_gz = opt.subject.rpartition(".")[2].lower() == "gz"
    with _open_maybe_gz(opt.subject, subject_gz) as subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print("Inport query sequences and count the number of sequences")

    # Import query sequences
    query_gz = opt.query.rpartition(".")[2].lower() == "gz"
    nseq = count_seq(opt.query, opt.qtype, query_gz)

    # The parser below reads lazily, so the query handle is kept open for
    # the whole alignment loop and closed when this `with` exits.
    with _open_maybe_gz(opt.query, query_gz) as query_handle:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Calculate a step list for the progress bar
        nseq_list = [int(nseq * i / 100.0) for i in range(5, 101, 5)]

        print("Initialize ssw aligner with the subject sequence")
        # Init the an Aligner object with the reference value
        ssw = Aligner(str(subject.seq),
                      match=int(opt.match),
                      mismatch=int(opt.mismatch),
                      gap_open=int(opt.gap_open),
                      gap_extend=int(opt.gap_extend),
                      report_secondary=False,
                      report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write(
                "@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n"
                .format(opt.match, opt.mismatch, opt.gap_open, opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score, opt.min_len))

            print("Starting alignment of queries against the subject sequence")
            start = time()

            # Align each query along the subject an write result in a SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align(ssw, query,
                                                 float(opt.min_score),
                                                 int(opt.min_len))
                else:
                    al, orient = ssw.align(str(query.seq),
                                           float(opt.min_score),
                                           int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(
                        sam_line(
                            qname=query.id,
                            flag=0 if orient else 16,
                            rname=subject.id,
                            pos=al.ref_begin + 1,  # SAM positions are 1-based
                            cigar=al.cigar_string,
                            seq=str(query.seq),
                            qual=SeqIO.QualityIO._get_sanger_quality_str(query)
                            if opt.qtype == "fastq" else "*",
                            tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(
                        sam_line(
                            qname=query.id,
                            flag=4,
                            seq=str(query.seq),
                            qual=SeqIO.QualityIO._get_sanger_quality_str(query)
                            if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i += 1
                if i in nseq_list:
                    frac = i / float(nseq)
                    t = time() - start
                    # projected total time minus elapsed time
                    print("{} sequences \t{}% \tRemaining time = {}s".format(
                        i, int(frac * 100), round(t / frac - t, 2)))

            print("\n{} Sequences processed in {}s".format(
                i, round(time() - start, 2)))
from ssw_wrap import Aligner

# Demo: locate the stretch of a query that aligns to a reference of 200 "A".
polyA = Aligner(
    "A" * 200,
    match=5,
    mismatch=3,
    gap_open=4,
    gap_extend=1,
    report_secondary=False,
    report_cigar=True,
)

query_seq = "CTACGTAGCTAGCTAGCTATGCTAGCTGATGCTAGCTGTGTAAAAAAAAAAAAAAGAAAAAATTTAAAAAAAACGTGCTAGCTGTGCTATTAGCTAGTCGTGGCTAGTGTAC"

result = polyA.align(query_seq, min_score=20, min_len=20)
begin, end = result.query_begin, result.query_end

# First line: the aligned slice (query_end is inclusive), shifted right so it
# sits under the matching columns of the full query printed on the second line.
print("".join([" " * begin, query_seq[begin:end + 1], "\n", query_seq]))