def node_is_similar(seq1, seq2):
    """Return True when the two sequences align well enough to count as similar.

    seq1, seq2 --- the two sequences to compare (order does not matter;
                   the longer one is always used as the aligner reference)

    Returns False if either sequence is empty, True if both have length
    <= 2 (no alignment is attempted), otherwise whether the aligner
    reported a hit under the thresholds selected by EXPECTED_ERR_RATE.

    Raises Exception if EXPECTED_ERR_RATE is 2 or above (not implemented).
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0:
        return False
    if l1 <= 2 and l2 <= 2:
        return True
    if l1 < l2:
        # always make seq1 the longer one
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1
    o1 = Aligner(seq1, match=2, mismatch=5, gap_open=3, gap_extend=1,
                 report_secondary=False, report_cigar=False)
    # require that the whole (shorter) seq2 must be aligned
    # NOTE(review): the original comment says "approx 90% accuracy", but the
    # non-zero error-rate branch scores against 80% of the LONGER length.
    if EXPECTED_ERR_RATE == 0:
        # perfect match required over the full shorter sequence
        res = o1.align(seq2, min_score=l2 * 2 * 1.0, min_len=l2 * 1.0)
    elif EXPECTED_ERR_RATE < 2:
        res = o1.align(seq2, min_score=int(l1 * 2 * .80),
                       min_len=int(l2 * .9))
    else:
        # Call form of raise: valid on both Python 2 and Python 3
        raise Exception(
            "Expected error rate not implemented for {0}% and above".format(
                EXPECTED_ERR_RATE))
    return res is not None
def node_is_similar(seq1, seq2):
    """Tell whether two sequences align closely enough to be called similar.

    Empty input on either side is never similar; two sequences of at most
    two characters each are always similar (no alignment is run). In every
    other case the shorter sequence is aligned against the longer one and
    the answer is whether the aligner returned a result.
    """
    len_a = len(seq1)
    len_b = len(seq2)

    # Guard clauses for the trivial cases.
    if len_a == 0 or len_b == 0:
        return False
    if max(len_a, len_b) <= 2:
        return True

    # The longer sequence becomes the reference, the shorter one the query.
    if len_a >= len_b:
        reference, query = seq1, seq2
        ref_len, query_len = len_a, len_b
    else:
        reference, query = seq2, seq1
        ref_len, query_len = len_b, len_a

    aligner = Aligner(
        reference,
        match=2,
        mismatch=5,
        gap_open=3,
        gap_extend=1,
        report_secondary=False,
        report_cigar=False,
    )

    # Score floor is 80% of the best score for the reference length;
    # at least 90% of the query must take part in the alignment.
    score_floor = int(ref_len * 2 * .80)
    length_floor = int(query_len * .9)
    hit = aligner.align(query, min_score=score_floor, min_len=length_floor)
    return hit is not None
def filter_reads(readfile):
    """Keep the part of each read that follows a match to ``tn_seq``.

    readfile --- path to the FASTQ file to scan

    Each read is aligned against the module-level ``tn_seq``. When the
    aligner reports a hit and at least ``min_remaining_length`` bases
    remain after the end of the hit, the remaining sequence and quality
    string are written as a FASTQ record to ``filtered_filename``.
    A summary count is printed at the end. Returns None.
    """
    print("Filtering reads\n")
    ssw = Aligner(tn_seq)
    total = 0
    matched = 0
    # The output is opened first, as before, so it is still created/truncated
    # even when readfile cannot be opened. The input handle is now managed by
    # the same `with`, so it is closed on exit instead of being leaked.
    with open(filtered_filename, 'w') as f, open(readfile) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            total += 1
            res = ssw.align(seq, min_score, min_match_length)
            if res:
                # first position after the aligned region in the read
                end = res.query_end + 1
                if len(seq) - end >= min_remaining_length:
                    matched += 1
                    f.write('@%s\n%s\n+\n%s\n' % (title, seq[end:], qual[end:]))
    print("%s of %s read had the tn seq\n" % (matched, total))
def test():
    """Build a paired-end FastqFilterPP from hard-coded paths and print its parts.

    Constructs an optional QualityFilter and an optional AdapterTrimmer
    (backed by a default Aligner), passes them to FastqFilterPP together
    with the two FASTQ paths, and prints the repr of every object.
    Takes no arguments and returns None.
    """
    filter_quality = True
    min_qual = 25
    trim_adapter = True
    adapter_path = "/home/adrien/Programming/Python/IsFinder/test/adapter.fa"
    R1 = "/home/adrien/Programming/Python/IsFinder/test/fastq/all_AAV_R1.fastq.gz"
    R2 = "/home/adrien/Programming/Python/IsFinder/test/fastq/all_AAV_R2.fastq.gz"

    # Define a quality filter object
    if filter_quality:
        q_filter = QualityFilter(min_qual)
    else:
        q_filter = None

    # Define an adapter trimmer object
    if trim_adapter:
        a = Aligner()
        trimmer = AdapterTrimmer(a, adapter_path)
    else:
        # Bind `a` as well: it is printed below regardless of trim_adapter,
        # and was previously unbound on this path (NameError).
        a = None
        trimmer = None

    # Define the global fastq filter
    F = FastqFilterPP(R1, R2, quality_filter=q_filter,
                      adapter_trimmer=trimmer, input_qual="fastq-sanger")

    print(repr(F))
    print(repr(q_filter))
    print(repr(a))
    print(repr(trimmer))
def test2():
    """Run FastqFilter on two hard-coded FASTQ paths and print the outcome.

    Constructs an optional QualityFilter and an optional AdapterTrimmer
    (backed by an Aligner created with report_cigar=True), wraps them in a
    FastqFilter, prints the value returned by its ``filter`` method for the
    R1/R2 paths, then prints the repr of each helper object.
    Takes no arguments and returns None.
    """
    filter_quality = True
    min_qual = 25
    trim_adapter = True
    adapter_path = "/home/adrien/Programming/Python/IsFinder/test/adapter.fa"
    R1_path = "/home/adrien/Programming/Python/IsFinder/test/fastq/all_AAV_R1.fastq.gz"
    R2_path = "/home/adrien/Programming/Python/IsFinder/test/fastq/all_AAV_R2.fastq.gz"

    # Define a quality filter object
    if filter_quality:
        q_filter = QualityFilter(min_qual)
    else:
        q_filter = None

    # Define an adapter trimmer object
    if trim_adapter:
        a = Aligner(report_cigar=True)
        trimmer = AdapterTrimmer(a, adapter_path)
    else:
        # Bind `a` as well: it is printed below regardless of trim_adapter,
        # and was previously unbound on this path (NameError).
        a = None
        trimmer = None

    # Define the global fastq filter
    f_filter = FastqFilter(q_filter, trimmer)

    print(f_filter.filter(R1_path, R2_path))
    print(repr(q_filter))
    print(repr(a))
    print(repr(trimmer))
def validate_reconstructed_seq(seq, orig):
    """
    seq --- the sequence that is reconstructed
    orig --- the original sequence

    Because the reconstructed seq can be longer, we don't care about
    deletions (deletions could just be exon skipping or minor base errors).
    We only care that there is NOT a lot of insertions (which would
    indicate an error in the bubble solution).

    Returns a tuple (is_valid, cigar_string):
      * (False, None) when the aligner returns no alignment for ``orig``
        against ``seq`` under the score/length thresholds
      * (False, cigar_string) when any single insertion run is longer than 5
      * (True, cigar_string) otherwise
    """
    o1 = Aligner(seq, match=2, mismatch=5, gap_open=3, gap_extend=1,
                 report_secondary=False, report_cigar=True)
    l2 = len(orig)
    # the whole of orig must align, scoring at least 90% of a perfect match
    res = o1.align(orig, min_score=l2 * 2 * .90, min_len=l2)
    if res is None:
        # NOTE(review): presumably align() returns None when the thresholds
        # are not met; previously this crashed on res.cigar_string.
        return False, None
    # `op` rather than `type`, to avoid shadowing the builtin
    for num, op in iter_cigar_string(res.cigar_string):
        if op == 'I' and num > 5:
            return False, res.cigar_string
    return True, res.cigar_string
def node_is_similar(seq1, seq2):
    """Return True when the two sequences align well enough to count as similar.

    seq1, seq2 --- the two sequences to compare (order does not matter;
                   the longer one is always used as the aligner reference)

    Returns False if either sequence is empty, True if both have length
    <= 2 (no alignment is attempted), otherwise whether the aligner
    reported a hit under the thresholds selected by EXPECTED_ERR_RATE.

    Raises Exception if EXPECTED_ERR_RATE is 2 or above (not implemented).
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0:
        return False
    if l1 <= 2 and l2 <= 2:
        return True
    if l1 < l2:
        # always make seq1 the longer one
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1
    o1 = Aligner(seq1, match=2, mismatch=5, gap_open=3, gap_extend=1,
                 report_secondary=False, report_cigar=False)
    # require that the whole (shorter) seq2 must be aligned
    # NOTE(review): the original comment says "approx 90% accuracy", but the
    # non-zero error-rate branch scores against 80% of the LONGER length.
    if EXPECTED_ERR_RATE == 0:
        # Fixed: this branch called `os.align`, a typo for the aligner `o1`
        res = o1.align(seq2, min_score=l2*2*1.0, min_len=l2*1.0)
    elif EXPECTED_ERR_RATE < 2:
        res = o1.align(seq2, min_score=int(l1*2*.80), min_len=int(l2*.9))
    else:
        # Call form of raise: valid on both Python 2 and Python 3
        raise Exception(
            "Expected error rate not implemented for {0}% and above".format(
                EXPECTED_ERR_RATE))
    return res is not None
def align (opt):
    """Align every query sequence against one subject and write ``result.sam``.

    opt --- options object; the attributes read here are ``subject`` and
            ``query`` (file paths, gzip-compressed when the extension is
            "gz"), ``qtype`` (query format name passed to SeqIO),
            ``match``, ``mismatch``, ``gap_open``, ``gap_extend``,
            ``min_score``, ``min_len``, ``reverse`` and ``unaligned``.

    The subject is read as a single fasta record. Each query is aligned
    with an Aligner built on the subject; hits are written as SAM lines
    (flag 0, or 16 when the reported orientation is false), and queries
    without a hit are written with flag 4 only when ``opt.unaligned`` is
    set. Progress is printed every 5% of the queries. Returns None.
    """
    print ("Inport subject sequence")

    # Import fasta subject
    if opt.subject.rpartition(".")[2].lower() == "gz":
        subject_handle = gzip.open(opt.subject, "r")
    else:
        subject_handle = open(opt.subject, "r")
    # SeqIO.read loads the single record, so the handle can be closed
    # right away (it was previously never closed)
    with subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print ("Inport query sequences and count the number of sequences")

    # Import query sequences
    if opt.query.rpartition(".")[2].lower() == "gz":
        nseq = count_seq(opt.query, opt.qtype, True)
        query_handle = gzip.open(opt.query, "r")
    else:
        nseq = count_seq(opt.query, opt.qtype, False)
        query_handle = open(opt.query, "r")

    # The query generator is consumed in the loop below, so the handle must
    # stay open until the end; the `with` closes it on success and on error
    with query_handle:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Calculate a step list for the progress bar
        nseq_list = [int(nseq*i/100.0) for i in range(5,101,5)]

        print ("Initialize ssw aligner with the subject sequence")

        # Init an Aligner object with the reference value
        ssw = Aligner(
            str(subject.seq),
            match=int(opt.match),
            mismatch=int(opt.mismatch),
            gap_open=int(opt.gap_open),
            gap_extend= int(opt.gap_extend),
            report_secondary=False,
            report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
                opt.match,
                opt.mismatch,
                opt.gap_open,
                opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score,
                opt.min_len))

            print ("Starting alignment of queries against the subject sequence")
            start = time()

            # Align each query along the subject and write result in a SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align (ssw, query, float(opt.min_score), int(opt.min_len))
                else:
                    al, orient = ssw.align(str(query.seq), float(opt.min_score), int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(sam_line(
                        qname=query.id,
                        flag=0 if orient else 16,
                        rname=subject.id,
                        # presumably converts a 0-based ref_begin to the
                        # 1-based SAM position; verify against Aligner
                        pos=al.ref_begin+1,
                        cigar=al.cigar_string,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
                        tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(sam_line(
                        qname=query.id,
                        flag=4,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i+=1
                if i in nseq_list:
                    frac = i/float(nseq)
                    t = time()-start
                    # remaining time = projected total (t/frac) minus elapsed
                    print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))

            print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
def align(opt):
    """Align every query sequence against one subject and write ``result.sam``.

    opt --- options object; the attributes read here are ``subject`` and
            ``query`` (file paths, gzip-compressed when the extension is
            "gz"), ``qtype`` (query format name passed to SeqIO),
            ``match``, ``mismatch``, ``gap_open``, ``gap_extend``,
            ``min_score``, ``min_len``, ``reverse`` and ``unaligned``.

    The subject is read as a single fasta record. Each query is aligned
    with an Aligner built on the subject; hits are written as SAM lines
    (flag 0, or 16 when the reported orientation is false), and queries
    without a hit are written with flag 4 only when ``opt.unaligned`` is
    set. Progress is printed every 5% of the queries. Returns None.
    """
    print("Inport subject sequence")

    # Import fasta subject
    if opt.subject.rpartition(".")[2].lower() == "gz":
        subject_handle = gzip.open(opt.subject, "r")
    else:
        subject_handle = open(opt.subject, "r")
    # SeqIO.read loads the single record, so the handle can be closed
    # right away (it was previously never closed)
    with subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print("Inport query sequences and count the number of sequences")

    # Import query sequences
    if opt.query.rpartition(".")[2].lower() == "gz":
        nseq = count_seq(opt.query, opt.qtype, True)
        query_handle = gzip.open(opt.query, "r")
    else:
        nseq = count_seq(opt.query, opt.qtype, False)
        query_handle = open(opt.query, "r")

    # The query generator is consumed in the loop below, so the handle must
    # stay open until the end; the `with` closes it on success and on error
    with query_handle:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Calculate a step list for the progress bar
        nseq_list = [int(nseq * i / 100.0) for i in range(5, 101, 5)]

        print("Initialize ssw aligner with the subject sequence")

        # Init an Aligner object with the reference value
        ssw = Aligner(str(subject.seq),
                      match=int(opt.match),
                      mismatch=int(opt.mismatch),
                      gap_open=int(opt.gap_open),
                      gap_extend=int(opt.gap_extend),
                      report_secondary=False,
                      report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id,
                                                 len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write(
                "@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n"
                .format(opt.match, opt.mismatch, opt.gap_open,
                        opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score, opt.min_len))

            print("Starting alignment of queries against the subject sequence")
            start = time()

            # Align each query along the subject and write result in a SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align(ssw, query,
                                                 float(opt.min_score),
                                                 int(opt.min_len))
                else:
                    al, orient = ssw.align(str(query.seq),
                                           float(opt.min_score),
                                           int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(
                        sam_line(
                            qname=query.id,
                            flag=0 if orient else 16,
                            rname=subject.id,
                            # presumably converts a 0-based ref_begin to the
                            # 1-based SAM position; verify against Aligner
                            pos=al.ref_begin + 1,
                            cigar=al.cigar_string,
                            seq=str(query.seq),
                            qual=SeqIO.QualityIO._get_sanger_quality_str(query)
                            if opt.qtype == "fastq" else "*",
                            tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(
                        sam_line(
                            qname=query.id,
                            flag=4,
                            seq=str(query.seq),
                            qual=SeqIO.QualityIO._get_sanger_quality_str(query)
                            if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i += 1
                if i in nseq_list:
                    frac = i / float(nseq)
                    t = time() - start
                    # remaining time = projected total (t/frac) minus elapsed
                    print("{} sequences \t{}% \tRemaining time = {}s".format(
                        i, int(frac * 100), round(t / frac - t, 2)))

            print("\n{} Sequences processed in {}s".format(
                i, round(time() - start, 2)))
from ssw_wrap import Aligner

# Demo: locate a poly-A stretch in a read by aligning it against a
# 200-base run of "A" used as the reference.
polyA = Aligner(
    "A" * 200,
    match=5,
    mismatch=3,
    gap_open=4,
    gap_extend=1,
    report_secondary=False,
    report_cigar=True,
)

query_seq = "CTACGTAGCTAGCTAGCTATGCTAGCTGATGCTAGCTGTGTAAAAAAAAAAAAAAGAAAAAATTTAAAAAAAACGTGCTAGCTGTGCTATTAGCTAGTCGTGGCTAGTGTAC"

result = polyA.align(query_seq, min_score=20, min_len=20)

# Bounds of the aligned region within the query (end is inclusive,
# hence the +1 in the slice below).
begin, end = result.query_begin, result.query_end

# First line: the aligned slice, padded so it sits under its position in
# the full query printed on the second line.
print("{}{}\n{}".format(" " * begin, query_seq[begin:end + 1], query_seq))
from csv import DictReader from Bio import SeqIO from ssw_wrap import Aligner MIN_BARCODE_MATCH_LEN = 10 # have to at least see 10 of the 16 bp aligned MIN_BARCODE_MATCH_SCORE = 20 MIN_BARCODE_SCORE_LEAD = 10 barcodes = ['atgacgcatcgtctga', 'gcagagtcatgtatag', 'gagtgctactctagta', 'catgtactgatacaca'] for i in xrange(4): barcodes[i] = barcodes[i].upper() aligners = [Aligner(barcodes[i], match=2, mismatch=5, gap_open=3, gap_extend=1, report_secondary=False, report_cigar=True) for i in xrange(4)] def main(ccs_fasta, flnc_fasta, primer_csv): good_flnc = [] d = {} reader = DictReader(open(primer_csv),delimiter=',') for r in reader: zmw = r['id'][:r['id'].rfind('/')] d[zmw] = r flog = open(flnc_fasta+'.filtered.log', 'w') flog.write("flnc_id\tthreelen\tthreeseq\tscore0\tscore1\tscore2\tscore3\n") for r in SeqIO.parse(open(ccs_fasta), 'fasta'): zmw = r.id[:r.id.rfind('/')] if zmw not in d: continue # discarded short sequences rec = d[zmw] cands = [] # list of (barcode i, score, end-start) if rec['threeseen']=='1' and rec['fiveseen']=='1' and rec['polyAseen']=='1' and rec['chimera']=='0': # is FLNC