def __init__(self, pairQ2, overlap_baseNumber=10):
    """Store a pair of FASTQ records and the overlap threshold.

    pairQ2 -- indexable holding the two reads; element 0 is stored as
              given, element 1 is passed through
              translator.reverse_complement_FastQ before being stored.
    overlap_baseNumber -- stored as self._overlapNumber (default 10).
    """
    read_one = pairQ2[0]
    read_two = pairQ2[1]

    self._merged = []
    self._fastQ1 = read_one
    # from now on, fastQ are sense strand
    self._fastQ2 = translator.reverse_complement_FastQ(read_two)
    self._overlapNumber = overlap_baseNumber
    self._assemble_seq = ""
def concentrate2single(infilepath, outputfilename):
    """Concatenate every ``*.fastq`` file of a directory into one file.

    For each file in *infilepath* whose name ends in ``.fastq``:

    * each header row is replaced by ``'@' + ID`` where ID is the first
      regex match of four digits flanked by non-digit characters;
    * every row is written to *outputfilename*;
    * a FASTA entry (original header text plus the second row of the
      file) is written to a sibling ``.fasta`` file;
    * the last row is passed through ``covert_Qscore``;
    * if a header contains one of the reverse-read tags, the record is
      passed through ``translator.reverse_complement_FastQ``.

    infilepath     -- directory that is scanned with os.listdir.
    outputfilename -- path of the combined FASTQ file to create; the
                      FASTA file uses the same path with a trailing
                      ``.fastq`` replaced by ``.fasta``.

    Returns a dict mapping each ID to the list of records (lists of
    rows) collected under it.

    Raises ValueError when a ``.fastq`` file has no usable header/ID or
    fewer than two rows.
    """
    fastq_suffix = '.fastq'
    # str.rstrip('.fastq') strips a *set of characters*, not a suffix
    # ("sample_qa.fastq" would become "sample_"), so cut the suffix
    # explicitly instead.
    if outputfilename.endswith(fastq_suffix):
        fasta_stem = outputfilename[:-len(fastq_suffix)]
    else:
        fasta_stem = outputfilename
    raw_single_fasta_name = fasta_stem + '.fasta'
    combinedfile_name = outputfilename
    all_fastq = {}

    # Both outputs are context-managed so they are closed even when a
    # malformed input file raises part-way through.
    with open(raw_single_fasta_name, 'wb') as raw_single_fasta_file:
        print("concentrate2single name is" + combinedfile_name)
        with open(combinedfile_name, 'wb') as combinedfile:
            print(outputfilename)
            for filename in os.listdir(infilepath):
                if not filename.endswith(fastq_suffix):
                    continue
                record_id, single_fastq = _collect_single_fastq(
                    os.path.join(infilepath, filename),
                    combinedfile,
                    raw_single_fasta_file)
                all_fastq.setdefault(record_id, []).append(single_fastq)

    print("All the fastqs were concentrated into a single file ------%s. "
          "And the ID was extracted." % combinedfile_name)
    return all_fastq


def _collect_single_fastq(fastq_path, combinedfile, raw_single_fasta_file):
    """Process one FASTQ file; return ``(ID, rows)`` for its record."""
    reverse_tags = ("QB6179", "QB5506", "QB6178", "Rev", "rev")
    single_fastq = []
    flag_reverse = False
    record_id = None
    fasta_output = None
    # Rows seen since the current header (None until the first header).
    rows_since_header = None

    with open(fastq_path) as f:
        for row in f:
            row = row.strip('\n')
            if rows_since_header is not None:
                rows_since_header += 1
            # The third row after a header is the quality string, which
            # may itself begin with '@'; it must not be taken for a
            # header nor have its leading '@' characters stripped.
            if row.startswith('@') and rows_since_header != 3:
                rows_since_header = 0
                fasta_output = '\n>' + row.lstrip('@') + '\n'
                if any(tag in row for tag in reverse_tags):
                    flag_reverse = True
                id_match = re.search(r'\D(\d{4})\D', row)
                if id_match is None:
                    raise ValueError(
                        "no 4-digit ID found in header %r of %s"
                        % (row, fastq_path))
                # NOTE(review): group(0) keeps the two flanking
                # non-digit characters as part of the ID; group(1)
                # would be the bare digits. Kept as-is because callers
                # may rely on the current keys.
                record_id = id_match.group(0)
                row = '@' + record_id
                single_fastq.append(row.lstrip('@'))
            else:
                single_fastq.append(row)
            combinedfile.write(row + '\n')

    if record_id is None or len(single_fastq) < 2:
        raise ValueError(
            "%s does not contain a FASTQ header and sequence" % fastq_path)

    single_fastq[-1] = covert_Qscore(single_fastq[-1])
    fasta_output += single_fastq[1] + '\n'
    raw_single_fasta_file.write(fasta_output)

    if flag_reverse:
        try:
            single_fastq = translator.reverse_complement_FastQ(single_fastq)
        except Exception:
            # Best effort: keep the record un-reversed and show it.
            print(single_fastq)

    return record_id, single_fastq
def anneal_1direction(self, inputfastq1, inputfastq2):
    """Assemble two FASTQ records and return the assembled sequence.

    inputfastq1, inputfastq2 -- indexable records whose element 1 is the
                                sequence string.

    Short-circuits (self._assemble_seq is left untouched in these cases):
      * both sequences empty -> returns ' ' (a single space);
      * first sequence empty, not longer than self._overlapNumber, or
        contained in the second -> returns the second sequence;
      * the symmetric case -> returns the first sequence.

    Otherwise self.find_overlap is tried on (1, 2), (2, 1) and
    (1, reverse-complement of 2); the assembly belonging to the longest
    overlap (first one on ties) is stored in self._assemble_seq and
    returned. If no candidate has a non-empty overlap the result is ''.
    """
    print("$$$$$$")
    print(inputfastq1)
    print(inputfastq2)
    firstSeq = inputfastq1[1]
    secondSeq = inputfastq2[1]
    print(len(firstSeq))
    print(len(secondSeq))

    if not firstSeq and not secondSeq:
        print(" firstSeq and not secondSeq empty")
        return ' '
    if (not firstSeq or len(firstSeq) <= self._overlapNumber
            or firstSeq in secondSeq):
        print("lif not firstSeq or len(firstSeq) <= self._overlapNumber")
        return secondSeq
    if (not secondSeq or len(secondSeq) <= self._overlapNumber
            or secondSeq in firstSeq):
        print("firtseq> 2")
        return firstSeq

    couple_list = []
    print("-------- caculate assemb1--------")
    (overlap_seq1, assembleSeq1) = self.find_overlap(inputfastq1, inputfastq2)
    couple_list.append((overlap_seq1, assembleSeq1))

    print("-------- caculate assemb2--------")
    (overlap_seq2, assembleSeq2) = self.find_overlap(inputfastq2, inputfastq1)
    couple_list.append((overlap_seq2, assembleSeq2))

    print("-------- caculate assemb3--------")
    antiSense_fastQ2 = translator.reverse_complement_FastQ(inputfastq2)
    (overlap_seq3, assembleSeq3) = self.find_overlap(inputfastq1,
                                                     antiSense_fastQ2)
    couple_list.append((overlap_seq3, assembleSeq3))

    longest_overlap = ('', '')
    print(couple_list)
    for tmp_couple in couple_list:
        # Compare overlap against overlap. Comparing against
        # len(longest_overlap) measured the 2-tuple itself (always 2),
        # which kept the *last* candidate with an overlap longer than 2
        # instead of the longest one.
        if len(tmp_couple[0]) > len(longest_overlap[0]):
            longest_overlap = tmp_couple

    assembled_fasta = longest_overlap[1]
    print("assembled_fasta: " + assembled_fasta)
    self._assemble_seq = assembled_fasta
    return assembled_fasta
def anneal2fastq(self):
    """Assemble the two stored reads over several trimming cut-offs.

    Read 1 is trimmed with TrimEnds at each cut-off in turn, every pass
    starting from the previous pass's output; read 2 is used as stored.
    For each cut-off either a short-circuit candidate is recorded (one
    read empty, not longer than self._overlapNumber, or contained in the
    other) or self.find_overlap is run on five orientation pairs.

    A candidate is a tuple (overlap, assembly, mismatch count). Only
    candidates whose first element is longer than 200 characters are
    eligible; among those the longest assembly wins (first on ties).

    The winner is stored in self._overlap_seq, self._assemble_seq and
    self._mismatch_count (('', '', 0) when nothing qualifies).
    Returns None.
    """
    candidates = []

    def record_candidate(left, right, quality_cut_off):
        # Unpack exactly three values, as a guard on find_overlap's shape.
        overlap, assembly, mismatches = self.find_overlap(
            left, right, quality_cut_off)
        candidates.append((overlap, assembly, mismatches))

    read1 = self._fastQ1
    read2 = self._fastQ2
    seq2 = read2[1]

    for cut_off in [15, 20, 25, 30, 35, 40]:
        trimmer = TrimEnds.TrimEnds(read1)
        trimmer.trimEnds(minimum_quality=cut_off)
        read1 = trimmer.output_trimed_fastq()
        seq1 = read1[1]

        if not seq1 and not seq2:
            print(" firstSeq and not secondSeq empty")
            candidates.append(('', '', 0))
            continue
        if (not seq1 or len(seq1) <= self._overlapNumber
                or seq1 in seq2):
            print("lif not firstSeq or len(firstSeq) <= self._overlapNumber")
            candidates.append((seq2, seq2, 0))
            continue
        if (not seq2 or len(seq2) <= self._overlapNumber
                or seq2 in seq1):
            print("firtseq> 2")
            candidates.append((seq1, seq1, 0))
            continue

        # 1) read1 -> read2, 2) read2 -> read1
        record_candidate(read1, read2, cut_off)
        record_candidate(read2, read1, cut_off)
        # 3) read1 -> reverse complement of read2
        rc_read2 = translator.reverse_complement_FastQ(read2)
        record_candidate(read1, rc_read2, cut_off)
        # 4) and 5) both reads reverse-complemented, in either order
        rc_read1 = translator.reverse_complement_FastQ(read1)
        record_candidate(rc_read2, rc_read1, cut_off)
        record_candidate(rc_read1, rc_read2, cut_off)

    best = ('', '', 0)
    for candidate in candidates:
        if len(candidate[0]) <= 200:
            continue
        if len(candidate[1]) > len(best[1]):
            best = candidate

    self._overlap_seq, self._assemble_seq, self._mismatch_count = best
    return
def anneal2fastq(self):
    """Assemble the two stored reads after quality trimming.

    Both self._fastQ1 and self._fastQ2 are trimmed with TrimEnds at a
    minimum quality of 20. Either a short-circuit candidate is recorded
    (one read empty, not longer than self._overlapNumber, or contained
    in the other) or self.find_overlap is run on five orientation pairs.

    A candidate is a tuple (overlap, assembly). Only candidates whose
    first element is longer than 200 characters are eligible; among
    those the longest assembly wins (first on ties).

    The winning assembly ('' when nothing qualifies) is stored in
    self._assemble_seq. Returns None.
    """
    candidates = []

    def trimmed(read, quality_cut_off):
        trimmer = TrimEnds.TrimEnds(read)
        trimmer.trimEnds(minimum_quality=quality_cut_off)
        return trimmer.output_trimed_fastq()

    def record_candidate(left, right):
        # Unpack exactly two values, as a guard on find_overlap's shape.
        overlap, assembly = self.find_overlap(left, right)
        candidates.append((overlap, assembly))

    for cut_off in [20]:
        read1 = trimmed(self._fastQ1, cut_off)
        seq1 = read1[1]
        read2 = trimmed(self._fastQ2, cut_off)
        seq2 = read2[1]

        if not seq1 and not seq2:
            print(" firstSeq and not secondSeq empty")
            candidates.append(('', ''))
            continue
        if (not seq1 or len(seq1) <= self._overlapNumber
                or seq1 in seq2):
            print("lif not firstSeq or len(firstSeq) <= self._overlapNumber")
            candidates.append((seq2, seq2))
            continue
        if (not seq2 or len(seq2) <= self._overlapNumber
                or seq2 in seq1):
            print("firtseq> 2")
            candidates.append((seq1, seq1))
            continue

        # 1) read1 -> read2, 2) read2 -> read1
        record_candidate(read1, read2)
        record_candidate(read2, read1)
        # 3) read1 -> reverse complement of read2
        rc_read2 = translator.reverse_complement_FastQ(read2)
        record_candidate(read1, rc_read2)
        # 4) and 5) both reads reverse-complemented, in either order
        rc_read1 = translator.reverse_complement_FastQ(read1)
        record_candidate(rc_read2, rc_read1)
        record_candidate(rc_read1, rc_read2)

    best = ('', '')
    print(candidates)
    for candidate in candidates:
        if len(candidate[0]) <= 200:
            continue
        if len(candidate[1]) > len(best[1]):
            best = candidate

    self._assemble_seq = best[1]
    return