import numpy
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def run_mBP_mBPN_pair(f1_in, f1_out, f2_in, f2_out, min_bp_qual_in_read,
                      min_av_read_qual, min_bp_qual_or_N):
    """Filter a read pair on minimum per-base quality and mask low-quality
    bases as N.  (min_av_read_qual is accepted but unused here.)"""
    iter1 = FastqGeneralIterator(f1_in)
    iter2 = FastqGeneralIterator(f2_in)
    for idLine, seqLine, qualLine in iter1:
        idLine2, seqLine2, qualLine2 = next(iter2)
        # Decode Phred+33 quality strings (Sanger / Illumina 1.8+ offset).
        npQualLine = numpy.frombuffer(qualLine.encode("ascii"),
                                      dtype=numpy.uint8) - 33
        npQualLine2 = numpy.frombuffer(qualLine2.encode("ascii"),
                                       dtype=numpy.uint8) - 33
        min1 = numpy.min(npQualLine)   # don't shadow the builtin min()
        min2 = numpy.min(npQualLine2)
        if min1 >= min_bp_qual_in_read and min2 >= min_bp_qual_in_read:
            # Copy before mutating: frombuffer() returns a read-only view.
            npSeqLine = numpy.frombuffer(seqLine.encode("ascii"),
                                         dtype="c").copy()
            npSeqLine[npQualLine < min_bp_qual_or_N] = b"N"
            f1_out.write("@%s\n%s\n+\n%s\n"
                         % (idLine, npSeqLine.tobytes().decode("ascii"),
                            qualLine))
            npSeqLine2 = numpy.frombuffer(seqLine2.encode("ascii"),
                                          dtype="c").copy()
            npSeqLine2[npQualLine2 < min_bp_qual_or_N] = b"N"
            f2_out.write("@%s\n%s\n+\n%s\n"
                         % (idLine2, npSeqLine2.tobytes().decode("ascii"),
                            qualLine2))
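# Usage sketch (hypothetical filenames and thresholds): the function takes
# already-open text-mode handles for the paired input and output files.
# with open("reads_1.fastq") as f1_in, open("reads_2.fastq") as f2_in, \
#         open("filtered_1.fastq", "w") as f1_out, \
#         open("filtered_2.fastq", "w") as f2_out:
#     run_mBP_mBPN_pair(f1_in, f1_out, f2_in, f2_out,
#                       min_bp_qual_in_read=3, min_av_read_qual=20,
#                       min_bp_qual_or_N=15)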
import gzip
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def fastqtrimmer(threeprimetrim, forreads, revreads):
    # Maybe you want to trim the reads from the 3' end before giving them
    # to salmon.  Trim <threeprimetrim> nt from the 3' end of each read
    # (assumes threeprimetrim > 0; a trim of 0 would empty the read).
    threeprimetrim = int(threeprimetrim)
    counter = 0
    foutfilename = 'tempf.fastq'
    routfilename = 'tempr.fastq'
    # Open gzipped inputs in text mode so the iterator sees str, not bytes.
    with gzip.open(forreads, 'rt') as forinfh, \
            gzip.open(revreads, 'rt') as revinfh, \
            open(foutfilename, 'w') as foroutfh, \
            open(routfilename, 'w') as revoutfh:
        try:
            for title, seq, qual in FastqGeneralIterator(forinfh):
                counter += 1
                if counter % 1000000 == 0:
                    print('On read {0} of {1}.'.format(counter, forreads))
                foroutfh.write('@{0}\n{1}\n+\n{2}\n'.format(
                    title, seq[:-threeprimetrim], qual[:-threeprimetrim]))
        except ValueError:
            pass
        try:
            for title, seq, qual in FastqGeneralIterator(revinfh):
                counter += 1
                if counter % 1000000 == 0:
                    print('On read {0} of {1}.'.format(counter, revreads))
                revoutfh.write('@{0}\n{1}\n+\n{2}\n'.format(
                    title, seq[:-threeprimetrim], qual[:-threeprimetrim]))
        except ValueError:
            pass
    print('Done trimming {0} and {1}.'.format(forreads, revreads))
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def FastMaxEEFilter(input, trunclen, maxee, output):
    # Truncate each read to trunclen and keep it only if its expected
    # number of errors (sum of per-base error probabilities) is <= maxee.
    trunclen = int(trunclen)
    maxee = float(maxee)
    with open(output, 'w') as out, open(input, 'r') as file:
        for title, seq, qual in FastqGeneralIterator(file):
            Seq = seq[:trunclen]
            Qual = qual[:trunclen]
            ee = 0.0
            for Q in Qual:
                q = ord(Q) - 33  # Phred+33 score (replaces the ASCII lookup table)
                P = 10 ** (-q / 10.0)
                ee += P
            if ee <= maxee:
                out.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
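# Worked example of the expected-error (EE) criterion above, assuming
# Phred+33: a base with Q=20 has error probability P = 10**(-20/10) = 0.01,
# so a 250 bp read at uniform Q20 accumulates EE = 250 * 0.01 = 2.5 and
# would fail the common maxee=1.0 cutoff (hypothetical filenames below).
# FastMaxEEFilter("raw.fastq", trunclen=250, maxee=1.0, output="filtered.fastq")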
def _fastq_generic(in_handle, out_handle, mapping):
    """FASTQ helper function where can't have data loss by truncation (PRIVATE)."""
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # For real speed, don't even make SeqRecord and Seq objects!
    count = 0
    null = chr(0)
    for title, seq, old_qual in FastqGeneralIterator(in_handle):
        count += 1
        # Map the quality string; out-of-range scores map to NUL.
        qual = old_qual.translate(mapping)
        if null in qual:
            raise ValueError("Invalid character in quality string")
        out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    return count
import gzip
import itertools
import os

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def read_fastq(fname):
    """Provide read info from fastq file, potentially not existing."""
    if not fname:
        # No file: yield empty placeholder records forever.
        for info in itertools.repeat(("", None, None)):
            yield info
    if os.path.splitext(fname)[1] == ".gz":
        open_file = gzip.open
    else:
        open_file = open
    # Text mode so gzip.open yields str rather than bytes.
    with open_file(fname, "rt") as in_handle:
        for info in FastqGeneralIterator(in_handle):
            yield info
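# Usage sketch (hypothetical filename): read_fastq() is a lazy generator,
# so it can be zipped against a second stream; when fname is empty it
# yields blank placeholder records forever, which keeps such a zip in
# step when one mate file is missing.
# for name, seq, qual in read_fastq("sample_1.fastq.gz"):
#     process(name, seq, qual)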
def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
    """Fast FASTQ to simple tabbed conversion (PRIVATE).

    Avoids dealing with the FASTQ quality encoding, and creating SeqRecord
    and Seq objects in order to speed up this conversion.

    NOTE - This does NOT check the characters used in the FASTQ quality
    string are valid!
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # For real speed, don't even make SeqRecord and Seq objects!
    count = 0
    for title, seq, qual in FastqGeneralIterator(in_handle):
        count += 1
        out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
    return count
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def filter_my_fastq_file(in_fastq, trim_len, out_fastq):
    """Parse a FASTQ file and trim a set length (trim_len) off the 5' end
    of every read, writing the result to a new fq file."""
    # create a new fastq file to write to
    out_file = open(out_fastq, "w")
    # open the input fastq file
    in_file = open(in_fastq)
    # iterate through the fastq file
    for title, seq, qual in FastqGeneralIterator(in_file):
        out_file.write("@%s\n%s\n+\n%s\n"
                       % (title, seq[trim_len:], qual[trim_len:]))
    out_file.close()
    in_file.close()
import random

import progressbar
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def RandomReadIndexes(indexFile, indexFileRand, probability):
    # Randomly subsample reads, keeping each with the given probability.
    # GetTotalSeqRecords() and nopen() are helpers defined elsewhere.
    records = GetTotalSeqRecords(indexFile)
    bar = progressbar.ProgressBar(
        maxval=records,
        widgets=[progressbar.Bar(left='<', marker='.', right='>')]).start()
    t = 0
    # Pre-draw one keep/drop decision per record.
    randomSeq = [random.random() < probability for _ in range(records)]
    with open(indexFileRand, "w") as handle:
        for title, seq, qual in FastqGeneralIterator(nopen(indexFile)):
            if randomSeq[t]:
                handle.write("@{}\n{}\n+\n{}\n".format(title, seq, qual))
            bar.update(t)
            t += 1
    bar.finish()
import gzip
import logging
import os
import time
from typing import Dict, Optional, Tuple

from Bio.SeqIO.QualityIO import FastqGeneralIterator

import fastq  # project module providing the Read class


def load_fastq(fastq_file: str,
               pair: Optional[str] = None) -> Tuple[fastq.Read, ...]:
    """Load a FASTQ file"""
    logging.info("Reading in FASTQ file %s", fastq_file)
    read_start = time.time()  # type: float
    reads = {}  # type: Dict[str, fastq.Read]
    if os.path.splitext(fastq_file)[-1] == '.gz':
        my_open = gzip.open
    else:
        my_open = open
    with my_open(fastq_file, 'rt') as ffile:
        for read in FastqGeneralIterator(ffile):  # type: Tuple[str, str, str]
            name, seq, qual = read
            reads[name] = fastq.Read(read_id=name, seq=seq, qual=qual)
    logging.debug("Reading in FASTQ file %s took %s seconds", fastq_file,
                  round(time.time() - read_start, 3))
    if pair:
        logging.info("Reading in reverse FASTQ file %s", pair)
        reverse_start = time.time()  # type: float
        if os.path.splitext(pair)[-1] == '.gz':
            my_open = gzip.open
        else:
            my_open = open
        with my_open(pair, 'rt') as rfile:
            for rread in FastqGeneralIterator(rfile):
                rname, rseq, rqual = rread
                try:
                    reads[rname].add_reverse(seq=rseq, qual=rqual)
                except KeyError:
                    logging.error("Reverse read %s doesn't match any reads "
                                  "in the forward FASTQ file", rname)
        logging.debug("Reading in reverse FASTQ file took %s seconds",
                      round(time.time() - reverse_start, 3))
    return tuple(reads.values())
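# Usage sketch (hypothetical paths; fastq.Read is the project's own class):
# load a forward/reverse pair, attaching each reverse read to its mate.
# reads = load_fastq("sample_R1.fastq.gz", pair="sample_R2.fastq.gz")
# print(len(reads), "reads loaded")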
import os

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def splitDemux2(input, outputdir):
    # Demultiplex reads into per-sample files based on the
    # 'barcodelabel=...;' tag in each title line.  args is the parsed
    # command line, defined at module level.
    for title, seq, qual in FastqGeneralIterator(open(input)):
        sample = title.split('barcodelabel=')[1]
        sample = sample.replace(';', '')
        if not args.length:
            # Append in text mode ('ab' would require bytes).
            with open(os.path.join(outputdir, sample + '.fastq'), 'a') as output:
                output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        else:
            if len(seq) >= int(args.length):
                with open(os.path.join(outputdir, sample + '.fastq'), 'a') as output:
                    output.write("@%s\n%s\n+\n%s\n"
                                 % (title, seq[:int(args.length)],
                                    qual[:int(args.length)]))
import gzip
from glob import glob
from os import path
from os.path import abspath, expanduser
from shutil import copyfile

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def main(basename, input_dir, output_dir):
    # Build a new R1 by concatenating the cell barcode (R2) and UMI (R3);
    # the original R1 (cDNA) becomes the new R2.  CONFIG holds the
    # barcode/UMI lengths and is defined at module level.
    input_dir = abspath(expanduser(input_dir))
    output_dir = abspath(expanduser(output_dir))
    output_r1 = path.join(output_dir, basename + "_R1.fastq.gz")
    output_r2 = path.join(output_dir, basename + "_R2.fastq.gz")
    r1_file = glob(input_dir + '/*{}*R1*.fastq.gz'.format(basename))
    r2_file = glob(input_dir + '/*{}*R2*.fastq.gz'.format(basename))
    r3_file = glob(input_dir + '/*{}*R3*.fastq.gz'.format(basename))
    if len(r1_file) != 1:
        raise Exception("Expected exactly one R1 file, found {}".format(len(r1_file)))
    if len(r2_file) != 1:
        raise Exception("Expected exactly one R2 file, found {}".format(len(r2_file)))
    if len(r3_file) != 1:
        raise Exception("Expected exactly one R3 file, found {}".format(len(r3_file)))
    with gzip.open(r2_file[0], 'rt') as barcode_handle, \
            gzip.open(r3_file[0], 'rt') as umi_handle, \
            gzip.open(output_r1, 'wt') as out_handle:
        for barcode_record, umi_record in zip(
                FastqGeneralIterator(barcode_handle),
                FastqGeneralIterator(umi_handle)):
            assert barcode_record[0].split()[0] == umi_record[0].split()[0], \
                "record titles must match"
            seq_string = (barcode_record[1][:CONFIG['barcode-length']]
                          + umi_record[1][:CONFIG['umi-length']])
            qual_string = (barcode_record[2][:CONFIG['barcode-length']]
                           + umi_record[2][:CONFIG['umi-length']])
            out_handle.write("@" + barcode_record[0] + "\n")
            out_handle.write(seq_string + "\n")
            out_handle.write("+\n")
            out_handle.write(qual_string + "\n")
    copyfile(r1_file[0], output_r2)
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def filter_reads(readfile):
    # Keep only reads containing the transposon sequence, trimming
    # everything up to and including the match.  Aligner (an SSW wrapper),
    # tn_seq, min_score, min_match_length, min_remaining_length and
    # filtered_filename are module-level settings.
    print("Filtering reads\n")
    ssw = Aligner(tn_seq)
    total = 0
    matched = 0
    with open(filtered_filename, 'w') as f:
        for title, seq, qual in FastqGeneralIterator(open(readfile)):
            total += 1
            res = ssw.align(seq, min_score, min_match_length)
            if res:
                end = res.query_end + 1
                if len(seq) - end >= min_remaining_length:
                    matched += 1
                    f.write('@%s\n%s\n+\n%s\n' % (title, seq[end:], qual[end:]))
    print("%s of %s reads had the tn seq\n" % (matched, total))
def run_mBP_mRQ_mBPN(f1_in, f1_out, min_bp_qual_in_read, min_av_read_qual,
                     min_bp_qual_or_N):
    # Keep reads whose minimum base quality and mean read quality pass the
    # thresholds, masking remaining low-quality bases as N.
    for idLine, seqLine, qualLine in FastqGeneralIterator(f1_in):
        # Decode Phred+33 quality string (Sanger / Illumina 1.8+ offset).
        npQualLine = numpy.frombuffer(qualLine.encode("ascii"),
                                      dtype=numpy.uint8) - 33
        min_qual = numpy.min(npQualLine)  # don't shadow the builtin min()
        if min_qual >= min_bp_qual_in_read:
            mean_qual = numpy.mean(npQualLine)
            if mean_qual >= min_av_read_qual:
                # Copy before mutating: frombuffer() gives a read-only view.
                npSeqLine = numpy.frombuffer(seqLine.encode("ascii"),
                                             dtype="c").copy()
                npSeqLine[npQualLine < min_bp_qual_or_N] = b"N"
                f1_out.write("@%s\n%s\n+\n%s\n"
                             % (idLine,
                                npSeqLine.tobytes().decode("ascii"),
                                qualLine))
import os.path as op

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def split_barcode(t):
    # t is a (barcode, excludebarcode, outdir, inputfile) tuple so the
    # function can be used with multiprocessing map().  must_open() and
    # is_barcode_sample() are project helpers.
    barcode, excludebarcode, outdir, inputfile = t
    trim = len(barcode.seq)
    outfastq = op.join(outdir, "{0}.{1}.fastq".format(barcode.id, barcode.seq))
    fp = must_open(inputfile)
    fw = open(outfastq, "w")
    for title, seq, qual in FastqGeneralIterator(fp):
        if not is_barcode_sample(seq, barcode, excludebarcode, trim):
            continue
        print("@{0}\n{1}\n+\n{2}".format(title, seq[trim:], qual[trim:]),
              file=fw)
    fw.close()
import numpy as np
from Bio import SeqIO, pairwise2
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def analysis1():
    # Align every read against the emx1 reference and record the best
    # alignment score.  reads_fa_file and blosum62 (the substitution
    # matrix) are defined at module level.
    seq1 = SeqIO.read("/data/compbio2/linyi/emx1.fa", "fasta")
    # The fastq file contains 1385196 records.
    scores = np.zeros(1385196)
    count = 0
    with open(reads_fa_file) as in_handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            alignments = pairwise2.align.globalds(seq1.seq, seq,
                                                  blosum62, -10, -0.5)
            scores[count] = alignments[0][2]  # score of the best alignment
            count += 1
    return scores
import optparse

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def main():
    # Parse the command line.
    usage = "usage: %prog [options]"
    p = optparse.OptionParser(usage)
    p.add_option('-i', '--input', help='Input fastq [None,REQD]')
    p.add_option('-s', '--size', type="int", default=1000,
                 help="Minimum size of read to keep [1000]")
    p.add_option('-o', '--output', help='Output fastq [None,REQD]')
    opts, args = p.parse_args()

    with open(opts.input, "r") as fin, open(opts.output, "w") as fout:
        for title, seq, qual in FastqGeneralIterator(fin):
            if len(seq) >= opts.size:
                fout.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def filter_my_fastq_file(in_fastq, number_of_seq, out_fastq):
    number_of_seq = int(number_of_seq)
    # open the input fastq file
    in_file = open(in_fastq)
    # create a new fastq file to write to
    out_file = open(out_fastq, "w")
    # enumerate counts the records as we iterate through the fastq file
    for i, (title, seq, qual) in enumerate(FastqGeneralIterator(in_file)):
        # keep every number_of_seq-th record
        if i % number_of_seq == 0:
            out_file.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    out_file.close()
    in_file.close()
    return True
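# Usage sketch (hypothetical filenames): keeping every 10th record gives a
# roughly 10% systematic subsample of the input file.
# filter_my_fastq_file("big.fastq", 10, "subsampled.fastq")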
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def get_info(handle, barcode=None, UMI=None):
    # Tally read counts and Q20/Q30 base counts, overall and (optionally)
    # within the barcode/UMI positions.  get_qual_num() is a project helper
    # that counts bases at or above a quality cutoff, optionally restricted
    # to a list of positions.
    read_flag = 0
    total_data = 0
    total_q20 = 0
    total_q30 = 0
    barcode_q20 = 0
    barcode_q30 = 0
    UMI_q20 = 0
    UMI_q30 = 0
    remain_q20 = 0
    remain_q30 = 0
    for title, seq, qual in FastqGeneralIterator(handle):
        read_flag += 1
        qual_num = [ord(i) - 33 for i in qual]  # Phred+33
        total_data += len(qual_num)
        total_q20 += get_qual_num(qual_num, 20)
        total_q30 += get_qual_num(qual_num, 30)
        if barcode and UMI:
            barcode_q20 += get_qual_num(qual_num, 20, barcode)
            barcode_q30 += get_qual_num(qual_num, 30, barcode)
            UMI_q20 += get_qual_num(qual_num, 20, UMI)
            UMI_q30 += get_qual_num(qual_num, 30, UMI)
            max_index = max(barcode + UMI)
            remain_q20 += get_qual_num(qual_num, 20, [max_index])
            remain_q30 += get_qual_num(qual_num, 30, [max_index])
        elif barcode:
            barcode_q20 += get_qual_num(qual_num, 20, barcode)
            barcode_q30 += get_qual_num(qual_num, 30, barcode)
            max_index = max(barcode)
            remain_q20 += get_qual_num(qual_num, 20, [max_index])
            remain_q30 += get_qual_num(qual_num, 30, [max_index])
        elif UMI:
            UMI_q20 += get_qual_num(qual_num, 20, UMI)
            UMI_q30 += get_qual_num(qual_num, 30, UMI)
            max_index = max(UMI)
            remain_q20 += get_qual_num(qual_num, 20, [max_index])
            remain_q30 += get_qual_num(qual_num, 30, [max_index])
    if barcode and UMI:
        return (read_flag, total_data, total_q20, total_q30, barcode_q20,
                barcode_q30, UMI_q20, UMI_q30, remain_q20, remain_q30)
    elif barcode:
        return (read_flag, total_data, total_q20, total_q30, barcode_q20,
                barcode_q30, remain_q20, remain_q30)
    elif UMI:
        return (read_flag, total_data, total_q20, total_q30, UMI_q20,
                UMI_q30, remain_q20, remain_q30)
    else:
        return read_flag, total_data, total_q20, total_q30
import concurrent.futures
import contextlib
import itertools

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def adjust_quality_distr(list_AQD_path_input_files, int_AQD_phred_cutoff,
                         int_AQD_input_max_low_q_percent,
                         int_AQD_max_read_filter_percent,
                         int_AQD_read_chunks, int_AQD_number_processes):
    # Build a distribution of per-read low-quality-base percentages (0-100)
    # in parallel, then relax the filter criterion if the requested one
    # would discard more than int_AQD_max_read_filter_percent of the reads.
    # fastq_iterator, fastq_zip_equal, iter_double_chunked,
    # quality_read_chunk and division_zero_tolerant are project helpers.
    list_AQD_quality_distr = [0 for _ in range(101)]
    with contextlib.ExitStack() as stack:
        list_AQD_input_files = [
            stack.enter_context(open(str_AQD_temp_path_input_file, 'r'))
            for str_AQD_temp_path_input_file in list_AQD_path_input_files]
        it_AQD_input_fastq = fastq_iterator(fastq_zip_equal(
            *[FastqGeneralIterator(obj_AQD_temp_input_file)
              for obj_AQD_temp_input_file in list_AQD_input_files]))
        for list_AQD_reads_for_processing in iter_double_chunked(
                it_AQD_input_fastq, int_AQD_read_chunks,
                int_AQD_number_processes):
            with concurrent.futures.ProcessPoolExecutor() as executor:
                list_AQD_read_chunk_distr = executor.map(
                    quality_read_chunk, list_AQD_reads_for_processing,
                    itertools.repeat(int_AQD_phred_cutoff))
            for list_AQD_temp_read_chunk_distr in list_AQD_read_chunk_distr:
                list_AQD_quality_distr = [
                    int_AQD_temp_this_score + int_AQD_temp_total
                    for int_AQD_temp_this_score, int_AQD_temp_total in zip(
                        list_AQD_temp_read_chunk_distr,
                        list_AQD_quality_distr)]
    int_AQD_total_reads = sum(list_AQD_quality_distr)
    int_AQD_reads_pot_filter = 0
    for int_AQD_i in range(101):
        int_AQD_reads_pot_filter += list_AQD_quality_distr[int_AQD_i]
        int_AQD_min_low_q = int_AQD_i
        if division_zero_tolerant(int_AQD_reads_pot_filter,
                                  int_AQD_total_reads) * 100 \
                >= 100 - int_AQD_max_read_filter_percent:
            break
    if int_AQD_min_low_q > int_AQD_input_max_low_q_percent:
        print('Adjusted filter criterion to ' + str(int_AQD_min_low_q) + '%')
        return int_AQD_min_low_q
    else:
        return int_AQD_input_max_low_q_percent
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def write_output(fastx, ftype, comp_vectors, lengths_d, target_range):
    # Print composition vectors for each read until the target range is
    # covered.  print_comp_vectors() is defined elsewhere in the module.
    if ftype == "fastq":
        for read_num, (read_id, seq, qual) in enumerate(
                FastqGeneralIterator(open(fastx))):
            status = print_comp_vectors(read_num, target_range, comp_vectors,
                                        read_id, lengths_d)
            if status == "over":
                break
    elif ftype == "fasta":
        for read_num, (read_id, seq) in enumerate(
                SimpleFastaParser(open(fastx))):
            status = print_comp_vectors(read_num, target_range, comp_vectors,
                                        read_id, lengths_d)
            if status == "over":
                break
import logging
import os.path as op
import sys

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    OptionParser, must_open and FastqLite are jcvi helpers imported
    elsewhere in the module.
    """
    p = OptionParser(splitread.__doc__)
    p.add_option("-n", dest="n", default=76, type="int",
                 help="Split at N-th base position [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement second read [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq, = args
    base = op.basename(pairsfastq).split(".")[0]
    fq1 = base + ".1.fastq"
    fq2 = base + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")
    fp = must_open(pairsfastq)
    n = opts.n
    for name, seq, qual in FastqGeneralIterator(fp):
        name = "@" + name
        rec1 = FastqLite(name, seq[:n], qual[:n])
        rec2 = FastqLite(name, seq[n:], qual[n:])
        if opts.rc:
            rec2.rc()
        print(rec1, file=fw1)
        print(rec2, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def fastqtrimmer(trim, outfile, infile):
    # Truncate every read to its first <trim> nt, skipping reads that are
    # shorter than that or whose sequence and quality lengths disagree.
    trim = int(trim)
    counter = 0
    handle = open(outfile, "w")
    try:
        for title, seq, qual in FastqGeneralIterator(open(infile)):
            counter += 1
            if counter % 1000000 == 0:
                print('On read {0}'.format(counter))
            if len(seq) >= trim and len(seq) == len(qual):
                handle.write("@%s\n%s\n+\n%s\n"
                             % (title, seq[:trim], qual[:trim]))
    except ValueError:
        print("Title and second title line don't match for read {0}.".format(
            title))
    handle.close()
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def _trim_by_read(in_handles, to_trim, min_length):
    """Lazy generator for trimmed reads for all input files.

    _trim_quality() is the module's own trimming helper.
    """
    iterators = [(f, FastqGeneralIterator(h)) for f, h in in_handles.items()]
    f1, x1 = iterators[0]
    for name, seq, qual in x1:
        out = {}
        tseq, tqual = _trim_quality(seq, qual, to_trim, min_length)
        if tseq:
            out[f1] = (name, tseq, tqual)
        # Advance the paired iterators in lockstep with the first file.
        for f2, x2 in iterators[1:]:
            name, seq, qual = next(x2)
            tseq, tqual = _trim_quality(seq, qual, to_trim, min_length)
            if tseq:
                out[f2] = (name, tseq, tqual)
        # Only yield when every file produced a read that survived trimming.
        if len(out) == len(iterators):
            yield out
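# Usage sketch -- the argument shapes here are guesses from the call above
# (to_trim in particular is assumed to be the quality characters to clip):
# feed a dict of filename -> handle; each yielded dict holds one
# synchronised, trimmed read per file.
# handles = {"r1.fastq": open("r1.fastq"), "r2.fastq": open("r2.fastq")}
# for trimmed in _trim_by_read(handles, to_trim=["B"], min_length=20):
#     name, seq, qual = trimmed["r1.fastq"]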
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def gc_content(fastx_fn, ftype):
    # Compute per-read GC fraction and return it as a DataFrame.
    gc = []
    if ftype == "fastq":
        for read_id, seq, qual in FastqGeneralIterator(open(fastx_fn)):
            read_gc = (seq.count("G") + seq.count("C")) / float(len(seq))
            gc.append((read_id.split(" ")[0], read_gc))
    elif ftype == "fasta":
        for read_id, seq in SimpleFastaParser(open(fastx_fn)):
            read_gc = (seq.count("G") + seq.count("C")) / float(len(seq))
            gc.append((read_id.split(" ")[0], read_gc))
    gc_df = pd.DataFrame(gc, columns=["read", "GC"])
    return gc_df
import os

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def convert_illumina_oldstyle(in_file):
    """Convert older Illumina barcoding conventions to current usage."""
    to_remove = ["s_", "_sequence"]
    out_file = in_file
    for rem in to_remove:
        out_file = out_file.replace(rem, "")
    assert out_file != in_file
    if not os.path.exists(out_file):
        with open(in_file) as in_handle:
            with open(out_file, "w") as out_handle:
                for name, seq, qual in FastqGeneralIterator(in_handle):
                    # Move the barcode from the read name onto the sequence,
                    # padding the quality string to match.
                    bc = name.split("#")[1].split("/")[0]
                    seq += bc
                    qual += qual[0] * len(bc)
                    out_handle.write("@%s\n%s\n+\n%s\n" % (name, seq, qual))
    return out_file
import gzip

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def main(fqInputName, outputName, minScore):
    # Copy reads that pass checkHighQual() (defined elsewhere) into a new
    # gzipped FASTQ and report input/output counts plus the fraction kept.
    with gzip.open(outputName, 'wb') as outHandle, \
            gzip.open(fqInputName, 'rt') as inHandle:
        inCounter = 0
        outCounter = 0
        for title, seq, qual in FastqGeneralIterator(inHandle):
            inCounter += 1
            if checkHighQual(qual, minScore=minScore):
                outEntry = '@' + title + '\n' + seq + '\n+\n' + qual + '\n'
                outHandle.write(outEntry.encode())
                outCounter += 1
    print(','.join([fqInputName, str(inCounter), outputName, str(outCounter),
                    format(outCounter / inCounter, '.8f')]))
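# Usage sketch (hypothetical names; checkHighQual comes from the same
# module): filter a gzipped FASTQ down to reads passing minScore and print
# the one-line CSV summary of input/output counts.
# main("input.fastq.gz", "highqual.fastq.gz", minScore=30)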
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def primerStrip(file, GoodOut, BadOut, fwdprimer, revprimer):
    # Strip the forward (and optionally reverse) primer from each read;
    # reads without a forward primer match go to BadOut.  primer and
    # revcomp_lib are project modules, args the parsed command line.
    PL = len(fwdprimer)
    with open(GoodOut, 'w') as good, open(BadOut, 'w') as bad:
        for title, seq, qual in FastqGeneralIterator(open(file)):
            Diffs = primer.MatchPrefix(seq, fwdprimer)
            if Diffs <= args.primer_mismatch:
                Seq = seq[PL:]
                Qual = qual[PL:]
                if revprimer:
                    # Now look for the reverse primer.
                    BestPosRev, BestDiffsRev = primer.BestMatch2(
                        Seq, revcomp_lib.RevComp(revprimer),
                        args.primer_mismatch)
                    if BestPosRev > 0:
                        # Reverse primer found: trim it and what follows.
                        Seq = Seq[:BestPosRev]
                        Qual = Qual[:BestPosRev]
                good.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
            else:
                bad.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
import matplotlib.pyplot as plt
import numpy as np
from Bio.SeqIO.QualityIO import FastqGeneralIterator


def find_percentage(filename):
    # Compute the GC fraction of every read, write the values to
    # results.txt and plot their distribution as a histogram.
    f = open("results.txt", "w")
    resultlist = []
    seqcounterlist = []
    seqlist = []
    seqcounter = 1
    with open(filename) as handle:
        for title, sequence, quality in FastqGeneralIterator(handle):
            counter = 0
            for char in sequence:
                if char == "C" or char == "G":
                    counter += 1
            result = round(counter / len(sequence), 3)
            f.write(str(seqcounter) + " ")
            seqcounterlist.append(seqcounter)
            seqcounter += 1
            resultlist.append(result)
            seqlist.append(sequence)
            f.write(str(result))
            f.write("\n")
            f.write(str(sequence))
            f.write("\n")
    f.close()
    # Plot the GC-fraction distribution.
    plt.hist(resultlist, bins=75)
    plt.xticks(np.arange(0, 1, 0.05))
    plt.xlabel('G/C frequency')
    plt.ylabel('number of sequences')
    plt.title('GC content per read')
    plt.show()
import logging
import os
import sys

from Bio.SeqIO.QualityIO import FastqGeneralIterator


def complete_blocks(args, blocks, fastq_pair):
    """Organise reads into blocks."""
    for block in blocks:
        if len(blocks[block]) > 2:
            blocks[block][3].clear()
    sample = os.path.basename(fastq_pair[0]).split('_')
    if len(sample) > 1:
        sample = '_'.join(sample[:3])
        logging.info("Processing sample {}".format(sample))
    else:
        sys.exit('Cannot deduce sample name from fastq filename {}'.format(
            fastq_pair[0]))
    for fastq_file in fastq_pair:
        with open(fastq_file, "r") as fastq:
            for title, seq, qual in FastqGeneralIterator(fastq):
                # Each read is also stored as a dictionary.
                read = {'name': title.partition(' ')[0], 'seq': seq}
                read['qual'] = qual if args.qualthresh else []
                # Try to match each read with an expected primer.
                read_bases = read['seq']
                primer_key = read_bases[:args.primer_prefix_size]
                if len(blocks.get(primer_key, [])) == 9:
                    # Possible forward primer matched.
                    fseq = blocks[primer_key][7]
                    if fseq == read_bases[:len(fseq)]:
                        if read['name'] not in blocks[primer_key][3]:
                            blocks[primer_key][3][read['name']] = \
                                [read, 0, len(fseq), 0, sample]
                        else:
                            blocks[primer_key][3][read['name']][0] = read
                            blocks[primer_key][3][read['name']][2] = len(fseq)
                elif len(blocks.get(primer_key, [])) == 2:
                    # Possible reverse primer matched.
                    rseq = blocks[primer_key][0]
                    if rseq == read_bases[:len(rseq)]:
                        forward_key = blocks[primer_key][1]
                        if read['name'] not in blocks[forward_key][3]:
                            blocks[forward_key][3][read['name']] = \
                                [0, read, 0, len(rseq), sample]
                        else:
                            blocks[forward_key][3][read['name']][1] = read
                            blocks[forward_key][3][read['name']][3] = len(rseq)
    # For the next stage, we only need the actual blocks.
    result = [b[:5] for b in blocks.values() if len(b) > 2]
    return result
def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
    """Fast FASTQ to FASTA conversion (PRIVATE).

    Avoids dealing with the FASTQ quality encoding, and creating SeqRecord
    and Seq objects in order to speed up this conversion.

    NOTE - This does NOT check the characters used in the FASTQ quality
    string are valid!
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # For real speed, don't even make SeqRecord and Seq objects!
    count = 0
    for title, seq, qual in FastqGeneralIterator(in_handle):
        count += 1
        out_handle.write(">%s\n" % title)
        # Do line wrapping at 60 characters.
        for i in range(0, len(seq), 60):
            out_handle.write(seq[i:i + 60] + "\n")
    return count
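# Usage sketch (hypothetical filenames) for the converter above:
# with open("reads.fastq") as fq, open("reads.fasta", "w") as fa:
#     count = _fastq_convert_fasta(fq, fa)
#     print(count, "records converted")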