def count_qualified_seq(self):
    '''
    Update count information based on quality file and blast result.

    Iterates over the paired records of ``self.seq_file`` and
    ``self.qual_file``.  For every record it:

    * asks ``self._trim_primer`` / ``self._trim_adaptor`` for a hit and
      its coordinates, and appends one row per hit to ``alignment.tsv``;
    * slices the record from the primer end (or 0) up to
      ``adaptor_start - 1`` (or the full length);
    * bumps the ``self.n_reads_*`` counters;
    * writes the trimmed record to ``filtered_seq.fna`` when it is longer
      than ``options.length_cutoff`` AND its mean phred quality is above
      ``options.qual_cutoff``.

    Both output files are created in ``options.outdir``.

    Exits the process with status 1 when ``self.blast_out`` is None or
    when either input file cannot be opened.

    Usage:
        SeqProcessor.count_qualified_seq()
    '''
    if self.blast_out is None:
        print('Please run blast_primer_adaptor() before filtering sequences')
        sys.exit(1)

    # Fail early with a readable message if an input cannot be opened.
    # The handles are closed again immediately; the real pass reopens them.
    try:
        with open(self.seq_file), open(self.qual_file):
            pass
    except (IOError, OSError, TypeError, ValueError):
        print(
            'Error reading sequence file and matched quality file, please double check'
        )
        sys.exit(1)

    filtered_path = options.outdir + os.sep + 'filtered_seq.fna'
    align_path = options.outdir + os.sep + 'alignment.tsv'

    # One ``with`` owns every handle so nothing is left open (or unflushed)
    # if parsing fails half-way through.
    with open(filtered_path, 'w') as outfile_filtered_seq, \
            open(align_path, 'w') as out_align, \
            open(self.seq_file) as seq_handle, \
            open(self.qual_file) as qual_handle:
        out_align.write('seq_id\tstart\tend\talign_to\n')

        for record in PairedFastaQualIterator(seq_handle, qual_handle):
            self.n_reads_total += 1
            if self.n_reads_total % 1000 == 0:
                print("processing read %d ..." % (self.n_reads_total))

            ## search and trim primer and adaptor
            has_primer, primer_start, primer_end = self._trim_primer(record)
            has_adaptor, adaptor_start, adaptor_end = self._trim_adaptor(
                record)

            trimed_start = 0
            trimed_end = len(record.seq)

            if has_primer:
                self.n_reads_with_primer += 1
                trimed_start = primer_end
                out_align.write(record.id + '\t' + str(primer_start) + '\t' +
                                str(primer_end) + '\t' + 'primer\n')

            if has_adaptor:
                self.n_reads_with_adaptor += 1
                # NOTE(review): the -1 presumably converts a 1-based hit
                # start into an exclusive slice end -- confirm against
                # _trim_adaptor.
                trimed_end = adaptor_start - 1
                out_align.write(record.id + '\t' + str(adaptor_start) + '\t' +
                                str(adaptor_end) + '\t' + 'adaptor\n')

            if has_primer and has_adaptor:
                self.n_reads_with_primer_adaptor += 1

            trimed_seq = record[trimed_start:trimed_end]

            # Evaluate each criterion once; both feed a counter and the
            # final keep/discard decision.
            long_enough = len(trimed_seq.seq) > options.length_cutoff
            good_quality = np.mean(
                trimed_seq.letter_annotations["phred_quality"]
            ) > options.qual_cutoff

            if long_enough:
                self.n_reads_gt_100 += 1
            if good_quality:
                self.n_reads_avg_qual_gt_20 += 1
            if long_enough and good_quality:
                SeqIO.write(trimed_seq, outfile_filtered_seq, "fasta")
def combine_fasta_qual(fas, qual, outfile, cores=8):
    """Merge a FASTA file and its QUAL file into a single FASTQ file.

    Args:
        fas: path of the FASTA file.
        qual: path of the matching QUAL file.
        outfile: requested output path; ``".gz"`` is appended when the
            name does not already end with it.
        cores: forwarded unchanged to ``pigz_outfile``.

    Returns:
        Whatever ``pigz_outfile(outfile, cores)`` returns.
        NOTE(review): presumably the path of the compressed file --
        confirm against ``pigz_outfile``.
    """
    # The suffix has to be the string literal ".gz"; a bare ``gz`` name is
    # not defined in this function.
    # NOTE(review): confirm no module-level ``gz`` constant was intended.
    if not outfile.endswith(".gz"):
        outfile = outfile + ".gz"

    # Records are written to the temporary path handed out by
    # file_transaction.
    with file_transaction(outfile) as tx_out:
        with open(fas) as fin, open(qual) as qin, open(tx_out, "w") as oh:
            # SeqIO.write accepts the record iterator directly.
            SeqIO.write(PairedFastaQualIterator(fin, qin), oh, "fastq")

    outfile = pigz_outfile(outfile, cores)
    return outfile
def faqual2fastq(fasta, qual, fastq):
    """Write the records of a FASTA/QUAL pair to a FASTQ file.

    Records are written one at a time; a record whose write raises
    ``ValueError`` is skipped and counted in the module-level
    ``skipCount``.

    Args:
        fasta: path of the FASTA input.
        qual: path of the QUAL input.
        fastq: path of the FASTQ file to create (overwritten).

    Returns:
        The current value of the global ``skipCount``.  Because the
        counter is global it accumulates across calls, and it must
        already be defined at module level before the first call.
    """
    global skipCount
    from Bio.SeqIO.QualityIO import PairedFastaQualIterator

    with open(fastq, 'w') as output, \
            open(fasta) as fasta_handle, \
            open(qual) as qual_handle:
        for rec in PairedFastaQualIterator(fasta_handle, qual_handle):
            try:
                SeqIO.write(rec, output, 'fastq')
            except ValueError:
                # Must be an augmented assignment: a bare ``skipCount + 1``
                # computes the sum and throws it away.
                skipCount += 1
    return skipCount
def combine(fastq_dir, basename):
    """
    Combine the seq and qual file into fastq.

    Reads ``<fastq_dir>/<basename>.seq`` and ``<fastq_dir>/<basename>.qual``
    and writes ``<fastq_dir>/<basename>.fastq``.

    If either input cannot be opened, a two-line message is printed and
    the process exits via ``sys.exit()``.
    """
    prefix = fastq_dir + '/' + basename

    try:
        fastafile = open(prefix + ".seq")
        try:
            qualfile = open(prefix + ".qual")
        except IOError:
            # Do not leave the first handle open when the second one fails.
            fastafile.close()
            raise
    except IOError:
        print("Either the file cannot be opened or there is no corresponding")
        print("seq or quality file for " + basename)
        sys.exit()

    # Context managers close (and flush) every handle, including the
    # output, even when the conversion raises.
    with fastafile, qualfile, open(prefix + ".fastq", "w") as fastqfile:
        rec_iter = PairedFastaQualIterator(fastafile, qualfile)
        SeqIO.write(rec_iter, fastqfile, "fastq")
def convert(input_fn, output_fn, qual_fn=None, input_fmt="fastq",
            output_fmt="fasta", defaultq=40):
    """Convert a sequence file from one format to another.

    Args:
        input_fn: path of the input sequence file.
        output_fn: path of the output file (overwritten).
        qual_fn: path of the quality file; required when ``input_fmt`` is
            ``"fasta-qual"``.
        input_fmt: must be a member of ``CONVERT_INPUT_FMTS``.
        output_fmt: must be a member of ``CONVERT_OUTPUT_FMTS``.
        defaultq: quality value assigned to every position of a record
            that carries no ``phred_quality`` annotation.

    Raises:
        ValueError: on an unknown format, or when ``"fasta-qual"`` input
            is requested without ``qual_fn``.

    Side effects:
        Writes ``"<n> sequences converted"`` to stdout.
    """

    def add_phred_quality(records, defaultq):
        # Fill in a flat default quality so quality-bearing output formats
        # can be written from quality-less input.
        for record in records:
            # ``in`` works on Python 2 and 3; ``has_key`` is Python 2 only.
            if "phred_quality" not in record.letter_annotations:
                record.letter_annotations["phred_quality"] = \
                    [defaultq] * len(record)
            yield record

    if input_fmt not in CONVERT_INPUT_FMTS:
        raise ValueError("invalid input format {}".format(input_fmt))
    if output_fmt not in CONVERT_OUTPUT_FMTS:
        raise ValueError("invalid output format {}".format(output_fmt))
    if input_fmt == "fasta-qual" and qual_fn is None:
        raise ValueError("input format 'fasta-qual' requires an input "
                         "quality file")

    # Text modes throughout: 'rU' was removed in Python 3.11, and SeqIO
    # writes str, which a 'wb' handle rejects on Python 3.
    qual_handle = None
    input_handle = open(input_fn, "r")
    try:
        # parse records
        if input_fmt == "fasta-qual":
            qual_handle = open(qual_fn, "r")
            records = PairedFastaQualIterator(input_handle, qual_handle)
        else:
            records = SeqIO.parse(input_handle, input_fmt)

        # write records
        with open(output_fn, "w") as output_handle:
            count = SeqIO.write(add_phred_quality(records, defaultq),
                                output_handle, output_fmt)
    finally:
        # close files even when parsing or writing fails
        input_handle.close()
        if qual_handle is not None:
            qual_handle.close()

    sys.stdout.write("{:d} sequences converted\n".format(count))
#!/usr/bin/env python
import os
import sys
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

# Takes a FASTA file, which must have a corresponding .qual file,
# and makes a single FASTQ file.
#
# For input "<name>.<ext>" the script reads "<name>.qual" and writes
# "<name>.fastq".  Usage and open errors exit with status 1.

if len(sys.argv) == 1:
    print("Please specify a single FASTA file to convert.")
    sys.exit(1)

filetoload = sys.argv[1]

# Chop the extension to get names for output files.  os.path.splitext only
# looks at the last path component, so "./reads" or "run.1/reads" keep
# their directory part intact.
basename = os.path.splitext(filetoload)[0]

try:
    fastafile = open(filetoload)
    try:
        qualfile = open(basename + ".qual")
    except IOError:
        fastafile.close()
        raise
except IOError:
    print("Either the file cannot be opened or there is no corresponding")
    print("quality file (" + basename + ".qual)")
    sys.exit(1)

# The with-statement closes all three handles, so the FASTQ output is
# flushed even if the conversion raises.
with fastafile, qualfile, open(basename + ".fastq", "w") as fastqfile:
    rec_iter = PairedFastaQualIterator(fastafile, qualfile)
    SeqIO.write(rec_iter, fastqfile, "fastq")
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# Command-line arguments:
#   1: common prefix of the input pair (<prefix>.fasta / <prefix>.qual)
#   2: quality cutoff; bases with a phred score below it become "N"
#   3: maximum percentage of "N" allowed in a kept sequence
files = sys.argv[1]
cutoff = int(sys.argv[2])
Npercent = float(sys.argv[3])

# prepare the output file
outname = files + ".q" + str(cutoff) + ".pN" + str(int(Npercent)) + ".fasta"

# number of N in each sequence that was written out
countN = []

# All three handles are closed by the with-statement, also on error.
with open(outname, "w") as output_handle, \
        open(files + ".fasta") as fasta_handle, \
        open(files + ".qual") as qual_handle:
    records = PairedFastaQualIterator(fasta_handle, qual_handle)
    for record in records:
        qualities = record.letter_annotations['phred_quality']
        # Mask every base whose quality is below the cutoff.
        masked = [
            "N" if quality < cutoff else base
            for base, quality in zip(record, qualities)
        ]
        # Leading/trailing N carry no information: trim them.
        snew = "".join(masked).strip("N")
        if not snew:
            continue
        nbN = snew.count("N")
        # NOTE(review): the write is assumed to be gated by the N-percent
        # test; the original indentation was not available -- confirm.
        if float(nbN) / len(snew) < Npercent / 100:
            countN.append(nbN)
            newrecord = SeqRecord(Seq(snew), id=record.id,
                                  description="length=" + str(len(snew)))
            SeqIO.write(newrecord, output_handle, "fasta")
# Per-sample contamination cutting (script fragment).
#
# Derives four sub-directory paths from myPath ("fasta/", "qscore/", "vs/",
# "fastq/") and lists the regular files of the fasta directory with their
# last six characters removed (six is the length of ".fasta").
# For every such name the loop:
#   * feeds the lines of <vs>/<name>.vs to findContamination();
#   * pairs <fasta>/<name>.fasta with <qscore>/<name>.qscore and writes the
#     records to "temp.fastq" in the working directory;
#   * re-parses temp.fastq, calls cutter() when contam_location is not None
#     (otherwise increments no_cuts), and writes out[0] to
#     <fastq>/<name>.fastq when it is not None.
#
# NOTE(review): the handles opened for the .vs, .fasta, .qscore and output
#   .fastq files are never closed in the visible code.
# NOTE(review): f[:-6] assumes every file in the fasta directory ends in
#   ".fasta"; other names would be truncated incorrectly.
# NOTE(review): the original indentation is lost here, so whether `break`
#   and the trailing `if` blocks sit inside the inner for-loop cannot be
#   determined from this view -- confirm before restructuring.
# NOTE(review): the bare `print "..."` statement is Python 2 syntax.
myFastaPath = myPath + "fasta/" myQualityPath = myPath + "qscore/" myVSPath = myPath + "vs/" myFastQPath = myPath + "fastq/" onlyfiles = [ f[:-6] for f in listdir(myFastaPath) if isfile(join(myFastaPath, f)) ] for myFl in onlyfiles: fastaFile = os.path.join(myFastaPath, myFl + '.fasta') qscoreFile = os.path.join(myQualityPath, myFl + '.qscore') vsFile = open(os.path.join(myVSPath, myFl + '.vs')) contam_location = findContamination(vsFile.readlines()) records = PairedFastaQualIterator(open(fastaFile), open(qscoreFile)) handle = open("temp.fastq", "w") count = SeqIO.write(records, handle, "fastq") handle.close() for rec in SeqIO.parse("temp.fastq", "fastq"): out = [rec, "No cuts: ", []] if (contam_location != None): out = cutter(contam_location, rec, fastaFile) else: no_cuts += 1 break if out[0] != None: fastqFile = open(os.path.join(myFastQPath, myFl + '.fastq'), 'w') count = SeqIO.write(out[0], fastqFile, "fastq") if count != 1: print "Error: there can be only one sequence " + fastaFile
#!/usr/bin/env python
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator
import sys
import gzip

# Merge a FASTA file and its QUAL file and write the FASTQ records to
# stdout.
if len(sys.argv) != 3:
    # stdout carries the FASTQ data, so diagnostics go to stderr and the
    # exit status is non-zero; a shell redirect must not hide the failure.
    sys.stderr.write("ERROR: Incorrect number of files\n")
    sys.stderr.write("Usage:" + sys.argv[0] + " file.fasta file.qual\n")
    sys.stderr.write("Fastq file will be written to stdout\n")
    sys.exit(1)

# Both inputs are closed once the last record has been written.
with open(sys.argv[1]) as fasta_in, open(sys.argv[2]) as qual_in:
    record_iterator = PairedFastaQualIterator(fasta_in, qual_in)
    SeqIO.write(record_iterator, sys.stdout, "fastq")
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

# Round-trip demo on SRR020192: split a FASTQ file into FASTA + QUAL, then
# merge the two back into a new FASTQ file.

# FASTQ > FASTA
SeqIO.convert("SRR020192.fastq", "fastq", "SRR020192.fasta", "fasta")

# FASTQ > QUAL
SeqIO.convert("SRR020192.fastq", "fastq", "SRR020192.qual", "qual")

# FASTA + QUAL > FASTQ
# The with-statement closes the output and both inputs, also on error.
with open("novo_fastq.fastq", "w") as fastq2, \
        open("SRR020192.fasta") as fasta_in, \
        open("SRR020192.qual") as qual_in:
    rec = PairedFastaQualIterator(fasta_in, qual_in)
    i = SeqIO.write(rec, fastq2, "fastq")

# Single parenthesised argument: valid under both Python 2 and Python 3.
print("Foram convertidas %i sequencias FASTA + QUAL em formato FASTQ" % i)
# Quality-window chopping (fragment: begins inside an expression and ends
# inside the `elif` branch, so only the visible part is described).
#
# Finishes building the output quality-file name from filename_base,
# "T<quality_threshold>", "W<window_size>" and filename_ext, opens it for
# writing, then walks the paired FASTA/QUAL records.  For each record,
# find_chop_position() is applied to the windowed quality list and returns
# (lhs, rhs).  With args.debug set, "<id>,<lhs>,<rhs>" goes to stderr.
# A record whose lhs equals the length of its quality list is skipped
# (reported as "abandoned" in debug mode).  When lhs or rhs differ from the
# full range, the quality list is sliced to [lhs:rhs].
#
# NOTE(review): `threshold` is computed as a float, but the visible call to
#   find_chop_position() passes args.quality_threshold instead -- confirm
#   which one is intended.
# NOTE(review): the FASTA and QUAL handles opened inline in the for-header
#   are not closed in the visible code.
args.output_dir) + '/' + filename_base + 'T' + str( args.quality_threshold) + 'W' + str(args.window_size) #if args.number_N != sys.maxint: # output_qualityfile += str(args.number_N) output_qualityfile += filename_ext output_quality_handler = open(output_qualityfile, 'w') threshold = float(args.quality_threshold) #number_N = args.number_N # Chop the sequence and quality from Bio.SeqIO.QualityIO import PairedFastaQualIterator from Bio import SeqIO import re re_pattern = re.compile(r'(length=)\d+(.*)') for seq_qual_record in PairedFastaQualIterator(open(args.fastafilename), open(args.qualityfilename)): qual_list = seq_qual_record.letter_annotations['phred_quality'] lhs, rhs = find_chop_position(window_qual(qual_list, args.window_size), args.quality_threshold) if args.debug: print('{0},{1},{2}'.format(seq_qual_record.id, lhs, rhs), file=sys.stderr) if lhs == len(qual_list): if args.debug: print('Sequence ' + seq_qual_record.id + ' is abandoned', file=sys.stderr) continue elif lhs != 0 or rhs != len(qual_list): new_qual = qual_list[lhs:rhs] description = seq_qual_record.description
# In[2]:

print(Bio.__version__)

# In[3]:

# The answers to the first 6 questions are written to summary.txt in
# append mode, so a file left over from a previous run is deleted first.
if os.path.isfile("./summary.txt"):
    os.remove("./summary.txt")

# In[4]:

# Read the fna and qual files through the SeqRecord iterator provided by
# the BioPython package.  The iterator is consumed inside the with-block,
# so both input handles are closed as soon as the list has been built.
with open("test.fna") as fna_handle, open("test.qual") as qual_handle:
    paired_fasta_qual_iterator = PairedFastaQualIterator(fna_handle,
                                                         qual_handle)
    # The list of SeqRecord objects is used throughout this script.
    paired_fasta_qual_list = list(paired_fasta_qual_iterator)

# In[5]:

# Question 01: Total number of reads in the original dataset
tally_register_01 = len(paired_fasta_qual_list)
print(tally_register_01)
current_output_text = "Total number of reads in the original dataset: " + str(
    tally_register_01) + "\n"
with open("summary.txt", "a+t") as summary_output_file:
    summary_output_file.write(current_output_text)
def fastaqual_to_fastq(fastafile, qualfile, title2ids=None):
    """Pair a FASTA source with its QUAL source.

    Thin wrapper: the three arguments are forwarded unchanged to
    ``PairedFastaQualIterator`` and its result is returned as-is.

    Args:
        fastafile: FASTA input, passed through untouched.
        qualfile: QUAL input, passed through untouched.
            NOTE(review): Biopython documents both as open handles --
            confirm what callers pass.
        title2ids: forwarded as the ``title2ids`` keyword (default None).
    """
    return PairedFastaQualIterator(fastafile, qualfile, title2ids=title2ids)
#!/usr/bin/env python
# This script converts an fna/qual file pair into a fastq file.
# usage: python 454_to_fastq.py sample.fna sample.qual
#
# The output is written next to the .fna input.  Its name is the input
# file name without its last extension, with any remaining dots replaced
# by underscores, plus ".fastq" (e.g. "run.1.fna" -> "run_1.fastq").
import os
import sys
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

if len(sys.argv) < 3:
    # sys.exit(<str>) prints the text to stderr and exits with status 1.
    sys.exit("usage: python 454_to_fastq.py sample.fna sample.qual")

# Work on the file name only: splitting the whole path on "." mangles
# inputs such as "./sample.fna" or "../data/sample.fna".
directory, leaf = os.path.split(sys.argv[1])
stem = os.path.splitext(leaf)[0].replace('.', '_')
filename = os.path.join(directory, stem + '.fastq')

# The with-statement closes the output and both inputs, also on error.
with open(filename, "w") as handle, \
        open(sys.argv[1]) as fna_handle, \
        open(sys.argv[2]) as qual_handle:
    records = PairedFastaQualIterator(fna_handle, qual_handle)
    count = SeqIO.write(records, handle, "fastq")

print("Converted %i records" % count)
# Pipeline set-up (fragment: the final list literal is cut off, so only the
# visible part is described).
#
# * Wraps the primer and adaptor arguments in Seq objects and creates the
#   "blast" folder if it does not exist.
# * Defines the three numbered output file names.
# * If the path in `fastq` does not exist, merges `fna` + `qual` into it
#   (timed with timeit.default_timer) and prints how many entries were
#   written; the input handles are closed by the with-statement.
# * Then parses `fastq`, zeroes the counters c1..c7 and cb, and opens the
#   BLAST result file, starting a tab-joined header row.
#
# NOTE(review): `fb` is opened without a context manager; whether it is
#   closed later is not visible from here.
# NOTE(review): in the first inline comment "is not found" presumably
#   means "if not found".
primer = Seq(args.primer) adaptor = Seq(args.adaptor) web_access = args.web # set to false to only process existing blast results blast_folder = "blast" if not os.path.exists(blast_folder): os.mkdir(blast_folder) blast_result_file = "1.blast_m8.txt" filter_trim_file = "2.filter_trim.fna" primer_adaptor_file = "3.primer_adaptor_loc.txt" # merge fna and qual files and write into one fastq file is not found, otherwise directly parse fastq if not os.path.exists( fastq ): # pair sequence and quality files into one fastq file, skip if available t0 = timeit.default_timer() with open(fna) as f_handle, open(qual) as q_handle: records = PairedFastaQualIterator(f_handle, q_handle) count = SeqIO.write(records, fastq, "fastq") print( f'{count:,} entries were written to {fastq} in {timeit.default_timer()-t0:.2f} seconds.' ) fq = SeqIO.parse( fastq, "fastq" ) # once the fastq is generated, this step directly parse the fastq file # set counters to zeros, initialize filewriters c1, c2, c3, c4, c5, c6, c7 = 0, 0, 0, 0, 0, 0, 0 fb, cb = open(blast_result_file, 'w'), 0 fb.write('\t'.join([ "query", "subject", "%id", "alignment_length", "mismatches", "gap_openings", "query_start", "query_end", "subject_start", "subject_end", "E_value", "bit_score"
#!/usr/bin/env python
"""
Convert FASTA + QUAL file pairs to a single FASTQ file
http://seqanswers.com/forums/showthread.php?t=16925

You can use this script from the shell like this::

    $ ./fasta_to_fastaq reads.fna reads.qual reads.fastq
"""

# The libraries we need #
import sys
import os
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

# Get the shell arguments #
if len(sys.argv) < 4:
    # Show the usage text above instead of failing with an IndexError;
    # sys.exit(<str>) prints to stderr and exits with status 1.
    sys.exit(__doc__)
fa_path = sys.argv[1]
qa_path = sys.argv[2]
fq_path = sys.argv[3]

# Check that the paths are valid #
if not os.path.exists(fa_path):
    raise Exception("No file at %s." % fa_path)
if not os.path.exists(qa_path):
    raise Exception("No file at %s." % qa_path)

# Do it #
# All three handles are closed by the with-statement, also on error.
with open(fq_path, "w") as handle, \
        open(fa_path) as fa_handle, \
        open(qa_path) as qa_handle:
    records = PairedFastaQualIterator(fa_handle, qa_handle)
    count = SeqIO.write(records, handle, "fastq")

# Report success #
# Single parenthesised argument: valid under both Python 2 and Python 3.
print("Converted %i records" % count)
# Command-line FASTA + QUAL -> FASTQ conversion (fragment).
#
# Parses -f/--inputfasta, -q/--inputqual and -o/--outputprefix, requires
# the first two, defaults the prefix to the FASTA path without its last
# dot-separated component, and writes "<prefix>.fastq".
#
# NOTE(review): the trailing `else: pass` has no visible matching `if` or
#   `try` at its level (indentation is lost), so the nesting of the
#   conversion step cannot be determined from this view.
# NOTE(review): the required arguments are checked with `assert`, which is
#   stripped under `python -O`; argparse's required=True would enforce it
#   unconditionally.
# NOTE(review): the inline comment "Use Same as FASTQ" presumably means
#   "same as FASTA", since the prefix is derived from `fasta`.
# NOTE(review): the handles opened for `fasta` and `qual` are never closed
#   in the visible code.
parser = argparse.ArgumentParser() parser.add_argument("-f", "--inputfasta", type = str, help = "Input Fasta File") parser.add_argument("-q", "--inputqual", type = str, help = "Input Qual File") parser.add_argument("-o", "--outputprefix", type = str, help = "Prefix to Output FastQ File") argsDict = vars(parser.parse_args()) fasta = argsDict["inputfasta"] qual = argsDict["inputqual"] prefix = argsDict["outputprefix"] # Assertions for Required Input assert (fasta is not None), "No Fasta input provided!" assert (qual is not None), "No Qual input provided!" # If No Prefix, Use Same as FASTQ if prefix is None: prefix = ".".join(fasta.split(".")[0:-1]) # # Conversion # # Merge Fasta & Qual into FastQ records = PairedFastaQualIterator(open(fasta), open(qual)) SeqIO.write(records, prefix + ".fastq", "fastq") else: pass