예제 #1
0
    def count_qualified_seq(self):
        '''
        Update count information based on quality file and blast result.

        Usage: SeqProcessor.count_qualified_seq()

        Each read paired from ``self.seq_file`` / ``self.qual_file`` is
        passed to ``self._trim_primer`` and ``self._trim_adaptor``; the
        ``n_reads_*`` counters on ``self`` are incremented accordingly.
        Primer/adaptor hits are logged to ``alignment.tsv`` and reads whose
        trimmed length and mean quality both exceed the cutoffs in
        ``options`` are written to ``filtered_seq.fna`` (both files live in
        ``options.outdir``).

        Exits the process with status 1 when ``self.blast_out`` is None or
        when the sequence/quality files cannot be opened.
        '''
        if self.blast_out is None:
            print(
                'Please run blast_primer_adaptor() before filtering sequences')
            sys.exit(1)
        try:
            seq_handle = open(self.seq_file)
            qual_handle = open(self.qual_file)
        except Exception:
            print(
                'Error reading sequence file and matched quality file, please double check'
            )
            sys.exit(1)

        filtered_path = options.outdir + os.sep + 'filtered_seq.fna'
        align_path = options.outdir + os.sep + 'alignment.tsv'
        # All four handles are closed when the block exits, even on error.
        with seq_handle, qual_handle, \
                open(filtered_path, 'w') as outfile_filtered_seq, \
                open(align_path, 'w') as out_align:
            out_align.write('seq_id\tstart\tend\talign_to\n')
            for record in PairedFastaQualIterator(seq_handle, qual_handle):
                self.n_reads_total += 1
                if self.n_reads_total % 1000 == 0:
                    print("processing read %d ..." % (self.n_reads_total))

                # Search for primer and adaptor hits on this read.
                has_primer, primer_start, primer_end = self._trim_primer(
                    record)
                has_adaptor, adaptor_start, adaptor_end = self._trim_adaptor(
                    record)

                # By default keep the whole read; a primer hit moves the
                # start, an adaptor hit moves the end.
                trimed_start = 0
                trimed_end = len(record.seq)
                if has_primer:
                    self.n_reads_with_primer += 1
                    trimed_start = primer_end
                    out_align.write(record.id + '\t' + str(primer_start) +
                                    '\t' + str(primer_end) + '\t' +
                                    'primer\n')

                if has_adaptor:
                    self.n_reads_with_adaptor += 1
                    trimed_end = adaptor_start - 1
                    out_align.write(record.id + '\t' + str(adaptor_start) +
                                    '\t' + str(adaptor_end) + '\t' +
                                    'adaptor\n')

                if has_primer and has_adaptor:
                    self.n_reads_with_primer_adaptor += 1

                trimed_seq = record[trimed_start:trimed_end]
                # Evaluate each criterion once and reuse the result.
                long_enough = len(trimed_seq.seq) > options.length_cutoff
                good_quality = np.mean(
                    trimed_seq.letter_annotations["phred_quality"]
                ) > options.qual_cutoff

                if long_enough:
                    self.n_reads_gt_100 += 1
                if good_quality:
                    self.n_reads_avg_qual_gt_20 += 1
                if long_enough and good_quality:
                    SeqIO.write(trimed_seq, outfile_filtered_seq, "fasta")
예제 #2
0
def combine_fasta_qual(fas, qual, outfile, cores=8):
    """Merge a FASTA file and its QUAL file into FASTQ and pass it to pigz_outfile.

    Args:
        fas: path of the FASTA file.
        qual: path of the matching QUAL file.
        outfile: destination path; ".gz" is appended when it is missing.
        cores: forwarded to ``pigz_outfile``.

    Returns:
        Whatever ``pigz_outfile(outfile, cores)`` returns.
    """
    # Compare against the suffix string; a bare name `gz` is not defined
    # in this block.
    if not outfile.endswith(".gz"):
        outfile = outfile + ".gz"

    with file_transaction(outfile) as tx_out:
        with open(fas) as fin, open(qual) as qin, open(tx_out, "w") as oh:
            SeqIO.write(PairedFastaQualIterator(fin, qin), oh, "fastq")
    return pigz_outfile(outfile, cores)
예제 #3
0
파일: amptklib.py 프로젝트: irawand07/amptk
def faqual2fastq(fasta, qual, fastq):
    """Write the records paired from a FASTA and a QUAL file as FASTQ.

    Records for which ``SeqIO.write`` raises ``ValueError`` are skipped and
    counted in the module-level ``skipCount``.

    Args:
        fasta: path of the FASTA input.
        qual: path of the QUAL input.
        fastq: path of the FASTQ file to create.

    Returns:
        The updated value of the global ``skipCount``.
    """
    global skipCount
    from Bio.SeqIO.QualityIO import PairedFastaQualIterator
    with open(fasta) as fasta_in, open(qual) as qual_in, \
            open(fastq, 'w') as output:
        for rec in PairedFastaQualIterator(fasta_in, qual_in):
            try:
                SeqIO.write(rec, output, 'fastq')
            except ValueError:
                # Must be an augmented assignment: a bare `skipCount + 1`
                # computes a value and throws it away.
                skipCount += 1
    return skipCount
예제 #4
0
def combine(fastq_dir, basename):
    """
    Combine the seq and qual file into fastq.

    Reads ``<fastq_dir>/<basename>.seq`` and ``<fastq_dir>/<basename>.qual``
    and writes ``<fastq_dir>/<basename>.fastq``. Prints a message and calls
    ``sys.exit()`` when either input cannot be opened.
    """
    prefix = fastq_dir + '/' + basename
    try:
        fastafile = open(prefix + ".seq")
        qualfile = open(prefix + ".qual")
    except IOError:
        print("Either the file cannot be opened or there is no corresponding")
        print("seq or quality file for " + basename)
        sys.exit()
    # Close all three handles deterministically so the FASTQ is flushed
    # before the function returns.
    with fastafile, qualfile, open(prefix + ".fastq", "w") as fastqfile:
        rec_iter = PairedFastaQualIterator(fastafile, qualfile)
        SeqIO.write(rec_iter, fastqfile, "fastq")
예제 #5
0
def convert(input_fn,
            output_fn,
            qual_fn=None,
            input_fmt="fastq",
            output_fmt="fasta",
            defaultq=40):
    """Convert a sequence file between formats and report the record count.

    Records that carry no "phred_quality" letter annotation are given a
    flat quality of ``defaultq`` at every position before being written.

    Args:
        input_fn: path of the input sequence file.
        output_fn: path of the file to write.
        qual_fn: path of the quality file; required when ``input_fmt`` is
            "fasta-qual".
        input_fmt: must be a member of ``CONVERT_INPUT_FMTS``.
        output_fmt: must be a member of ``CONVERT_OUTPUT_FMTS``.
        defaultq: quality value used for records without qualities.

    Raises:
        ValueError: for an unknown format, or when "fasta-qual" input is
            requested without ``qual_fn``.
    """
    def add_phred_quality(records, defaultq):
        # `in` works on Python 2 and 3; dict.has_key() exists only on 2.
        for record in records:
            if "phred_quality" not in record.letter_annotations:
                record.letter_annotations["phred_quality"] = \
                  [defaultq] * len(record)
            yield record

    if input_fmt not in CONVERT_INPUT_FMTS:
        raise ValueError("invalid input format {}".format(input_fmt))

    if output_fmt not in CONVERT_OUTPUT_FMTS:
        raise ValueError("invalid output format {}".format(output_fmt))

    if (input_fmt == "fasta-qual") and (qual_fn is None):
        raise ValueError("input format 'fasta-qual' requires an input "
                         "quality file")

    # Plain text mode: the 'U' flag was removed in Python 3.11 and universal
    # newlines are already the default for text files on Python 3.
    input_handle = open(input_fn)
    qual_handle = None
    try:
        # parse records
        if input_fmt == "fasta-qual":
            qual_handle = open(qual_fn)
            records = PairedFastaQualIterator(input_handle, qual_handle)
        else:
            records = SeqIO.parse(input_handle, input_fmt)

        # write records
        # NOTE(review): opened in text mode on the assumption that every
        # entry of CONVERT_OUTPUT_FMTS is a text format -- confirm.
        with open(output_fn, 'w') as output_handle:
            count = SeqIO.write(add_phred_quality(records, defaultq),
                                output_handle, output_fmt)
    finally:
        # close input files even when parsing or writing fails
        if qual_handle is not None:
            qual_handle.close()
        input_handle.close()

    sys.stdout.write("{:d} sequences converted\n".format(count))
예제 #6
0
#!/usr/bin/env python
import sys
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

#Takes a FASTA file, which must have a corresponding .qual file,
# and makes a single FASTQ file.

import os

# Usage: pass one FASTA file; a .qual file with the same base name must sit
# next to it. The FASTQ is written to <base name>.fastq.
if len(sys.argv) == 1:
    print("Please specify a  single FASTA file to convert.")
    sys.exit()

filetoload = sys.argv[1]

# Chop the extension to get names for output files. splitext only looks at
# the final path component, so dots in directory names (or a leading "./")
# do not truncate the path.
basename = os.path.splitext(filetoload)[0]

try:
    fastafile = open(filetoload)
    qualfile = open(basename + ".qual")
except IOError:
    print("Either the file cannot be opened or there is no corresponding")
    print("quality file (" + basename + ".qual)")
    sys.exit()

# Close every handle once the conversion is done.
with fastafile, qualfile, open(basename + ".fastq", "w") as fastqfile:
    rec_iter = PairedFastaQualIterator(fastafile, qualfile)
    SeqIO.write(rec_iter, fastqfile, "fastq")
예제 #7
0
파일: MaskFasta.py 프로젝트: nixnmtm/bioman
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# reads commands: <file prefix> <quality cutoff> <maximum percentage of N>
files = sys.argv[1]
cutoff = int(sys.argv[2])
Npercent = float(sys.argv[3])

# prepare the output file
outname = files + ".q" + str(cutoff) + ".pN" + str(int(Npercent)) + ".fasta"

# Walk the paired FASTA/QUAL records, replace every base whose quality is
# below the cutoff with "N", strip leading/trailing N and keep the read only
# when its N fraction is below Npercent.
countN = []
with open(files + ".fasta") as fasta_handle, \
        open(files + ".qual") as qual_handle, \
        open(outname, "w") as output_handle:
    records = PairedFastaQualIterator(fasta_handle, qual_handle)
    for record in records:
        quals = record.letter_annotations['phred_quality']
        masked = ["N" if q < cutoff else base
                  for base, q in zip(record, quals)]
        snew = "".join(masked).strip("N")
        if not snew:
            # nothing left after masking and trimming
            continue
        nbN = snew.count("N")
        if (float(nbN) / len(snew)) < (Npercent / 100):
            countN.append(nbN)
            newrecord = SeqRecord(Seq(snew), id=record.id,
                                  description="length=" + str(len(snew)))
            SeqIO.write(newrecord, output_handle, "fasta")
예제 #8
0
    # Sub-directories of myPath, one per kind of per-sample file.
    myFastaPath = myPath + "fasta/"
    myQualityPath = myPath + "qscore/"
    myVSPath = myPath + "vs/"
    myFastQPath = myPath + "fastq/"

    # Names of the regular files in the fasta directory with their last six
    # characters removed (six is the length of ".fasta", re-appended below).
    onlyfiles = [
        f[:-6] for f in listdir(myFastaPath) if isfile(join(myFastaPath, f))
    ]

    for myFl in onlyfiles:
        fastaFile = os.path.join(myFastaPath, myFl + '.fasta')
        qscoreFile = os.path.join(myQualityPath, myFl + '.qscore')
        # NOTE(review): this handle and the two opened for the iterator
        # below are not closed in the visible code.
        vsFile = open(os.path.join(myVSPath, myFl + '.vs'))

        contam_location = findContamination(vsFile.readlines())
        # Pair sequence and quality, then round-trip through temp.fastq.
        records = PairedFastaQualIterator(open(fastaFile), open(qscoreFile))
        handle = open("temp.fastq", "w")
        count = SeqIO.write(records, handle, "fastq")
        handle.close()
        # NOTE(review): the unconditional break means only the first record
        # of temp.fastq is examined; if the file holds no record, `out` is
        # not assigned in this iteration.
        for rec in SeqIO.parse("temp.fastq", "fastq"):
            out = [rec, "No cuts: ", []]
            if (contam_location != None):
                out = cutter(contam_location, rec, fastaFile)
            else:
                no_cuts += 1
            break
        if out[0] != None:
            fastqFile = open(os.path.join(myFastQPath, myFl + '.fastq'), 'w')
            count = SeqIO.write(out[0], fastqFile, "fastq")
            if count != 1:
                print "Error: there can be only one sequence " + fastaFile
예제 #9
0
#!/usr/bin/env python

from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator
import sys
import gzip

# Usage errors go to stderr with a non-zero exit status: stdout is the FASTQ
# data channel, so diagnostics written there would end up in the output.
if len(sys.argv) != 3:
    sys.stderr.write("ERROR: Incorrect number of files\n")
    sys.stderr.write("Usage:" + sys.argv[0] + " file.fasta file.qual\n")
    sys.stderr.write("Fastq file will be written to stdout\n")
    sys.exit(1)

# Pair the two inputs and stream the result as FASTQ; both input handles
# are closed when the block exits.
with open(sys.argv[1]) as fasta_in, open(sys.argv[2]) as qual_in:
    record_iterator = PairedFastaQualIterator(fasta_in, qual_in)
    SeqIO.write(record_iterator, sys.stdout, "fastq")
예제 #10
0
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

# FASTQ > FASTA
SeqIO.convert("SRR020192.fastq", "fastq", "SRR020192.fasta", "fasta")

# FASTQ > QUAL
SeqIO.convert("SRR020192.fastq", "fastq", "SRR020192.qual", "qual")

# FASTA + QUAL > FASTQ
# All three handles are closed when the block exits.
with open("SRR020192.fasta") as fasta_in, \
        open("SRR020192.qual") as qual_in, \
        open("novo_fastq.fastq", "w") as fastq2:
    rec = PairedFastaQualIterator(fasta_in, qual_in)
    i = SeqIO.write(rec, fastq2, "fastq")

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Foram convertidas %i sequencias FASTA + QUAL em formato FASTQ" % i)
예제 #11
0
        args.output_dir) + '/' + filename_base + 'T' + str(
            args.quality_threshold) + 'W' + str(args.window_size)
    #if args.number_N != sys.maxint:
    #    output_qualityfile += str(args.number_N)
    output_qualityfile += filename_ext
    output_quality_handler = open(output_qualityfile, 'w')

    # NOTE(review): `threshold` is not used in the visible part of this
    # block; the call below passes args.quality_threshold directly.
    threshold = float(args.quality_threshold)
    #number_N = args.number_N

    # Chop the sequence and quality
    from Bio.SeqIO.QualityIO import PairedFastaQualIterator
    from Bio import SeqIO
    import re
    # Captures "length=" and whatever follows the digits as two groups.
    re_pattern = re.compile(r'(length=)\d+(.*)')
    # NOTE(review): the two input handles opened here are not closed in the
    # visible code.
    for seq_qual_record in PairedFastaQualIterator(open(args.fastafilename),
                                                   open(args.qualityfilename)):
        qual_list = seq_qual_record.letter_annotations['phred_quality']
        # lhs/rhs are used below as slice bounds into qual_list.
        lhs, rhs = find_chop_position(window_qual(qual_list, args.window_size),
                                      args.quality_threshold)
        if args.debug:
            print('{0},{1},{2}'.format(seq_qual_record.id, lhs, rhs),
                  file=sys.stderr)

        # lhs equal to the read length: the record is dropped entirely.
        if lhs == len(qual_list):
            if args.debug:
                print('Sequence ' + seq_qual_record.id + ' is abandoned',
                      file=sys.stderr)
            continue
        # Otherwise, if either bound moved, keep only the [lhs:rhs] window.
        elif lhs != 0 or rhs != len(qual_list):
            new_qual = qual_list[lhs:rhs]
            description = seq_qual_record.description
예제 #12
0
# In[2]:

print(Bio.__version__)

# In[3]:

# The answers to first 6 questions will be written to a file called summary.txt in appending mode
# So we need to delete the existing file at the beginning.
if os.path.isfile("./summary.txt"):
    os.remove("./summary.txt")

# In[4]:

# Read the fna and qual files into an SeqRecord iterator provided by the BioPython package.
# The iterator is consumed inside the `with` block so both input files are
# closed as soon as the records are in memory.
with open("test.fna") as fna_handle, open("test.qual") as qual_handle:
    paired_fasta_qual_iterator = PairedFastaQualIterator(fna_handle,
                                                         qual_handle)

    # The list of SeqRecord object will be used throughout this script.
    paired_fasta_qual_list = list(paired_fasta_qual_iterator)

# In[5]:

# Question 01: Total number of reads in the original dataset
tally_register_01 = len(paired_fasta_qual_list)
print(tally_register_01)

current_output_text = "Total number of reads in the original dataset: " + str(
    tally_register_01) + "\n"
# Append to the summary; the file is closed even if the write fails.
with open("summary.txt", "a+t") as summary_output_file:
    summary_output_file.write(current_output_text)
예제 #13
0
def fastaqual_to_fastq(fastafile, qualfile, title2ids=None):
    """Return the PairedFastaQualIterator built from a FASTA and a QUAL input.

    ``title2ids`` is forwarded unchanged as a keyword argument.
    """
    return PairedFastaQualIterator(fastafile, qualfile, title2ids=title2ids)
예제 #14
0
#!/usr/bin/env python
#this script convert fna/qual file into fastq file
#usage: python 454_to_fastq.py sample.fna sample.qual

import sys

from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator
import os

# Derive the output name from the input file name only, so dots in the
# directory part are left alone. Inner dots of the file name are still
# replaced by "_" and the last extension becomes ".fastq"; a name without
# any extension is kept whole.
in_dir, in_name = os.path.split(sys.argv[1])
spl = in_name.split('.')
stem = '_'.join(spl[:-1]) if len(spl) > 1 else in_name
filename = os.path.join(in_dir, stem + '.fastq')

# All three handles are closed when the block exits.
with open(sys.argv[1]) as fna_handle, open(sys.argv[2]) as qual_handle, \
        open(filename, "w") as handle:  #w=write
    records = PairedFastaQualIterator(fna_handle, qual_handle)
    count = SeqIO.write(records, handle, "fastq")
print("Converted %i records" % count)
예제 #15
0
    # Wrap the primer and adaptor given on the command line as Seq objects.
    primer = Seq(args.primer)
    adaptor = Seq(args.adaptor)
    web_access = args.web  # set to false to only process existing blast results
    blast_folder = "blast"
    if not os.path.exists(blast_folder): os.mkdir(blast_folder)
    # Names of the numbered output files.
    blast_result_file = "1.blast_m8.txt"
    filter_trim_file = "2.filter_trim.fna"
    primer_adaptor_file = "3.primer_adaptor_loc.txt"

    # merge fna and qual files and write them into one fastq file if it is not found, otherwise directly parse the fastq
    if not os.path.exists(
            fastq
    ):  # pair sequence and quality files into one fastq file, skip if available
        t0 = timeit.default_timer()
        with open(fna) as f_handle, open(qual) as q_handle:
            records = PairedFastaQualIterator(f_handle, q_handle)
            # `fastq` is passed to SeqIO.write as given (no handle is opened
            # here); the same value is handed to SeqIO.parse below.
            count = SeqIO.write(records, fastq, "fastq")
        print(
            f'{count:,} entries were written to {fastq} in {timeit.default_timer()-t0:.2f} seconds.'
        )
    fq = SeqIO.parse(
        fastq, "fastq"
    )  # once the fastq is generated, this step directly parse the fastq file

    # set counters to zeros, initialize filewriters
    c1, c2, c3, c4, c5, c6, c7 = 0, 0, 0, 0, 0, 0, 0
    fb, cb = open(blast_result_file, 'w'), 0
    fb.write('\t'.join([
        "query", "subject", "%id", "alignment_length", "mismatches",
        "gap_openings", "query_start", "query_end", "subject_start",
        "subject_end", "E_value", "bit_score"
예제 #16
0
#!/usr/bin/env python
"""
Convert FASTA + QUAL file pairs to a single FASTQ file
http://seqanswers.com/forums/showthread.php?t=16925

You can use this script from the shell like this::
$ ./fasta_to_fastaq reads.fna reads.qual reads.fastq
"""

# The libraries we need #
import sys, os
from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator
# Get the shell arguments #
fa_path = sys.argv[1]
qa_path = sys.argv[2]
fq_path = sys.argv[3]
# Check that the paths are valid #
if not os.path.exists(fa_path): raise Exception("No file at %s." % fa_path)
if not os.path.exists(qa_path): raise Exception("No file at %s." % qa_path)
# Do it #
# The input handles are managed by the same `with` as the output so that
# all three are closed when the conversion finishes or fails.
with open(fq_path, "w") as handle, \
        open(fa_path) as fa_handle, \
        open(qa_path) as qa_handle:
    records = PairedFastaQualIterator(fa_handle, qa_handle)
    count = SeqIO.write(records, handle, "fastq")
# Report success #
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Converted %i records" % count)
	parser = argparse.ArgumentParser()
	parser.add_argument("-f", "--inputfasta", type = str, help = "Input Fasta File")
	parser.add_argument("-q", "--inputqual", type = str, help = "Input Qual File")
	parser.add_argument("-o", "--outputprefix", type = str, help = "Prefix to Output FastQ File")
	argsDict = vars(parser.parse_args())

	fasta = argsDict["inputfasta"]
	qual = argsDict["inputqual"]
	prefix = argsDict["outputprefix"]

	# Assertions for Required Input
	assert (fasta is not None), "No Fasta input provided!"
	assert (qual is not None), "No Qual input provided!"

	#  If No Prefix, Use Same as FASTA
	if prefix is None: 
		prefix = ".".join(fasta.split(".")[0:-1])
		
	# 
	# Conversion
	#

	# Merge Fasta & Qual into FastQ
	records = PairedFastaQualIterator(open(fasta), open(qual))
	SeqIO.write(records, prefix + ".fastq", "fastq")

else:

	pass