def start_trim(f, lock, conn, args):
    '''Trim one fastq file and send the surviving reads through conn.

    Each record read from ``f`` is passed through ``filter_adaptor`` and then
    ``trim_qual``. Records that come back with a title that is not None are
    sent on ``conn`` as four-line fastq text. After the last record the
    string 'Stop' is sent and ``conn`` is closed.

    Args:
        f: path of the input file; opened with gzip when ``args.gz`` is true.
        lock: not used by this function; kept so the signature is unchanged.
        conn: object providing ``send`` and ``close`` (presumably one end of
            a multiprocessing pipe -- confirm against the caller).
        args: options object; the attributes read here are ``gz``,
            ``adaptors``, ``min_adaptor_match``, ``min_length``,
            ``min_baseq`` and ``min_avgq``.

    Returns:
        Tuple ``(written, total)``: number of records sent on ``conn`` and
        number of records read from the file.

    Raises:
        ValueError: if ``genobox_modules.set_filetype`` does not report
            'fastq', or if a processed record has sequence and quality
            strings of different lengths.
    '''
    if args.gz:
        fh = gzip.open(f, 'rb')
    else:
        fh = open(f, 'r')

    # try/finally guarantees the input handle is released on every exit
    # path, including the ValueErrors raised below.
    try:
        # set adaptors #
        adaptors = set_adaptors(args.adaptors, args.min_adaptor_match)

        # set fqtype #
        filetype = genobox_modules.set_filetype(f, args.gz)
        if filetype != 'fastq':
            raise ValueError('Input not fastq\n')
        fqtype = genobox_modules.set_fqtype(f, args.gz)

        # start trimming file #
        total = 0
        written = 0
        for (title, sequence, quality) in FastqGeneralIterator(fh):
            total += 1
            (title, sequence, quality) = filter_adaptor(
                adaptors, args.min_adaptor_match, args.min_length,
                title, sequence, quality)
            (title, sequence, quality) = trim_qual(
                args.min_baseq, args.min_avgq, args.min_length, fqtype,
                title, sequence, quality)

            # Only records whose title is still set after both steps are
            # written out.
            if title is not None:
                if len(sequence) != len(quality):
                    raise ValueError(
                        'sequence and quality not of the same length\n%s\n%s\n'
                        % (sequence, quality))
                written += 1
                conn.send('@%s\n%s\n+\n%s\n' % (title, sequence, quality))
    finally:
        fh.close()

    # 'Stop' is sent as the final message before the connection is closed.
    conn.send('Stop')
    conn.close()
    return written, total
def check_formats_fq(i, gz, bwa6):
    '''Verify that an input file is fastq and return its fastq type.

    Args:
        i: input file, passed to ``genobox_modules.set_filetype`` and
            ``genobox_modules.set_fqtype``.
        gz: passed through to both genobox_modules calls alongside ``i``.
        bwa6: not used by this function.

    Returns:
        The value returned by ``genobox_modules.set_fqtype(i, gz)``.

    Raises:
        ValueError: if the detected file type is not 'fastq'.
    '''
    import genobox_modules

    # Reject anything that is not fastq before asking for the fastq type.
    detected = genobox_modules.set_filetype(i, gz)
    if detected != 'fastq':
        raise ValueError('Input must be fastq')

    return genobox_modules.set_fqtype(i, gz)
def start_trim(f, lock, conn, args):
    '''Trim one fastq file and send the surviving reads through conn.

    Each record read from ``f`` is passed through ``filter_adaptor`` and then
    ``trim_qual``. Records that come back with a title that is not None are
    sent on ``conn`` as four-line fastq text. After the last record the
    string 'Stop' is sent and ``conn`` is closed.

    Args:
        f: path of the input file; opened with gzip when ``args.gz`` is true.
        lock: not used by this function; kept so the signature is unchanged.
        conn: object providing ``send`` and ``close`` (presumably one end of
            a multiprocessing pipe -- confirm against the caller).
        args: options object; the attributes read here are ``gz``,
            ``adaptors``, ``min_adaptor_match``, ``min_length``,
            ``min_baseq`` and ``min_avgq``.

    Returns:
        Tuple ``(written, total)``: number of records sent on ``conn`` and
        number of records read from the file.

    Raises:
        ValueError: if ``genobox_modules.set_filetype`` does not report
            'fastq', or if a processed record has sequence and quality
            strings of different lengths.
    '''
    if args.gz:
        fh = gzip.open(f, 'rb')
    else:
        fh = open(f, 'r')

    # try/finally guarantees the input handle is released on every exit
    # path, including the ValueErrors raised below.
    try:
        # set adaptors #
        adaptors = set_adaptors(args.adaptors, args.min_adaptor_match)

        # set fqtype #
        filetype = genobox_modules.set_filetype(f, args.gz)
        if filetype != 'fastq':
            raise ValueError('Input not fastq\n')
        fqtype = genobox_modules.set_fqtype(f, args.gz)

        # start trimming file #
        total = 0
        written = 0
        for (title, sequence, quality) in FastqGeneralIterator(fh):
            total += 1
            (title, sequence, quality) = filter_adaptor(
                adaptors, args.min_adaptor_match, args.min_length,
                title, sequence, quality)
            (title, sequence, quality) = trim_qual(
                args.min_baseq, args.min_avgq, args.min_length, fqtype,
                title, sequence, quality)

            # Only records whose title is still set after both steps are
            # written out.
            if title is not None:
                if len(sequence) != len(quality):
                    raise ValueError(
                        'sequence and quality not of the same length\n%s\n%s\n'
                        % (sequence, quality))
                written += 1
                conn.send('@%s\n%s\n+\n%s\n' % (title, sequence, quality))
    finally:
        fh.close()

    # 'Stop' is sent as the final message before the connection is closed.
    conn.send('Stop')
    conn.close()
    return written, total
def __init__(self, f, o, l=25, q=20, m=20, keep_n=False, M=20, a=None,
             gz=False):
    '''Store the trimming options and detect the input file format.

    Args:
        f: indexable collection of input files; ``f[0]`` is used for format
            detection.
        o: stored as ``self.o`` (presumably the output target -- confirm).
        l: stored as ``self.l`` (presumably minimum read length -- confirm).
        q: stored as ``self.q`` (presumably minimum base quality -- confirm).
        m: stored as ``self.m`` (presumably minimum average quality --
            confirm).
        keep_n: stored as ``self.keep_n``.
        M: stored as ``self.M`` (presumably minimum adaptor match length --
            confirm).
        a: list of adaptor sequences. When omitted (None), a fresh copy of
            the built-in adaptor list is used so instances never share one
            mutable default list.
        gz: stored as ``self.gz`` and passed to the genobox_modules format
            detection calls.
    '''
    if a is None:
        # Built per call: a list literal in the signature would be a single
        # object shared (and mutable) across every instance.
        a = [
            'GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
            'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT',
            'CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT',
            'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
        ]

    self.f = f
    self.o = o
    self.l = l
    self.q = q
    self.m = m
    self.keep_n = keep_n
    self.M = M
    self.a = a
    # Initial values assigned before set_readtype() runs (presumably that
    # method updates them -- confirm).
    self.paired = False
    self.interleaved = False
    self.gz = gz

    # set adaptors, readtype, fastq format
    # These calls run only after every option above has been stored on self.
    self.adaptors = self.set_adaptors()
    self.set_readtype()
    self.format = genobox_modules.set_filetype(self.f[0], self.gz)
    # NOTE: self.fqtype is only defined when the first input is fastq.
    if self.format == 'fastq':
        self.fqtype = genobox_modules.set_fqtype(self.f[0], self.gz)
def __init__(self, f, o, l=25, q=20, m=20, keep_n=False, M=20, a=None,
             gz=False):
    '''Store the trimming options and detect the input file format.

    Args:
        f: indexable collection of input files; ``f[0]`` is used for format
            detection.
        o: stored as ``self.o`` (presumably the output target -- confirm).
        l: stored as ``self.l`` (presumably minimum read length -- confirm).
        q: stored as ``self.q`` (presumably minimum base quality -- confirm).
        m: stored as ``self.m`` (presumably minimum average quality --
            confirm).
        keep_n: stored as ``self.keep_n``.
        M: stored as ``self.M`` (presumably minimum adaptor match length --
            confirm).
        a: list of adaptor sequences. When omitted (None), a fresh copy of
            the built-in adaptor list is used so instances never share one
            mutable default list.
        gz: stored as ``self.gz`` and passed to the genobox_modules format
            detection calls.
    '''
    if a is None:
        # Built per call: a list literal in the signature would be a single
        # object shared (and mutable) across every instance.
        a = [
            'GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
            'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT',
            'CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT',
            'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
        ]

    self.f = f
    self.o = o
    self.l = l
    self.q = q
    self.m = m
    self.keep_n = keep_n
    self.M = M
    self.a = a
    # Initial values assigned before set_readtype() runs (presumably that
    # method updates them -- confirm).
    self.paired = False
    self.interleaved = False
    self.gz = gz

    # set adaptors, readtype, fastq format
    # These calls run only after every option above has been stored on self.
    self.adaptors = self.set_adaptors()
    self.set_readtype()
    self.format = genobox_modules.set_filetype(self.f[0], self.gz)
    # NOTE: self.fqtype is only defined when the first input is fastq.
    if self.format == 'fastq':
        self.fqtype = genobox_modules.set_fqtype(self.f[0], self.gz)