def assignSequences(sets, fileInfo):
    """Route every input read (pair) to one of four outputs by set membership.

    Records are read from ``fileInfo.inR1`` and, when ``fileInfo.inR2`` is not
    None, in lockstep from ``fileInfo.inR2``.  The identifier of the R1 record
    decides the destination:

    * in ``sets[0]`` -> ``outAR1`` / ``outAR2``   (label ``A``)
    * in ``sets[1]`` -> ``outBR1`` / ``outBR2``   (label ``B``)
    * in ``sets[2]`` -> ``outABR1`` / ``outABR2`` (label ``+``)
    * otherwise      -> ``outUR1`` / ``outUR2``   (label ``U``)

    For every record one tab-separated line ``label, identifier`` is written
    to stdout.

    Args:
        sets: indexable of three containers of identifiers, in the order
            A, B, AB.
        fileInfo: object providing ``input_format`` ('fq' or 'fa'),
            ``gz_output``, ``bz2_output``, ``inR1``, ``inR2`` and the eight
            ``out{A,B,AB,U}R{1,2}`` paths (the R2 paths are only read when
            ``inR2`` is not None).

    Raises:
        AssertionError: if the input format is unsupported or the identifiers
            of a read pair differ.
    """
    assert fileInfo.input_format in ('fq', 'fa')
    if fileInfo.input_format == 'fq':
        getID, getSeqs, outfmt = KTIO.getFastqIdentifier, KTIO.readFastq, '%s\n%s\n+\n%s\n'
    else:
        getID, getSeqs, outfmt = KTIO.getFastaIdentifier, KTIO.readFasta, '%s\n%s\n'

    if fileInfo.gz_output:
        ffmt = 'gz'
    elif fileInfo.bz2_output:
        ffmt = 'bz2'
    else:
        ffmt = None

    # Every output handle is registered here so that the finally-clause can
    # close all of them, even when an error interrupts the run.
    opened = []

    def openOutput(path):
        handle = KTIO.openFile(path, fmt=ffmt, mode='wb')
        opened.append(handle)
        return handle

    try:
        R1gen, R2gen = getSeqs(fileInfo.inR1), None
        R1A, R1B, R1AB, R1U = [
            openOutput(path)
            for path in (fileInfo.outAR1, fileInfo.outBR1, fileInfo.outABR1, fileInfo.outUR1)
        ]
        R2A, R2B, R2AB, R2U = None, None, None, None
        if fileInfo.inR2 is not None:
            R2gen = getSeqs(fileInfo.inR2)
            R2A, R2B, R2AB, R2U = [
                openOutput(path)
                for path in (fileInfo.outAR2, fileInfo.outBR2, fileInfo.outABR2, fileInfo.outUR2)
            ]

        while True:
            # Only the end of the input stops the loop; read/parse errors
            # propagate instead of being mistaken for end-of-file.
            try:
                R1rec = next(R1gen)
            except StopIteration:
                break
            fxid1, fxid2 = getID(R1rec[0]), None
            if R2gen is not None:
                try:
                    R2rec = next(R2gen)
                except StopIteration:
                    break
                fxid2 = getID(R2rec[0])
            assert fxid1 == fxid2 or fxid2 is None

            # set order is A, B, AB
            if fxid1 in sets[0]:
                dest, destid = (R1A, R2A), 'A'
            elif fxid1 in sets[1]:
                dest, destid = (R1B, R2B), 'B'
            elif fxid1 in sets[2]:
                dest, destid = (R1AB, R2AB), '+'
            else:
                dest, destid = (R1U, R2U), 'U'

            sys.stdout.write('\t'.join([destid, fxid1]) + '\n')
            dest[0].write(outfmt % R1rec)
            if dest[1] is not None:
                dest[1].write(outfmt % R2rec)
    finally:
        # An explicit loop (not a lazy map()) so the handles are really closed
        # on Python 3 as well.
        for handle in opened:
            handle.close()
def extractSequences_obsolete(keepSequences, fileInfo):
    """Copy the reads whose identifier is in ``keepSequences`` to the output.

    Records are read from ``fileInfo.inR1`` and written to ``fileInfo.outR1``.
    When both ``fileInfo.inR2`` and ``fileInfo.outR2`` are set, the mate file
    is read in lockstep and the mates of kept reads go to ``fileInfo.outR2``.

    Args:
        keepSequences: container of identifiers to keep.
        fileInfo: object providing ``input_format`` ('fq' or 'fa'),
            ``gz_output``, ``bz2_output``, ``inR1``, ``outR1``, ``inR2`` and
            ``outR2``.

    Raises:
        AssertionError: if the input format is unsupported or the identifiers
            of a read pair differ.
    """
    assert fileInfo.input_format in ('fq', 'fa')
    # Same record layout as assignSequences(): FASTQ records carry three
    # fields and the '+' separator line is re-inserted on output.
    # NOTE(review): assumes KTIO.readFastq yields 3-tuples -- confirm.
    if fileInfo.input_format == 'fq':
        getID, getSeqs, outfmt = KTIO.getFastqIdentifier, KTIO.readFastq, '%s\n%s\n+\n%s\n'
    else:
        getID, getSeqs, outfmt = KTIO.getFastaIdentifier, KTIO.readFasta, '%s\n%s\n'

    if fileInfo.gz_output:
        ffmt = 'gz'
    elif fileInfo.bz2_output:
        ffmt = 'bz2'
    else:
        ffmt = None

    fwdOut, revOut = None, None
    try:
        fwdOut, fwdGen = KTIO.openFile(fileInfo.outR1, mode='wb', fmt=ffmt), getSeqs(fileInfo.inR1)
        revGen = None
        # The mate paths are taken from fileInfo, like the R1 paths, rather
        # than from a module-level `args` that this function never receives.
        if fileInfo.outR2 is not None and fileInfo.inR2 is not None:
            revOut, revGen = KTIO.openFile(fileInfo.outR2, mode='wb', fmt=ffmt), getSeqs(fileInfo.inR2)

        fxid2 = None
        while True:
            # Only the end of the input stops the loop; other errors propagate.
            try:
                fwdRecord = next(fwdGen)
            except StopIteration:
                break
            fxid1 = getID(fwdRecord[0])
            if revGen is not None:
                try:
                    revRecord = next(revGen)
                except StopIteration:
                    break
                fxid2 = getID(revRecord[0])
            assert fxid1 == fxid2 or fxid2 is None

            if fxid1 in keepSequences:
                fwdOut.write(outfmt % fwdRecord)
                if revOut is not None:
                    revOut.write(outfmt % revRecord)
    finally:
        # Close whatever was opened, also when an error interrupts the run.
        if fwdOut is not None:
            fwdOut.close()
        if revOut is not None:
            revOut.close()
def filterSequences(db, f_inputClassification, keepTaxIDs, allowUnclassified=False, logfile=None):
    """Select sequence ids from a classification file by taxonomy id.

    Each non-blank line is split on whitespace; field 1 is the status
    ('C' or 'U'), field 2 the sequence id and, for 'C' lines, field 3 the
    taxonomy id (parsed as int).

    A sequence is kept when it is classified with a taxonomy id contained in
    ``keepTaxIDs``, or when it is unclassified and ``allowUnclassified`` is
    true.

    Args:
        db: not used by this function.
        f_inputClassification: path of the classification file, opened with
            ``KTIO.openFile``.
        keepTaxIDs: container of integer taxonomy ids to keep.
        allowUnclassified: also keep lines whose status is 'U'.
        logfile: optional writable object that receives progress messages.

    Returns:
        Tuple ``(keepSequences, dropSequences)`` of sets of sequence ids.
        ``dropSequences`` is only filled when ``allowUnclassified`` is true.

    Raises:
        AssertionError: if ``keepTaxIDs`` is empty and ``allowUnclassified``
            is false (nothing could ever be selected).
    """
    if logfile is not None:
        logfile.write('Filtering sequences...\n')
        logfile.flush()
    assert keepTaxIDs or allowUnclassified

    nseqs = 0
    keepSequences = set()
    # need to keep track of all dropped sequences in case we want unclassified
    # but used the --only-classified-output switch when running kraken
    # (unclassified sequences are unmarked and need to be distinguished from
    # unwanted sequences.)
    dropSequences = set()

    with KTIO.openFile(f_inputClassification) as fi:
        for line in fi:
            fields = line.strip().split()
            if not fields:
                # Blank lines carry no record; skip them instead of failing
                # on the missing status field.
                continue
            nseqs += 1
            status, seqid = fields[0], fields[1]
            # int() is only evaluated for classified lines (short-circuit).
            takeClassified = status == 'C' and int(fields[2]) in keepTaxIDs
            takeUnclassified = allowUnclassified and status == 'U'
            if takeUnclassified or takeClassified:
                keepSequences.add(seqid)
            elif allowUnclassified:
                # we want unclassified, but current line was classified and rejected
                dropSequences.add(seqid)

    if logfile is not None:
        # Guard the ratios so an empty classification file does not end in a
        # ZeroDivisionError while logging.
        keptFraction = float(len(keepSequences)) / nseqs if nseqs else 0.0
        droppedFraction = float(len(dropSequences)) / nseqs if nseqs else 0.0
        logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, keptFraction))
        logfile.write('Dropping %i of %i sequences (%.1f).\n' % (len(dropSequences), nseqs, droppedFraction))
        logfile.flush()
    return keepSequences, dropSequences
import sys
import csv
from ktoolu_io import readFastq, openFile

# Write the read pairs of two FASTQ files (sys.argv[1], sys.argv[2]) in which
# neither mate is listed on stdin.
#
# stdin: call with samtools view -f2 or on bamfile with only -f2-mapping reads;
# the first tab-separated field of every line is taken as a mapped read name.
#
# Output goes next to the inputs, with '.fq.gz' replaced by
# '.unmapped_iwgsc10.fq.gz'.

OUTPUT_SUFFIX = '.unmapped_iwgsc10.fq.gz'

out1 = sys.argv[1].replace('.fq.gz', OUTPUT_SUFFIX)
out2 = sys.argv[2].replace('.fq.gz', OUTPUT_SUFFIX)
if out1 == sys.argv[1] or out2 == sys.argv[2]:
    # Without the '.fq.gz' suffix the replacement is a no-op and the output
    # would be written over the input file.
    sys.exit('Input file names must contain ".fq.gz"; refusing to overwrite the input.')

mappedReads = set(line.strip().split('\t')[0] for line in sys.stdin)

fq1 = readFastq(sys.argv[1])
fq2 = readFastq(sys.argv[2])

with openFile(out1, fmt='gz', mode='wt') as fo1, openFile(out2, fmt='gz', mode='wt') as fo2:
    while 1:
        # Only the end of the first file ends the loop; read errors propagate
        # instead of silently truncating the output.
        try:
            _id1, _seq1, _qual1 = next(fq1)
        except StopIteration:
            break
        try:
            _id2, _seq2, _qual2 = next(fq2)
        except StopIteration:
            raise ValueError('%s contains fewer records than %s' % (sys.argv[2], sys.argv[1]))
        # The first character of the header is dropped before the lookup
        # (presumably the leading '@' -- verify against readFastq).
        if _id1[1:] not in mappedReads and _id2[1:] not in mappedReads:
            fo1.write('{}\n{}\n+\n{}\n'.format(_id1, _seq1, _qual1))
            fo2.write('{}\n{}\n+\n{}\n'.format(_id2, _seq2, _qual2))
def readClassification(fn):
    """Return the first two fields of every classified line of ``fn``.

    A line counts as classified when, after stripping surrounding whitespace,
    it starts with 'C'.  Each such line is split on whitespace and its first
    two fields are stored as a tuple.

    Args:
        fn: path of the classification file, opened with ``KTIO.openFile``.

    Returns:
        Set of tuples holding the first two fields of each classified line.
    """
    with KTIO.openFile(fn) as fi:
        stripped = (line.strip() for line in fi)
        # Tuples, not lists: list slices are unhashable and cannot be stored
        # in a set (the set() call would raise TypeError).
        return set(tuple(line.split()[:2]) for line in stripped if line.startswith('C'))