Beispiel #1
0
def assignSequences(sets, fileInfo):
    """Distribute the reads of one (optionally paired) input over A/B/AB/U outputs.

    Every record read from fileInfo.inR1 is routed by its identifier: to the
    A outputs if the identifier is in sets[0], otherwise to B if it is in
    sets[1], otherwise to AB if it is in sets[2], otherwise to the U outputs.
    When fileInfo.inR2 is given, the mate record is read in lockstep and
    written to the matching R2 output. For each record a tab-separated line
    "<destination> <identifier>" is written to stdout, with destination being
    'A', 'B', '+' (for AB) or 'U'.

    Parameters:
        sets: indexable holding three containers of identifiers, in the
            order A, B, AB.
        fileInfo: object providing input_format ('fq' or 'fa'), gz_output,
            bz2_output, inR1, inR2 (None for single-end data) and the output
            paths outAR1/outBR1/outABR1/outUR1 (plus the *R2 counterparts
            when inR2 is set).

    Raises:
        AssertionError: if input_format is neither 'fq' nor 'fa', or if the
            identifiers of an R1/R2 record pair differ.

    Processing stops at the end of R1, or at the end of R2 if that comes
    first. Errors raised by the readers are propagated instead of being
    mistaken for end of input, and all outputs are closed even on failure.
    """
    assert fileInfo.input_format in ('fq', 'fa')
    if fileInfo.input_format == 'fq':
        getID, getSeqs, outfmt = KTIO.getFastqIdentifier, KTIO.readFastq, '%s\n%s\n+\n%s\n'
    else:
        getID, getSeqs, outfmt = KTIO.getFastaIdentifier, KTIO.readFasta, '%s\n%s\n'

    if fileInfo.gz_output:
        ffmt = 'gz'
    elif fileInfo.bz2_output:
        ffmt = 'bz2'
    else:
        ffmt = None

    # Every handle is registered here so the finally-clause can close
    # whatever was opened, even if a later open or a write fails.
    opened = []

    def openOutput(path):
        handle = KTIO.openFile(path, fmt=ffmt, mode='wb')
        opened.append(handle)
        return handle

    # Unique marker for "R2 is exhausted"; cannot collide with a real record.
    endOfInput = object()

    try:
        R1gen, R2gen = getSeqs(fileInfo.inR1), None
        R1A, R1B, R1AB, R1U = [
            openOutput(path)
            for path in (fileInfo.outAR1, fileInfo.outBR1, fileInfo.outABR1, fileInfo.outUR1)
        ]
        R2A, R2B, R2AB, R2U = None, None, None, None

        if fileInfo.inR2 is not None:
            R2gen = getSeqs(fileInfo.inR2)
            R2A, R2B, R2AB, R2U = [
                openOutput(path)
                for path in (fileInfo.outAR2, fileInfo.outBR2, fileInfo.outABR2, fileInfo.outUR2)
            ]

        for R1rec in R1gen:
            fxid1, fxid2 = getID(R1rec[0]), None

            if R2gen is not None:
                R2rec = next(R2gen, endOfInput)
                if R2rec is endOfInput:
                    # R2 is shorter than R1: stop, as the original did.
                    break
                fxid2 = getID(R2rec[0])

            assert fxid1 == fxid2 or fxid2 is None

            # set order is A, B, AB
            if fxid1 in sets[0]:
                dest, destid = (R1A, R2A), 'A'
            elif fxid1 in sets[1]:
                dest, destid = (R1B, R2B), 'B'
            elif fxid1 in sets[2]:
                dest, destid = (R1AB, R2AB), '+'
            else:
                dest, destid = (R1U, R2U), 'U'

            sys.stdout.write('\t'.join([destid, fxid1]) + '\n')

            dest[0].write(outfmt % R1rec)
            if dest[1] is not None:
                dest[1].write(outfmt % R2rec)
    finally:
        # An explicit loop: map() is lazy on Python 3 and would close nothing.
        for handle in opened:
            handle.close()
Beispiel #2
0
def extractSequences_obsolete(keepSequences, fileInfo):
    """Copy the records whose identifier is in keepSequences to the output file(s).

    Records are read from fileInfo.inR1 and, if both fileInfo.inR2 and
    fileInfo.outR2 are set, in lockstep from fileInfo.inR2. A record (pair)
    is written to fileInfo.outR1 (and fileInfo.outR2) when the R1 identifier
    is contained in keepSequences; all other records are skipped.

    Parameters:
        keepSequences: container of identifiers to keep (tested with `in`).
        fileInfo: object providing input_format ('fq' or 'fa'), gz_output,
            bz2_output, inR1, outR1, inR2 and outR2.

    Raises:
        AssertionError: if input_format is neither 'fq' nor 'fa', or if the
            identifiers of an R1/R2 record pair differ.

    Processing stops at the end of R1, or at the end of R2 if that comes
    first. Reader errors are propagated and outputs are always closed.
    """
    assert fileInfo.input_format in ('fq', 'fa')
    if fileInfo.input_format == 'fq':
        getID, getSeqs, nlines = KTIO.getFastqIdentifier, KTIO.readFastq, 4
    else:
        getID, getSeqs, nlines = KTIO.getFastaIdentifier, KTIO.readFasta, 2

    if fileInfo.gz_output:
        ffmt = 'gz'
    elif fileInfo.bz2_output:
        ffmt = 'bz2'
    else:
        ffmt = None

    # One '%s' line per record field, built once instead of on every write.
    # NOTE(review): this needs records with exactly `nlines` fields; confirm
    # that KTIO.readFastq yields 4-field records here.
    outfmt = '%s\n' * nlines

    # Unique marker for "R2 is exhausted"; cannot collide with a real record.
    endOfInput = object()

    opened = []
    try:
        fwdOut = KTIO.openFile(fileInfo.outR1, mode='wb', fmt=ffmt)
        opened.append(fwdOut)
        fwdGen = getSeqs(fileInfo.inR1)
        revOut, revGen = None, None

        # The R2 paths are taken from fileInfo like every other setting; the
        # previous code read them from an undefined global named `args`.
        if fileInfo.outR2 is not None and fileInfo.inR2 is not None:
            revOut = KTIO.openFile(fileInfo.outR2, mode='wb', fmt=ffmt)
            opened.append(revOut)
            revGen = getSeqs(fileInfo.inR2)

        fxid2 = None
        for fwdRecord in fwdGen:
            fxid1 = getID(fwdRecord[0])
            if revGen is not None:
                revRecord = next(revGen, endOfInput)
                if revRecord is endOfInput:
                    # R2 is shorter than R1: stop, as the original did.
                    break
                fxid2 = getID(revRecord[0])

            assert fxid1 == fxid2 or fxid2 is None

            if fxid1 in keepSequences:
                fwdOut.write(outfmt % fwdRecord)
                if revOut is not None:
                    revOut.write(outfmt % revRecord)
    finally:
        for handle in opened:
            handle.close()
Beispiel #3
0
def filterSequences(db, f_inputClassification, keepTaxIDs, allowUnclassified=False, logfile=None):
    """Select sequence identifiers from a classification file.

    Each non-blank line is split on whitespace; field 0 is the status
    ('C' or 'U'), field 1 the sequence identifier and field 2 the taxonomy
    id. A sequence is kept if it is classified ('C') with a taxonomy id
    contained in keepTaxIDs, or if it is unclassified ('U') and
    allowUnclassified is set.

    Parameters:
        db: not used by this function; kept for interface compatibility.
        f_inputClassification: path of the classification file, opened via
            KTIO.openFile.
        keepTaxIDs: container of integer taxonomy ids to keep.
        allowUnclassified: whether lines marked 'U' are kept as well.
        logfile: optional writable object receiving progress messages.

    Returns:
        (keepSequences, dropSequences): two sets of sequence identifiers.
        dropSequences is only filled when allowUnclassified is True.

    Raises:
        AssertionError: if keepTaxIDs is empty and allowUnclassified is False.
    """
    if logfile is not None:
        logfile.write('Filtering sequences...\n')
        logfile.flush()
    assert keepTaxIDs or allowUnclassified
    nseqs = 0
    keepSequences = set()
    # need to keep track of all dropped sequences in case we want unclassified but used the
    # --only-classified-output switch when running kraken (unclassified sequences are unmarked
    # and need to be distinguished from unwanted sequences.)
    dropSequences = set()

    with KTIO.openFile(f_inputClassification) as fi:
        for line in fi:
            fields = line.strip().split()
            if not fields:
                # Blank lines carry no record; skipping them avoids an IndexError.
                continue
            nseqs += 1

            takeClassified = fields[0] == 'C' and int(fields[2]) in keepTaxIDs
            takeUnclassified = allowUnclassified and fields[0] == 'U'

            if takeUnclassified or takeClassified:
                keepSequences.add(fields[1])
            elif allowUnclassified:
                # we want unclassified, but current line was classified and rejected
                dropSequences.add(fields[1])

    if logfile is not None:
        # An empty classification file must not raise ZeroDivisionError here.
        total = float(nseqs) if nseqs else 1.0
        logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, len(keepSequences) / total))
        logfile.write('Dropping %i of %i sequences (%.1f).\n' % (len(dropSequences), nseqs, len(dropSequences) / total))
        logfile.flush()

    return keepSequences, dropSequences
Beispiel #4
0
import sys
import csv

from ktoolu_io import readFastq, openFile

# Writes the read pairs of two fastq files for which neither mate is listed
# as mapped. Call with samtools view -f2 output on stdin (or with the records
# of a bamfile that only holds -f2-mapping reads); the first tab-separated
# column of every stdin line is taken as the name of a mapped read.
#
# argv[1] / argv[2]: R1 / R2 input files, named *.fq.gz. The outputs are
# written next to them with '.fq.gz' replaced by '.unmapped_iwgsc10.fq.gz'.
if len(sys.argv) < 3:
    sys.exit('usage: {} <R1.fq.gz> <R2.fq.gz> < mapped_reads.sam'.format(sys.argv[0]))

inputPaths = sys.argv[1:3]
outputPaths = [path.replace('.fq.gz', '.unmapped_iwgsc10.fq.gz') for path in inputPaths]
for inputPath, outputPath in zip(inputPaths, outputPaths):
    if outputPath == inputPath:
        # Without the expected suffix the replace is a no-op, and opening the
        # "output" for writing would truncate the input file.
        sys.exit("input file '{}' does not contain '.fq.gz'; refusing to overwrite it".format(inputPath))

mappedReads = {line.strip().split('\t')[0] for line in sys.stdin}

fq1 = readFastq(inputPaths[0])
fq2 = readFastq(inputPaths[1])

with openFile(outputPaths[0], fmt='gz', mode='wt') as fo1, \
        openFile(outputPaths[1], fmt='gz', mode='wt') as fo2:
    for _id1, _seq1, _qual1 in fq1:
        mate = next(fq2, None)
        if mate is None:
            # R2 ran out before R1: report it instead of dying with a bare
            # StopIteration traceback.
            sys.exit("'{}' has fewer records than '{}'".format(inputPaths[1], inputPaths[0]))
        _id2, _seq2, _qual2 = mate

        # The identifiers are compared without their first character
        # (presumably the fastq '@' marker - confirm against readFastq).
        if _id1[1:] not in mappedReads and _id2[1:] not in mappedReads:
            fo1.write('{}\n{}\n+\n{}\n'.format(_id1, _seq1, _qual1))
            fo2.write('{}\n{}\n+\n{}\n'.format(_id2, _seq2, _qual2))
Beispiel #5
0
def readClassification(fn):
    """Return the first two fields of every classified line in file fn.

    Lines whose stripped content starts with 'C' are split on whitespace
    and their first two fields are collected.

    Parameters:
        fn: path of the classification file, opened via KTIO.openFile.

    Returns:
        A set of tuples holding the first two whitespace-separated fields
        of each matching line.
    """
    with KTIO.openFile(fn) as fi:
        # Lists are unhashable, so each slice must become a tuple before it
        # can be stored in a set.
        return {
            tuple(line.strip().split()[:2])
            for line in fi
            if line.strip().startswith('C')
        }