Example #1
def extractUpstreamSeqs(cloneAnnot,
                        recordFile,
                        upstream,
                        upstreamFile,
                        stream=None):
    """
    extract the upstream DNA sequences and write them to a FASTA file named upstreamFile
    :param cloneAnnot:
                clone annotation DataFrame

    :param recordFile:
                raw record file (string)

    :param upstream:
                list of 2 numbers denoting [start, end], 1-indexed and inclusive. np.Inf is also allowed as the end value

    :param upstreamFile:
                output FASTA filename

    :param stream:
                logging stream object

    :return:
                None
    """
    printto(stream, "\tExtracting the upstream sequences ... ")

    # alignments with - strand
    revAlign = 0
    # num. seqs with trimmed beginning (vstart > 3)
    trimmedBegin = 0
    # num. seqs with sequences shorter than expected upstream length (len(seq) < expectLength)
    # @see expectLength
    trimmedUpstream = 0
    # excluded sequences because end <= 1
    noSeq = 0
    # num. processed sequences
    procSeqs = 0
    # buffer to hold sequences before flushing into file
    recordsBuffer = []
    # max buffer size allowed
    maxBufferSize = int(10.0 ** 5 / 2)  # keep it an int: int(10.0**5) / 2 yields a float in Python 3

    # expected upstream length = end - start + 1, where start and end are both 1-indexed
    expectLength = upstream[1] - upstream[0] + 1
    queryIds = cloneAnnot.index

    # NOTE: SeqIO.index can only index string filenames and it has to be unzipped
    _, ext = os.path.splitext(os.path.basename(recordFile.rstrip(os.path.sep)))
    records = SeqIO.index(gunzip(recordFile), ext.lstrip('.'))

    with open(upstreamFile, 'w') as fp:
        for id_ in queryIds:
            record = records[id_]
            qsRec = cloneAnnot.loc[record.id]
            if qsRec.strand != 'forward':
                revAlign += 1
                record.seq = record.seq.reverse_complement()
            if qsRec.vstart <= 3:
                end = qsRec.vqstart - upstream[0] - qsRec.vstart + 1
                if end <= 1:
                    noSeq += 1
                else:
                    start = max(1,
                                qsRec.vqstart - upstream[1] - qsRec.vstart + 1)
                    record.seq = record.seq[int(start - 1):int(end)]
                    if expectLength != Inf and len(record.seq) < expectLength:
                        trimmedUpstream += 1
                    record.id = record.id + _UPSTREAM_SEQ_FILE_SEP + qsRec.vgene
                    record.description = ""
                    recordsBuffer.append(record)
                    procSeqs += 1
                    if procSeqs % maxBufferSize == 0:
                        printto(
                            stream,
                            '{}/{} sequences have been processed ... '.format(
                                procSeqs, len(queryIds)))
                        SeqIO.write(recordsBuffer, fp, 'fasta')
                        recordsBuffer = []
            else:
                trimmedBegin += 1

        # flush remaining sequences
        if len(recordsBuffer) > 0:
            printto(
                stream, '{}/{} sequences have been processed ... '.format(
                    procSeqs, len(queryIds)))
            SeqIO.write(recordsBuffer, fp, 'fasta')

    if revAlign > 0:
        printto(
            stream, "\t\t\t{} sequences are in reversed alignment ... ".format(
                revAlign), LEVEL.INFO)

    if trimmedBegin > 0:
        printto(
            stream,
            "\t\t\tThe query sequence is not aligned within 3bp of the IGV start "
            "position ... {} found and excluded!".format(trimmedBegin),
            LEVEL.WARN)

    if trimmedUpstream > 0:
        printto(
            stream,
            "\t\t\tUpstream sequences shorter than the expected length are detected ... {} found"
            .format(trimmedUpstream), LEVEL.WARN)

    if noSeq > 0:
        printto(
            stream,
            "\t\t\tNo upstream sequence can be extracted (too short) for {} sequences."
            .format(noSeq), LEVEL.WARN)
    gc.collect()
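
A minimal usage sketch (not from the source): it assumes cloneAnnot is the annotation DataFrame indexed by query ID carrying the strand, vstart, vqstart, and vgene fields read above, and that the read file may be gzipped since gunzip is applied internally. The file names and the [1, 100] window are illustrative.

# Hypothetical call: extract positions 1..100 (1-indexed, inclusive) upstream of
# each clone's V germline gene; stream=None falls back to the default log stream.
extractUpstreamSeqs(cloneAnnot,               # DataFrame indexed by query ID
                    "sample_reads.fasta.gz",  # raw reads (unzipped internally)
                    [1, 100],                 # [start, end] upstream window
                    "sample_upstream.fasta")  # output FASTA file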
Example #2
def refineClonesAnnotation(outDir, sampleName, cloneAnnotOriginal, readFile, format,
                           actualQstart, chain, fr4cut,
                           trim5End, trim3End,
                           seqsPerFile, threads, stream=None):
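    """
    refine clone annotation and in-frame prediction in parallel using RefineWorker processes

    :return:
                (cloneAnnot, cloneSeqs) - two DataFrames indexed by 'queryid': the refined
                clone annotation (with a new 'filtered' column) and the FR/CDR sequences
    """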
    printto(stream, "Clone annotation and in-frame prediction are being refined ...")
    seqsPerFile = 100  # NOTE: overrides the seqsPerFile argument passed in
    cloneAnnot = cloneAnnotOriginal.copy()
    queryIds = cloneAnnot.index
    (refineFlagNames, refineFlagMsgs) = loadRefineFlagInfo()
    records = None
    workers = None
    try:
        # process clones from the FASTA/FASTQ file

        # NOTE:
        # if the readFile is gzipped, we need to unzip it in the same directory before passing into
        # SeqIO.index because it doesn't accept gzipped nor opened files
        records = SeqIO.index(gunzip(readFile), format)
        printto(stream, "\t " + format + " index created and refinement started ...")
        # Parallel implementation of the refinement
        noSeqs = len(queryIds)
        totalTasks = int(ceil(noSeqs * 1.0 / seqsPerFile))  # float division guards against Python 2 integer division
        tasks = Queue()
        exitQueue = Queue()
        resultsQueue = Queue()
        procCounter = ProcCounter(noSeqs, stream=stream)
        threads = min(threads, totalTasks)
        # Initialize workers
        workers = []
        for i in range(threads):
            w = RefineWorker(procCounter, chain, actualQstart, fr4cut,
                             trim5End, trim3End, refineFlagNames, stream=stream)
            w.tasksQueue = tasks
            w.exitQueue = exitQueue
            w.resultsQueue = resultsQueue
            workers.append(w)
            w.start()
            sys.stdout.flush()
        assert (totalTasks >= 1)
        # add jobs to the tasks queue in subsets of seqsPerFile query IDs
        for i in range(totalTasks):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            # materialize as lists: a lazy map cannot be pickled onto the queue in Python 3
            recs = [records[x] for x in ids]
            qsRecs = [cloneAnnot.loc[x].to_dict() for x in ids]
            tasks.put((recs, qsRecs))
        # Add a poison pill for each worker (the extra pills are a safety margin)
        for i in range(threads + 10):
            tasks.put(None)
        # Wait for all worker processes to terminate
        i = 0
        while i < threads:
            m = exitQueue.get()
            if m == "exit":
                i += 1
        printto(stream, "All workers have completed their tasks successfully.")
        # Collect results
        printto(stream, "Results are being collated from all workers ...")
        # End of parallel implementation
        sys.stdout.flush()

        # invoking the result collection method
        cloneAnnotList, transSeqs, flags, frameworkLengths = collectRefineResults(resultsQueue, totalTasks,
                                                                                  noSeqs, refineFlagNames,
                                                                                  stream=stream)
        printto(stream, "\tResults were collated successfully.")

        # mark each clone as filtered=yes if its framework len is not the most common among the same V/J germline gene
        printto(stream, "Filtering clones according to framework lengths ... ")
        # display the tallies of all frameworks based on V/J germline gene to log file
        for gene in frameworkLengths:
            printto(stream, "{}:".format(gene), LEVEL.INFO)
            for region, counts in frameworkLengths[gene].items():
                printto(stream, "\t{}: {}".format(region.upper(), str(counts)), LEVEL.INFO)

        # flag them if they're filtered based on FR len
        annotationFields = getAnnotationFields(chain)
        for clone in cloneAnnotList:
            flags['filterFRLength'] += markClones(clone, frameworkLengths, annotationFields)

        filtered = len(flags['filterFRLength'])
        printto(stream, "\t{:.2%} ({}/{}) clones were marked as filtered-out using Framework region 1, 2, 3 "
                        "and 4 lengths".format(filtered / cloneAnnot.shape[0], filtered, cloneAnnot.shape[0]))

        # print refine flags
        printRefineFlags(flags, records, refineFlagNames, refineFlagMsgs, stream=stream)
        printto(stream, "Flagged sequences are being written to an output file ... ")
        writeRefineFlags(flags, records, refineFlagNames, refineFlagMsgs,
                         outDir, sampleName)
    except Exception as e:
        printto(stream, "Something went wrong during the refinement process!", LEVEL.EXCEPT)
        raise e
    finally:
        if workers:
            for w in workers:
                w.terminate()
        if records:
            records.close()

    # Create new data frame of clone annotation
    # add new column for filtering based on FR region
    newColumns = getAnnotationFields(chain) + ['filtered']
    cloneAnnot = DataFrame(cloneAnnotList, columns=newColumns)
    cloneAnnot.set_index('queryid', inplace=True, drop=True)
    gc.collect()

    # Create data frame of FR and CDR sequences
    cols = ['queryid', 'germline', 'fr1', 'cdr1', 'fr2', 'cdr2', 'fr3', 'cdr3', 'fr4']
    cloneSeqs = DataFrame(transSeqs, columns=cols)
    for col in cols:
        cloneSeqs.loc[:, col] = cloneSeqs[col].map(str)
    cloneSeqs.set_index('queryid', inplace=True, drop=True)

    return cloneAnnot, cloneSeqs
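
A hedged invocation sketch with illustrative argument values; chain="hv" follows the IGHV naming used elsewhere in these examples but is an assumption:

# Hypothetical call; note that seqsPerFile is overridden to 100 inside the function.
cloneAnnot, cloneSeqs = refineClonesAnnotation(
    "outdir", "sample1",        # output directory and sample name
    cloneAnnotOriginal,         # clone annotation DataFrame indexed by query ID
    "reads.fastq.gz", "fastq",  # read file (may be gzipped) and its format
    actualQstart=0, chain="hv", fr4cut=True,
    trim5End=0, trim3End=0,
    seqsPerFile=100, threads=4)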
Example #3
def collectUpstreamSeqs(upstreamFile,
                        sampleName,
                        expectLength,
                        outResDir,
                        outAuxDir,
                        startCodon=True,
                        type='secsig',
                        plotDist=True,
                        stream=None):
    """
    segregates and plots the sequences in an upstream file. Sequences are segregated into sequences with
    no start codon, faulty sequences (a stop codon after translation when type == 'secsig', or X/N
    nucleotides in the sequence), and valid sequences.

    :param upstreamFile: string
                        upstream FASTA file

    :param sampleName: string
                        name of sample

    :param expectLength: tuple or list
                        index-able of length 2 denoting start and end

    :param outResDir: string
                        name of result output directory

    :param outAuxDir: string
                        name of auxiliary output directory

    :param startCodon: bool
                        whether or not to care about start codons during segregation

    :param type: string
                        either 'secsig' or '5utr'

    :param plotDist: bool
                        whether or not to also save a txt and png file denoting the distribution of segregated sequences

    :param stream: stream
                        debugging stream

    :return: tuple
                        (ighvValidSignals: dict, faultySeqs: dict, noStartCodonSeqs: dict)
    """
    if type not in ['secsig', '5utr']:
        raise ValueError(
            "Unknown parameter type={}, expected one of 'secsig', '5utr'".
            format(type))

    printto(
        stream,
        "\tSequences between {} and {} are being extracted ... ".format(
            expectLength[0], expectLength[1]))

    START_CODON = "ATG"

    # valid sequences
    ighvSignals = defaultdict(list)
    ighvSignalsCounts = defaultdict(int)

    # no start codons
    ighvSignalsNoATG = defaultdict(list)
    noStartCodonCounts = defaultdict(int)

    # faulty translations
    faultyTrans = defaultdict(list)
    faultyTransCounts = defaultdict(int)

    ignoredSeqs = 0

    records = SeqIO.index(gunzip(upstreamFile), 'fasta')
    for id_ in records:
        rec = records[id_]
        ighv = rec.id.split(_UPSTREAM_SEQ_FILE_SEP)[1]
        seq = rec.seq
        if expectLength[0] <= len(rec) <= expectLength[1]:
            if not startCodon or START_CODON in seq:

                if type == 'secsig':
                    seq = seq[:len(seq) -
                              (len(seq) % 3)].translate(to_stop=False)[1:]

                if 'X' in seq or '*' in seq:
                    faultyTrans[ighv].append(rec)
                    faultyTransCounts[ighv] += 1
                elif 'N' not in rec.seq:
                    ighvSignals[ighv].append(rec)
                    ighvSignalsCounts[ighv] += 1
                else:
                    printto(stream,
                            "Ignored: " + str(rec.seq) + ' ' + str(seq))
                    if type == 'secsig':
                        faultyTrans[ighv].append(rec)
                        faultyTransCounts[ighv] += 1
            elif startCodon:
                # START_CODON not in seq
                ighvSignalsNoATG[ighv].append(rec)
                noStartCodonCounts[ighv] += 1
        else:
            ignoredSeqs += 1

    if ignoredSeqs:
        printto(
            stream,
            "\tThere are {} sequences that were ignored because the length of the provided upstream"
            "sequences were not {} <= length(upstream_seqs) <= {}".format(
                ignoredSeqs, *expectLength), LEVEL.WARN)

    if sum(ighvSignalsCounts.values()):
        flattenRecs = list(itertools.chain.from_iterable(ighvSignals.values()))
        assert len(flattenRecs) == sum(ighvSignalsCounts.values())
        title = 'Valid Secretion Signals' if type == 'secsig' else "Valid 5'-UTRs"
        printto(
            stream, "\tThere are {} {} within expected "
            "length ({} to {}) and startCodon={}".format(
                sum(ighvSignalsCounts.values()), title, expectLength[0],
                expectLength[1], startCodon), LEVEL.INFO)
        validSeqFile = os.path.join(
            outAuxDir,
            _VALID_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength))
        SeqIO.write(flattenRecs, validSeqFile, 'fasta')
        if plotDist:
            writeCountsCategoriesToFile(
                ighvSignalsCounts, sampleName,
                os.path.join(
                    outResDir, "{}_{}_{:.0f}_{:.0f}_valid_".format(
                        sampleName, type, expectLength[0], expectLength[1])),
                title)
    if sum(faultyTransCounts.values()):
        # list() is required here: len() and indexing are applied to flattenRecs below
        flattenRecs = list(itertools.chain.from_iterable(faultyTrans.values()))
        assert len(flattenRecs) == sum(faultyTransCounts.values())
        faultySeqFile = os.path.join(
            outAuxDir,
            _FAULTY_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength))
        SeqIO.write(flattenRecs, faultySeqFile, 'fasta')
        if plotDist:
            writeCountsCategoriesToFile(
                faultyTransCounts, sampleName,
                os.path.join(
                    outResDir, "{}_{}_{:.0f}_{:.0f}_faulty_".format(
                        sampleName, type, *expectLength)),
                'Faulty Translations')
        printto(
            stream, "\tTotal faulty secretion signals is {} (excluded)".format(
                len(flattenRecs)), LEVEL.INFO)
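        # NOTE: this assumes numpy's random.choice; the stdlib random.choice
        # accepts neither a sample size nor a replace keyword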
        for i in random.choice(range(len(flattenRecs)),
                               min(5, len(flattenRecs)),
                               replace=False):
            sequence = flattenRecs[i].seq
            printto(
                stream, "\t{}\n\tTranslated:{}".format(
                    sequence, sequence[:len(sequence) -
                                       (len(sequence) % 3)].translate()))

    if sum(noStartCodonCounts.values()):
        flattenRecs = list(
            itertools.chain.from_iterable(ighvSignalsNoATG.values()))
        assert len(flattenRecs) == sum(noStartCodonCounts.values())
        noStartCodonFile = os.path.join(
            outAuxDir,
            _STARTCOD_SEQ_FASTA_TEMPLATE.format(sampleName, type,
                                                *expectLength))
        SeqIO.write(flattenRecs, noStartCodonFile, 'fasta')
        if plotDist:
            writeCountsCategoriesToFile(
                noStartCodonCounts, sampleName,
                os.path.join(
                    outResDir, "{}_{}_{:.0f}_{:.0f}_no_atg_".format(
                        sampleName, type, *expectLength)),
                "Upstream sequences without start codon")
        printto(
            stream,
            "\tThere is no ATG codon in {} sequences (excluded)".format(
                len(flattenRecs)), LEVEL.INFO)
        for i in random.choice(range(len(flattenRecs)),
                               min(5, len(flattenRecs)),
                               replace=False):
            printto(stream, "\t{}".format(flattenRecs[i].seq))

    # the value for each ighv key should be a list of strings, not SeqRecord objects
    # (list comprehensions are used because map returns a lazy iterator in Python 3)
    for k in ighvSignals:
        ighvSignals[k] = [str(x.seq) for x in ighvSignals[k]]
    for k in faultyTrans:
        faultyTrans[k] = [str(x.seq) for x in faultyTrans[k]]
    for k in ighvSignalsNoATG:
        ighvSignalsNoATG[k] = [str(x.seq) for x in ighvSignalsNoATG[k]]

    return ighvSignals, faultyTrans, ighvSignalsNoATG
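
A short usage sketch with illustrative values, assuming the upstream FASTA produced by extractUpstreamSeqs and existing output directories:

# Hypothetical call; each returned dict maps an IGHV gene to a list of sequence strings.
valid, faulty, noATG = collectUpstreamSeqs(
    "sample_upstream.fasta",  # FASTA of upstream sequences (may be gzipped)
    "sample1", [1, 100],      # sample name and [min, max] accepted length
    "results", "aux",         # result and auxiliary output directories
    startCodon=True, type="secsig", plotDist=True)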
Example #4
def addPrimerData(cloneAnnot,
                  readFile,
                  format,
                  fr4cut,
                  trim5end,
                  trim3end,
                  actualQstart,
                  end5,
                  end3,
                  end5offset,
                  threads,
                  stream=None):
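    """
    run the primer specificity analysis in parallel using PrimerWorker processes

    :return:
                DataFrame indexed by 'queryid', extending cloneAnnot with the primer
                columns added by _addPrimerColumns
    """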
    printto(stream, "Primer specificity analysis has begun ...")
    queryIds = cloneAnnot.index
    seqsPerFile = 100
    _addPrimerColumns(cloneAnnot, end5, end3)
    workers = []
    records = SeqIO.index(gunzip(readFile), format)
    newColumns = ['queryid'] + list(cloneAnnot.columns)
    try:
        printto(
            stream,
            "\t " + format + " index created and primer analysis started ...")
        noSeqs = len(queryIds)
        totalTasks = int(ceil(noSeqs * 1.0 / seqsPerFile))
        tasks = Queue()
        exitQueue = Queue()
        resultsQueue = Queue()
        procCounter = ProcCounter(noSeqs, stream=stream)
        threads = min(threads, totalTasks)
        # cap the worker count on machines without large memory
        if not hasLargeMem():
            threads = 2
        for _ in range(threads):
            w = PrimerWorker(procCounter,
                             fr4cut,
                             trim5end,
                             trim3end,
                             actualQstart,
                             end5,
                             end3,
                             end5offset,
                             tasks,
                             exitQueue,
                             resultsQueue,
                             stream=stream)
            workers.append(w)
            w.start()
        for i in range(totalTasks):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            # materialize as lists (map is lazy in Python 3 and cannot be pickled)
            recs = [records[x] for x in ids]
            qsRecs = [cloneAnnot.loc[x].to_dict() for x in ids]
            tasks.put((recs, qsRecs))

        # poison pills
        for _ in range(threads + 10):
            tasks.put(None)

        # wait for an 'exit' acknowledgement from every worker
        i = 0
        while i < threads:
            m = exitQueue.get()
            i += (m == 'exit')
        printto(stream, "All workers have completed their tasks successfully.")
        printto(stream, "Results are being collated from all workers ...")
        cloneAnnotList = _collectPrimerResults(newColumns,
                                               resultsQueue,
                                               totalTasks,
                                               noSeqs,
                                               stream=stream)
        printto(stream, "Results were collated successfully.")
    except Exception as e:
        printto(
            stream,
            "Something went wrong during the primer specificity analysis!",
            LEVEL.EXCEPT)
        raise e
    finally:
        for w in workers:
            w.terminate()
        records.close()

    primerAnnot = DataFrame(cloneAnnotList, columns=newColumns)
    primerAnnot.set_index('queryid', drop=True, inplace=True)
    return primerAnnot
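
A hedged sketch of a call; reading end5 and end3 as 5'/3' primer FASTA paths is an assumption, since the snippet only forwards them to _addPrimerColumns and PrimerWorker:

# Hypothetical call; returns a DataFrame indexed by 'queryid' with primer columns added.
primerAnnot = addPrimerData(cloneAnnot, "reads.fastq.gz", "fastq",
                            fr4cut=True, trim5end=0, trim3end=0,
                            actualQstart=0,
                            end5="primer_5end.fasta",  # assumed: 5' primer FASTA
                            end3="primer_3end.fasta",  # assumed: 3' primer FASTA
                            end5offset=0, threads=4)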