Example #1
def splitFastaFile(fastaFile,
                   totalFiles,
                   seqsPerFile,
                   filesDir,
                   prefix="",
                   ext=".fasta",
                   stream=None):
    if not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ext)) and \
            not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ".out")):
        # Split the FASTA file into multiple chunks
        printto(stream,
                "\tThe clones are distributed into multiple workers .. ")
        if not os.path.isdir(filesDir):
            os.makedirs(filesDir)
        if hasLargeMem():
            with safeOpen(fastaFile) as fp:
                recordsAll = SeqIO.to_dict(SeqIO.parse(fp, 'fasta'))
            # cast to a list so the ids can be sliced below (Python 3 dict views are not sliceable)
            queryIds = list(recordsAll.keys())
        else:
            # SeqIO.index can only open string filenames and they must be unzipped
            recordsAll = SeqIO.index(gunzip(fastaFile), 'fasta')
            # recordsAll.keys() is of type <dictionary-keyiterator object>, need to cast to list
            queryIds = list(recordsAll.keys())
        for i in range(int(totalFiles)):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            records = [recordsAll[qid] for qid in ids]
            out = os.path.join(filesDir, prefix + 'part' + str(i + 1) + ext)
            SeqIO.write(records, out, 'fasta')
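
A minimal usage sketch for splitFastaFile, assuming the function is importable from the surrounding package; the input path, record count, and chunk size are illustrative only.

import math

fastaFile = "reads.fasta"    # hypothetical input FASTA
seqsPerFile = 100
totalRecords = 1050          # would normally be counted from the file itself
totalFiles = int(math.ceil(totalRecords / float(seqsPerFile)))  # -> 11 chunks

splitFastaFile(fastaFile, totalFiles, seqsPerFile,
               filesDir="chunks", prefix="sample_")
# expected output: chunks/sample_part1.fasta ... chunks/sample_part11.fasta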
Example #2
def writeRefineFlags(flags, records, refineFlagNames, refineFlagMsgs, outDir, sampleName):
    # 8 MiB buffer (1 << 23 bytes) if the system has enough memory (-1 means the default system buffering)
    with open(os.path.join(outDir, sampleName + "_refinement_flagged.txt"), 'w',
              buffering=int(1 << 23) if hasLargeMem() else -1) as flaggedFp, \
            open(os.path.join(outDir, sampleName + "_refinement_flagged_summary.txt"), "w") as summaryFp:
        for f in refineFlagNames:
            if len(flags[f]) > 0:
                summaryFp.write(refineFlagMsgs[f].format(len(flags[f])) + "\n")
                flaggedFp.write("# " + refineFlagMsgs[f].format(len(flags[f])) + "\n")
                for readId in flags[f]:
                    flaggedFp.write(">" + readId + "\n")
                    flaggedFp.write(str(records[readId].seq) + "\n")
                flaggedFp.write("\n")
Example #3
def compositionLogos(name, clonoTypes, flatClonoTypes, outDir, threads=2, detailed=False, stream=None):
    """

    :param name: string
                sample name

    :param clonoTypes: dict
                    dict with a key for each V germline gene, each mapping to FR / CDR region keys,
                    each of which holds a Counter() in which the AA sequences are tallied.
                    For example:
                    {
                        'IGHV3-3': { 'FR1': Counter({"FGWSG": 32, ...}),  'CDR1': Counter(...) },
                        'IGHV2-1': { ... }
                    }

    :param flatClonoTypes: dict
                    dict with keys of FR / CDR region, each having a value of Counter() where the
                    AA sequences are tallied
                    For example:
                    {
                        'FR1': Counter({"FGWSG": 32, ...}),
                        'CDR1': Counter(...)
                    }

    :param outDir: string

    :param threads: int

    :param detailed: bool
                    if True, additionally segregate the composition logo plots by IGV gene; the
                    combined FR and CDR logos (all genes pooled) are still plotted. If False, only
                    the combined FR and CDR composition logos are generated.

    :param stream: stream object
                    output stream
    :return: None
    """

    logosFolder = os.path.join(outDir, 'composition_logos')
    createIfNot(logosFolder)
    printto(stream, "Generating composition logos ...")
    if detailed:
        argBuffer = []
        for vgerm in clonoTypes:
            regions = sorted(clonoTypes[vgerm].keys())
            for region in regions:
                if region == 'v':
                    continue
                clonoType = clonoTypes[vgerm][region]
                # materialize as lists (dict views cannot be pickled for the worker pool below)
                seqs = list(clonoType.keys())
                weights = list(clonoType.values())

                regionDirectory = os.path.join(logosFolder, region.upper())
                createIfNot(regionDirectory)
                filename = os.path.join(regionDirectory, name + "_{}_cumulative_logo.csv"
                                        .format(vgerm.replace(os.path.sep, '_')))
                if hasLargeMem():
                    printto(stream, "\tbuffering {} for {}".format(region, vgerm))
                    argBuffer.append((seqs, weights, region, filename))
                else:
                    printto(stream, "\tgenerating {} for {}".format(region, vgerm))
                    generateCumulativeLogo(seqs, weights, region, filename, stream=stream)

        if len(argBuffer):
            printto(stream, "Asynchronously generating composition logos from buffer ...")
            pool = multiprocessing.Pool(processes=threads)
            # Generate cumulative sequence logos using Toby's approach
            res = [pool.apply_async(generateCumulativeLogo, args=arg) for arg in argBuffer]
            [p.get() for p in res]  # block until every async task has finished
            pool.close()
            pool.join()
        printto(stream, "Completed composition logos for IGV families")

    # composition logo for a region(CDR,FR) as a combination of all IGV - i.e. not segregated
    regions = sorted(flatClonoTypes.keys())    # combined AA counts of each region (all V genes pooled)
    for region in regions:
        if region == 'v':
            continue
        clonoType = flatClonoTypes[region]
        seqs = list(clonoType.keys())
        weights = list(clonoType.values())

        regionDirectory = os.path.join(logosFolder, region.upper())
        createIfNot(regionDirectory)
        filename = os.path.join(regionDirectory, name + "_cumulative_logo.csv")
        generateCumulativeLogo(seqs, weights, region, filename, stream=stream)
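
A usage sketch for compositionLogos built from the data shapes shown in the docstring; the sample name, germline gene, and tallies are toy values.

from collections import Counter

clonoTypes = {
    'IGHV3-3': {'FR1': Counter({"FGWSG": 32, "FGWSA": 5}),
                'CDR1': Counter({"GYTFT": 12})},
}
flatClonoTypes = {
    'FR1': Counter({"FGWSG": 32, "FGWSA": 5}),
    'CDR1': Counter({"GYTFT": 12}),
}

compositionLogos("sampleA", clonoTypes, flatClonoTypes,
                 outDir="analysis", threads=2, detailed=True)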
Example #4
def generateSeqMotifs(flatClonoTypes, name, outDir, threads=2, stream=None):
    """
    Create motif plots for FR and CDR regions
    :param flatClonoTypes: dict
                    dict with keys of FR / CDR region, each having a value of Counter() where the
                    AA sequences are tallied
                    For example:
                    {
                        'FR1': Counter({"FGWSG": 32, ...}),
                        'CDR1': Counter(...)
                    }

    :param name: string
                    name of sample

    :param outDir: string
                    output directory

    :param threads: int
                    number of threads to use

    :param stream: stream object
                    output stream
    :return: None
    """

    motifsFolder = os.path.join(outDir, 'motifs')
    createIfNot(motifsFolder)

    printto(stream, "Generating motifs ...")

    # create motif logos
    regions = sorted(flatClonoTypes.keys())
    argBuffer = []
    for region in regions:
        if region == 'v':
            continue

        clonoType = flatClonoTypes[region]
        # materialize as lists (dict views cannot be pickled for the worker pool below)
        seqs = list(clonoType.keys())
        weights = list(clonoType.values())

        # Generate sequence motif logos using weblogo
        # generate logos without alignment
        filename = os.path.join(motifsFolder, name + ("_{}_motif_logo.png".format(region)))
        alphabet = createAlphabet(align=False, protein=True, extendAlphabet=True)
        if hasLargeMem():
            printto(stream, "\tbuffering data for {} motif".format(region))
            argBuffer.append((seqs, region, alphabet, filename, False, False, True, weights, outDir, threads))
        else:
            printto(stream, "\tgenerating {} motif".format(region))
            generateMotif(seqs, region, alphabet, filename, align=False,
                          protein=True, weights=weights, outDir=outDir, threads=threads, stream=stream)

        # generate logos after alignment
        filename = os.path.join(motifsFolder, name + ("_{}_motif_aligned_logo.png".format(region)))
        alphabet = createAlphabet(align=True, protein=True, extendAlphabet=True)
        if hasLargeMem():
            argBuffer.append((seqs, region, alphabet, filename, True, False, True, weights, outDir, threads))
        else:
            generateMotif(seqs, region, alphabet, filename, align=True,
                          protein=True, weights=weights, outDir=outDir, threads=threads, stream=stream)

    if len(argBuffer):
        printto(stream, "Asynchronously generating motifs from buffer ...")
        pool = multiprocessing.Pool(processes=threads)
        res = [pool.apply_async(generateMotif, args=arg) for arg in argBuffer]
        [p.get() for p in res]  # block until every async task has finished
        pool.close()
        pool.join()

    printto(stream, "CDR/FR Motif analysis complete")
Example #5
def addPrimerData(cloneAnnot,
                  readFile,
                  format,
                  fr4cut,
                  trim5end,
                  trim3end,
                  actualQstart,
                  end5,
                  end3,
                  end5offset,
                  threads,
                  stream=None):
    printto(stream, "Primer specificity analysis has begun ...")
    queryIds = cloneAnnot.index
    seqsPerFile = 100
    _addPrimerColumns(cloneAnnot, end5, end3)
    workers = []
    records = SeqIO.index(gunzip(readFile), format)
    newColumns = ['queryid'] + list(cloneAnnot.columns)
    try:
        printto(
            stream,
            "\t " + format + " index created and primer analysis started ...")
        noSeqs = len(queryIds)
        totalTasks = int(ceil(noSeqs * 1.0 / seqsPerFile))
        tasks = Queue()
        exitQueue = Queue()
        resultsQueue = Queue()
        procCounter = ProcCounter(noSeqs, stream=stream)
        threads = min(threads, totalTasks)
        if not hasLargeMem():
            # conserve memory on small machines by capping the number of workers
            threads = 2
        for _ in range(threads):
            w = PrimerWorker(procCounter,
                             fr4cut,
                             trim5end,
                             trim3end,
                             actualQstart,
                             end5,
                             end3,
                             end5offset,
                             tasks,
                             exitQueue,
                             resultsQueue,
                             stream=stream)
            workers.append(w)
            w.start()
        for i in range(totalTasks):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            # materialize as lists: lazy map objects cannot be pickled onto the task queue
            recs = [records[x] for x in ids]
            qsRecs = [cloneAnnot.loc[x].to_dict() for x in ids]
            tasks.put((recs, qsRecs))

        # poison pills: a None task tells each worker there is no more work
        for _ in range(threads + 10):
            tasks.put(None)

        # wait until every worker reports that it has exited
        i = 0
        while i < threads:
            m = exitQueue.get()
            i += (m == 'exit')
        printto(stream, "All workers have completed their tasks successfully.")
        printto(stream, "Results are being collated from all workers ...")
        cloneAnnotList = _collectPrimerResults(newColumns,
                                               resultsQueue,
                                               totalTasks,
                                               noSeqs,
                                               stream=stream)
        printto(stream, "Results were collated successfully.")
    except Exception as e:
        printto(
            stream,
            "Something went wrong during the primer specificity analysis!",
            LEVEL.EXCEPT)
        raise e
    finally:
        for w in workers:
            w.terminate()
        records.close()

    primerAnnot = DataFrame(cloneAnnotList, columns=newColumns)
    primerAnnot.set_index('queryid', drop=True, inplace=True)
    return primerAnnot
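
A hedged call sketch for addPrimerData. cloneAnnot is assumed to be a pandas DataFrame indexed by read id; the primer file paths, trimming values, and column layout are placeholders whose exact semantics are defined elsewhere in the package.

import pandas as pd

cloneAnnot = pd.DataFrame({"vgene": ["IGHV3-3"]}, index=["read_1"])  # toy table

primerAnnot = addPrimerData(cloneAnnot,
                            readFile="sample.fastq.gz",  # illustrative reads
                            format="fastq",
                            fr4cut=True,
                            trim5end=0,
                            trim3end=0,
                            actualQstart=-1,
                            end5="primer_5end.fasta",    # assumed 5' primer file
                            end3="primer_3end.fasta",    # assumed 3' primer file
                            end5offset=0,
                            threads=4)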