Example 1
def writeClonoTypesToFile(clonoTypes,
                          filename,
                          top=100,
                          overRepresented=True,
                          stream=None):
    # the CSV is written to filename + ".gz" below, so check for that file
    if exists(filename + ".gz"):
        printto(
            stream, "\tThe clonotype file " + os.path.basename(filename) +
            " was found!", LEVEL.WARN)
        return

    total = sum(clonoTypes.values()) * 1.0
    dic = defaultdict(list)
    t = 0
    for k in sorted(clonoTypes, key=clonoTypes.get, reverse=overRepresented):
        dic['Clonotype'].append(str(k))
        dic['Count'].append(clonoTypes[k])
        dic['Percentage (%)'].append(clonoTypes[k] / total * 100)
        t += 1
        if t >= top:
            break

    df = DataFrame(dic)
    # fixed format (fast read/write) sacrificing searchability
    # (should change to table format ('t') if search is needed for clonotype clustering/comparison)
    # df.to_hdf(filename, "clonotype", mode="w", format="f")
    df.to_csv(filename + ".gz", mode="w", compression="gzip")
    printto(
        stream,
        "\tA clonotype file has been written to " + os.path.basename(filename))
Example 2
def flattenClonoTypeCountsDict(clonoTypes, stream=None):
    """
    reduces something of this structure:
            'IGHV1-3': {
                'FR1': { 'FWGCGC': 12, 'EVILK': 1, ... }
                'CDR1': { 'FWGCGC': 12, 'EVILK': 1, ... }
            },
            'IGHV2-3': {
                'FR1' : { 'FWGCGC': 12, 'EVILK': 1, ... }
                'CDR1': { 'FWGCGC': 12, 'EVILK': 1, ... }
            }, ...
    to this:
            {
                'FR1': { 'FWGCGC': 24, 'EVILK': 2, ... }
                'CDR1': { 'FWGCGC': 24, 'EVILK': 2, ... }
            }

    :param clonoTypes: dict
                input nested dictionary

    :return: dict
            flattened dictionary
    """
    printto(stream, "Compressing clonotype table ... discarding IGV information ...")
    flattened = defaultdict(Counter)
    for geneName in clonoTypes:
        for region, counts in clonoTypes[geneName].items():
            flattened[region] += Counter(counts)
    printto(stream, "Finish compressing clonotype table")
    return flattened
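
A small round-trip sketch of the structure described in the docstring (hypothetical counts; assumes collections.Counter and defaultdict are imported as in the function):

from collections import Counter

nested = {
    'IGHV1-3': {'FR1': {'FWGCGC': 12, 'EVILK': 1}, 'CDR1': {'FWGCGC': 12, 'EVILK': 1}},
    'IGHV2-3': {'FR1': {'FWGCGC': 12, 'EVILK': 1}, 'CDR1': {'FWGCGC': 12, 'EVILK': 1}},
}
flat = flattenClonoTypeCountsDict(nested)
assert flat['FR1'] == Counter({'FWGCGC': 24, 'EVILK': 2})
assert flat['CDR1'] == Counter({'FWGCGC': 24, 'EVILK': 2})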
Example 3
def splitFastaFile(fastaFile,
                   totalFiles,
                   seqsPerFile,
                   filesDir,
                   prefix="",
                   ext=".fasta",
                   stream=None):
    if not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ext)) and \
            not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ".out")):
        # Split the FASTA file into multiple chunks
        printto(stream,
                "\tThe clones are distributed into multiple workers .. ")
        if not os.path.isdir(filesDir):
            os.makedirs(filesDir)
        if hasLargeMem():
            with safeOpen(fastaFile) as fp:
                recordsAll = SeqIO.to_dict(SeqIO.parse(fp, 'fasta'))
            # cast to list so the slicing below works under Python 3 as well
            queryIds = list(recordsAll.keys())
        else:
            # SeqIO.index can only open string filenames and they must be unzipped
            recordsAll = SeqIO.index(gunzip(fastaFile), 'fasta')
            # recordsAll.keys() is of type <dictionary-keyiterator object>, need to cast to list
            queryIds = list(recordsAll.keys())
        for i in range(totalFiles):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            records = map(lambda x: recordsAll[x], ids)
            out = os.path.join(filesDir, prefix + 'part' + str(i + 1) + ext)
            SeqIO.write(records, out, 'fasta')
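
A hedged usage sketch (file names and record count are hypothetical; assumes Bio.SeqIO and the helpers used above are importable):

import math

noSeqs = 100000  # hypothetical number of records in big.fasta
workers = 4
splitFastaFile("big.fasta",
               totalFiles=workers,
               seqsPerFile=int(math.ceil(noSeqs / float(workers))),
               filesDir="parts",
               prefix="sample1_")
# -> parts/sample1_part1.fasta ... parts/sample1_part4.fasta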
Example 4
def writeDAbundanceToFiles(stats, sampleName, outDir, stream=None):
    igdDist = Counter(stats["dgene"].tolist())
    igdDist = Counter(dict([(str(k), igdDist[k]) for k in igdDist]))
    if len(igdDist) == 0:
        printto(stream, "WARNING: No IGD hits were detected.", LEVEL.WARN)
        return

    # Write the counts of all IGDs into a text file
    # This isn't plotted by default, but we still write the csv file for it
    classes = sorted(igdDist, key=igdDist.get, reverse=True)
    total = sum(igdDist.values()) * 1.0
    writeCSV(os.path.join(outDir, sampleName + '_igd_dist_variant_level.csv'),
             "x,y\n", "{},{}\n", [(x, y) for x, y in zip(
                 classes, map(lambda k: (igdDist[k] / total * 100), classes))])

    # Group IGDs based on the subfamilies (gene level) and then write into a text file
    igdDistSub = compressCountsGeneLevel(igdDist)
    plotDist(igdDistSub,
             sampleName,
             os.path.join(outDir, sampleName + '_igd_dist_gene_level.csv'),
             rotateLabels=False,
             vertical=False,
             title='IGD Abundance in Sample ' + sampleName,
             stream=stream)

    # Group IGDs based on the families and then write into a text file
    igdDistfam = compressCountsFamilyLevel(igdDistSub)
    # Plot the family level distribution
    plotDist(igdDistfam,
             sampleName,
             os.path.join(outDir, sampleName + '_igd_dist_family_level.csv'),
             title='IGD Abundance in Sample ' + sampleName,
             stream=stream)
Example 5
def writeJAbundanceToFiles(stats, sampleName, outDir, stream=None):
    igjDist = Counter(stats["jgene"].tolist())
    igjDist = dict([(str(k), igjDist[k]) for k in igjDist])
    if len(igjDist) == 0:
        printto(stream, "WARNING: No IGJ hits were detected.", LEVEL.WARN)
        return

    plotDist(igjDist,
             sampleName,
             os.path.join(outDir, sampleName + '_igj_dist_variant_level.csv'),
             rotateLabels=False,
             vertical=False,
             stream=stream)

    # Group IGJs based on the subfamilies (gene level) and then write into a text file
    igjDistSub = compressCountsGeneLevel(igjDist)
    #     plotDist(igjDistSub, sampleName, outDir + sampleName +
    #              '_igj_dist_gene_level.csv', rotateLabels=False, vertical=False)
    #
    # Group IGJs based on the families and then write into a text file
    igjDistfam = compressCountsFamilyLevel(igjDistSub)
    # Plot the family level distribution
    plotDist(igjDistfam,
             sampleName,
             os.path.join(outDir, sampleName + '_igj_dist_family_level.csv'),
             title='IGJ Abundance in Sample ' + sampleName,
             stream=stream)
Example 6
def estimateDiversity(clonoTypes,
                      flatClonoTypes,
                      name,
                      outDir,
                      threads=2,
                      segregate=False,
                      stream=None):
    printto(stream, "The diversity of the library is being estimated ... ")
    # create Germline gene level composition logos
    compositionLogos(name,
                     clonoTypes,
                     flatClonoTypes,
                     outDir,
                     threads=threads,
                     detailed=segregate,
                     stream=stream)
    generateSeqMotifs(flatClonoTypes,
                      name,
                      outDir,
                      threads=threads,
                      stream=stream)
    generateRarefactionPlots(flatClonoTypes,
                             name,
                             outDir,
                             threads=threads,
                             stream=stream)
    printto(stream, "The diversity of the library is being estimated ... ")
Example 7
    def run(self):
        while True:
            nextTask = self.tasksQueue.get()
            # poison pill check
            if nextTask is None:
                printto(self.stream, "process has stopped ... " + self.name)
                self.exitQueue.put("exit")
                # self.terminate()
                break
            try:
                result = analyzeSmallFile(nextTask,
                                          self.chain,
                                          self.igBlastDB,
                                          self.seqType,
                                          self.threads,
                                          domainSystem=self.domainSystem,
                                          stream=self.stream)
                self.resultsQueue.put(result)
            except Exception:
                printto(
                    self.stream, "An error occurred while processing " +
                    os.path.basename(nextTask), LEVEL.EXCEPT)
                self.resultsQueue.put(None)
                continue
        return
Example 8
def calcRSAOverlapOrder2(order1, sites, stream=None):
    """
    returns an n-by-n matrix of Jaccard indices, where n is len(sites)

    :param order1: dictionary of sets of ids
    :param sites: collection of enzymes
    :param stream: logging stream
    :return: n by n dataframe that has the form of a named (symmetric) matrix:

         enz1 enz2 enz3
    enz1    1  0.3  0.4
    enz2  0.3    1  0.5
    enz3  0.4  0.5    1
    """
    printto(
        stream,
        "The 2nd order overlapping matrix is being calculated using Jaccard Index ... "
    )
    overlap = []
    for site1 in sites:
        overlap.append([])
        for site2 in sites:
            inter = len(order1[site1].intersection(order1[site2]))
            uni = len(order1[site1].union(order1[site2]))
            if uni != 0:
                # float division: plain / on two ints truncates to 0 in Python 2
                overlap[-1].append(inter / float(uni))
            else:
                overlap[-1].append(1)
    overlap = DataFrame(overlap, columns=sites, index=sites)
    # overlap = linkage(overlap)
    return overlap
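
A toy example of the matrix described in the docstring (hypothetical enzymes and id sets; assumes pandas' DataFrame and printto are in scope as in the function):

order1 = {
    "EcoRI":   {"seq1", "seq2", "seq3"},
    "BamHI":   {"seq2", "seq3"},
    "HindIII": set(),
}
sites = ["EcoRI", "BamHI", "HindIII"]
df = calcRSAOverlapOrder2(order1, sites)
# df.loc["EcoRI", "BamHI"] == 2 / 3.0  (2 shared ids, 3 ids in the union)
# df.loc["HindIII", "HindIII"] == 1    (empty sets take the uni == 0 branch)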
Example 9
def collectRSAResults(sitesInfo,
                      resultsQueue,
                      totalTasks,
                      noSeqs,
                      simple=True,
                      stream=None):
    stats = initRSAStats(simple=simple)
    total = 0
    while totalTasks:
        statsi = resultsQueue.get()
        totalTasks -= 1
        # count a None result towards the finished-task tally (as the other collectors do),
        # otherwise a failed worker would leave this loop spinning forever
        if statsi is None:
            continue

        # -------- update relevant statistics -------  #

        # 1. total number of sequences that are cut by any sites at all (i.e. number of sequences that are cut by
        #    *at least* one site)
        stats["seqsCutByAny"] += statsi["seqsCutByAny"]
        for site in sitesInfo.keys():
            # 2. total number of "possible hits" of this 'site' on all sequences (note, multi-hits are counted)
            stats["siteHitsCount"][site] += statsi["siteHitsCount"][site]
            # 3. total number of "hits" of this 'site' on all sequences (note, multi-hits on one sequence are still
            #    counted as one, not multi) - this is a "duplicate" field of siteHitsSeqsIDs, we could've taken the
            #    length of sitHitsSeqsIDs, it would be equal to this. This is left here from legacy code.
            stats["siteHitSeqsCount"][site] += statsi["siteHitSeqsCount"][site]
            # 4. the ids of which this site has at least one match, this length of this value should be equal to
            #    siteHitSeqsCount
            stats['siteHitsSeqsIDs'][site] = stats["siteHitsSeqsIDs"][
                site].union(statsi["siteHitsSeqsIDs"][site])

            if not simple:
                # these keys are only available for detailed RS analysis

                # 5. collect the total number of region where a match with this site has been registered
                # Counter object
                stats['hitRegion'][site] += statsi['hitRegion'][site]

                # 6. collect all the germline sequences that were recorded during a match with this site
                # list object
                stats['siteHitSeqsGermline'][site] += statsi[
                    'siteHitSeqsGermline'][site]

                # 7. collect all the IGV sequences that were recorded during a match with this site
                # set object
                stats['siteHitsSeqsIGV'][site] = stats['siteHitsSeqsIGV'][
                    site].union(statsi['siteHitsSeqsIGV'][site])

        total += statsi["total"]
        if total % 50000 == 0:
            printto(
                stream,
                '\t%d/%d records have been collected ... ' % (total, noSeqs))

    printto(stream,
            '\t%d/%d sequences have been collected ... ' % (total, noSeqs))

    assert total == noSeqs
    stats["total"] = noSeqs
    return stats
Example 10
def readSeqFileIntoDict(seqFile, outDict=None, stream=None):
    printto(stream, "Processing {} ... loading sequences into dictionary".format(os.path.basename(seqFile)))
    fileFormat = detectFileFormat(seqFile)  # avoid shadowing the builtin 'format'
    if outDict is None:
        outDict = {}
    with safeOpen(seqFile) as fp:
        for rec in SeqIO.parse(fp, fileFormat):
            outDict[rec.id] = str(rec.seq)
    return outDict
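
A short sketch showing how outDict lets several files accumulate into one mapping (hypothetical paths; assumes detectFileFormat and safeOpen behave as their names suggest):

seqs = readSeqFileIntoDict("sample1.fasta")
seqs = readSeqFileIntoDict("sample2.fastq", outDict=seqs)  # merged into the same dict
print("{} sequences loaded".format(len(seqs)))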
Example 11
def extractProteinFrag(protein,
                       start,
                       end,
                       offset=0,
                       trimAtStop=False,
                       stream=None):
    """
    Extract a protein fragment from a protein sequence based on DNA positions
    start and end are 1-based

    :param protein:
    :param start:
    :param end:
    :param offset:
    :param trimAtStop:
    :param stream:
    :return:
    """
    if isnan(start) or isnan(end):
        return ''
    if start != -1 and end != -1 and end - start < 1:
        return ''
    # start and end are 1-based positions
    start = (start - offset) if start != -1 else start
    end = (end - offset) if end != -1 else end
    try:
        if start != -1:
            # s = int(round((start  - 1.0 ) / 3))# 0-based
            s = int(((start - 1) / 3))  # 0-based
        else:
            s = 0
        if end != -1:
            # e = int(round( (end*1.0)  / 3)) # 1-based
            e = int(((end) / 3))  # 1-based
        else:
            e = len(protein)
        if (s + 1) < e:
            frag = protein[s:e]
        elif (s + 1) == e:
            frag = protein[s]
        else:
            return ''
        if trimAtStop and ('*' in frag):
            frag = frag[:frag.index('*')]
        return frag
    except Exception:
        printto(
            stream, "ERROR at Extract Protein Fragment {} {} {}".format(
                protein, start, end), LEVEL.ERR)
        return None
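
A worked example of the coordinate arithmetic above (assumes math.isnan and printto are in scope): 1-based DNA positions 4..9 span codons 2 and 3, i.e. the 0-based protein slice [1:3].

protein = "MKLV*AG"
# DNA 4..9 -> s = int((4 - 1) / 3) = 1, e = int(9 / 3) = 3 -> protein[1:3]
assert extractProteinFrag(protein, 4, 9) == "KL"
# -1 leaves that side unbounded; trimAtStop truncates at the first '*'
assert extractProteinFrag(protein, -1, -1, trimAtStop=True) == "MKLV"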
Example 12
def findMotifClusters(ighvMotifs, outputPrefix, stream=None):
    from TAMO.Clustering.UPGMA import UPGMA
    from TAMO.Clustering.UPGMA import DFUNC
    from TAMO.Clustering.UPGMA import print_tree_id
    # cluster using a variant of the UPGMA algorithm implemented in the TAMO package
    
    motifsFile = os.path.abspath(outputPrefix + '_motifs.tamo')
    if not exists(motifsFile):
        if len(ighvMotifs) > 0:
            with open(motifsFile, 'wb') as fp:
                pickle.dump(ighvMotifs, fp)
    else:
        with open(motifsFile, 'rb') as fp:
            ighvMotifs = pickle.load(fp)

    prefixName, sampleName = os.path.split(outputPrefix)
    dendrogramDirectory = os.path.join(prefixName, 'dendrograms')
    if not exists(dendrogramDirectory):
        os.makedirs(dendrogramDirectory)

    if len(ighvMotifs) > 0:
        groupedMotifs = defaultdict(list)
        for m in ighvMotifs:
            ighv = m.id.split('-')[0].split('/')[0]
            groupedMotifs[ighv].append(m)
        try:
            motifClustersFile = os.path.join(dendrogramDirectory, sampleName + '_pwm_clusters.txt')

            _old_stdout = sys.stdout
            sys.stdout = open(motifClustersFile, 'w')

            for ighv in groupedMotifs.keys():
                newickdendrogramFile = os.path.join(dendrogramDirectory, sampleName + '_{}_newick.dnd'.format(ighv))
                tree = UPGMA(groupedMotifs[ighv], DFUNC)
                print_tree_id(tree)

                saveNewickdendrogram(newickdendrogramFile, tree, sys.stdout, title=(ighv + " family clustering"), logger=stream)

            lists = groupedMotifs.values()
            tree = UPGMA([m for lst in lists for m in lst], DFUNC)
            print_tree_id(tree)

            newickdendrogramFile = os.path.join(dendrogramDirectory, sampleName + '_newick.dnd')
            saveNewickdendrogram(newickdendrogramFile, tree, sys.stdout, title="Clustering of all IGHV", logger=stream)

            sys.stdout.close()
            sys.stdout = _old_stdout

            printto(stream, "\tMotif clusters were written to " + os.path.basename(motifClustersFile))
        except Exception as e:
            printto(stream, "Motifs couldn't be clustered! Error: {}".format(str(e)), LEVEL.ERR)
        finally:
            # make sure stdout is restored even if clustering failed midway
            if sys.stdout is not _old_stdout:
                sys.stdout.close()
                sys.stdout = _old_stdout
Example 13
def writeClonotypeDiversityRegionAnalysis(clonoTypes,
                                          sampleName,
                                          outDir,
                                          stream=None):
    """
    For a given set of similar CDR3 clonotypes, a read may be classified as a different clonotype if the entire
    V region is considered. This writes the unique counts of the regions aside from CDR3 to see if the clonotype
    would differ when the entire V region is considered. Consequently, it's possible to learn which region is
    (mostly) responsible for changing the clonotype if it was included.
    :param clonoTypes: DataFrame of clonotypes per read. Requires the CDRs and FRs columns
    :param sampleName: Sample name for output file
    :param outDir: Out directory for output file
    :param stream: debug stream
    :return: None. Produces an output gzipped csv file
    """
    fname = os.path.join(
        outDir, sampleName + "_clonotype_diversity_region_analysis.csv.gz")
    if os.path.exists(fname):
        printto(stream, "\t File found {}".format(fname), LEVEL.WARN)
        return

    # regions of analysis
    cols = ["cdr1", "cdr2", "fr1", "fr2", "fr3", "fr4"]

    def regionCounts(selectedRows):
        """ returns a list of numbers that corresponds to the frequency of *UNIQUE* "CDR1", "CDR2", .. "FR4"
        (in the order of cols as defined above)
        :param selectedRows: this "DataFrame" of rows should have the same CDR3 region
        :return: a list of numbers, each representing the number of unique values of a region, in the order of
        cols as defined above
        """
        return [str(len(set(selectedRows[region]))) for region in cols]

    # obtain all CDR3s
    cdr3s = set(clonoTypes['cdr3'])

    with gzip.open(fname, "wb") as fp:
        writeBuffer = ""
        # write csv header
        writeBuffer += "cdr3,count," + ','.join(cols) + "\n"
        # for each unique CDR3, find all rows(reads) that have the same CDR3
        for cdr3 in cdr3s:
            rows = clonoTypes[clonoTypes['cdr3'] == cdr3]
            writeBuffer += cdr3 + "," + str(len(rows)) + "," + ','.join(
                regionCounts(rows)) + '\n'
            if len(writeBuffer) > 4e9:
                fp.write(writeBuffer)
                writeBuffer = ""
        fp.write(writeBuffer)
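
A hedged sketch of the expected input and output (hypothetical two-read table; the function additionally needs gzip, os and printto in scope, plus the Python 2 string semantics it was written for):

from pandas import DataFrame

reads = DataFrame({
    "cdr3": ["CARDYW", "CARDYW"],
    "cdr1": ["GYTFT", "GYTFS"],  # two unique CDR1s behind one CDR3
    "cdr2": ["INPNS", "INPNS"],
    "fr1": ["EVQ", "EVQ"], "fr2": ["WVR", "WVR"],
    "fr3": ["RVT", "RVT"], "fr4": ["WGQ", "WGQ"],
})
writeClonotypeDiversityRegionAnalysis(reads, "sample1", ".")
# ./sample1_clonotype_diversity_region_analysis.csv.gz then contains:
#   cdr3,count,cdr1,cdr2,fr1,fr2,fr3,fr4
#   CARDYW,2,2,1,1,1,1,1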
Example 14
    def run(self):
        printto(self.stream,
                self.name + " process is now ready to start a new job ...")
        while True:
            nextTask = self.tasksQueue.get()
            # poison pill check
            if nextTask is None:
                printto(self.stream, self.name + " process has stopped.")
                self.exitQueue.put("exit")
                break
            try:
                if not self.firstJobTaken:
                    printto(self.stream,
                            self.name + " process commenced a new task ... ")
                    self.firstJobTaken = True
                qsRecs = []
                seqsAll = []
                recordLengths = defaultdict(_defaultdefaultInt)
                flags = {}
                for f in self.refineFlagNames:
                    flags[f] = []
                for (record, qsRec) in zip(nextTask[0], nextTask[1]):
                    seqs = refineCloneAnnotation(qsRec,
                                                 record,
                                                 self.actualQstart,
                                                 self.chain,
                                                 self.fr4cut,
                                                 self.trim5End,
                                                 self.trim3End,
                                                 flags,
                                                 stream=self.stream)
                    # out-of-frame clones are excluded
                    if qsRec['v-jframe'] != 'Out-of-frame':
                        stillInFrame = refineInFramePrediction(
                            qsRec,
                            record,
                            self.actualQstart,
                            flags,
                            stream=self.stream)
                        if stillInFrame:
                            _recordFRLength(qsRec, recordLengths)

                    # append the FR and CDR protein clones
                    qsRec['queryid'] = record.id
                    qsRecs.append(
                        convertCloneRecordToOrderedList(qsRec, self.chain))
                    seqsAll.append(seqs)
                self.procCounter.increment(len(qsRecs))
                self.resultsQueue.put((qsRecs, seqsAll, flags, recordLengths))
            except Exception as e:
                printto(self.stream,
                        "An error occurred while processing " + self.name,
                        LEVEL.EXCEPT)
                self.resultsQueue.put(None)
                continue
        return
Example 15
def _parsePrimerFile(primerFile, stream=None):
    if primerFile:
        primerids = []
        primerLengths = []
        primerSequences = []
        for rec in SeqIO.parse(primerFile, "fasta"):
            primerLengths.append(len(rec.seq))
            primerids.append(rec.id)
            primerSequences.append(str(rec.seq).upper())

        maxScores = calMaxIUPACAlignScores(primerSequences)

        if len(set(primerLengths)) != 1:
            printto(stream, "WARNING: Provided primer file {} has primers of different lengths. "
                            "Analysis assumes uniform primer length".format(primerFile), LEVEL.WARN)
        return max(primerLengths), zip(primerids, primerSequences, maxScores)

    return None, None
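
Callers can unpack the return value unconditionally because the no-primer-file case yields (None, None); a short sketch with a hypothetical primer FASTA:

maxPrimerLength, primerInfo = _parsePrimerFile("primers_5end.fasta")
if primerInfo is not None:
    for primerId, sequence, maxScore in primerInfo:
        print(primerId, sequence, maxScore)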
Example 16
def writeClonoTypesToFiles(clonoTypes, name, outDir, topClonotypes=100, stream=None):
    printto(stream, "Clonotype files are being written out ... ")
    cloneFolder = os.path.join(outDir, "clonotypes")

    if not os.path.exists(cloneFolder):
        os.makedirs(cloneFolder)

    for k in clonoTypes.keys():
        # check if the required topClonotypes went overboard, if so, cap to the max length
        if topClonotypes != float('inf') and len(clonoTypes[k]) < topClonotypes:
            stringTopClonotypes = str(len(clonoTypes[k]))
        else:
            stringTopClonotypes = 'all' if topClonotypes == float('inf') else str(topClonotypes)

        # descending order
        filename = os.path.join(cloneFolder, name + ("_{}_clonotypes_{}_over.csv".format(k, stringTopClonotypes)))
        writeClonoTypesToFile(clonoTypes[k], filename, topClonotypes, overRepresented=True)

        # ascending order
        filename = os.path.join(cloneFolder, name + ("_{}_clonotypes_{}_under.csv".format(k, stringTopClonotypes)))
        writeClonoTypesToFile(clonoTypes[k], filename, topClonotypes, overRepresented=False)
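
A usage sketch (hypothetical counts; float('inf') requests all clonotypes, which maps to the 'all' suffix in the file names):

from collections import Counter

clonoTypes = {"cdr3": Counter({"CARDYW": 120, "CAKGTW": 45})}  # hypothetical
writeClonoTypesToFiles(clonoTypes, "sample1", "output", topClonotypes=float('inf'))
# -> output/clonotypes/sample1_cdr3_clonotypes_all_over.csv.gz
#    output/clonotypes/sample1_cdr3_clonotypes_all_under.csv.gz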
Example 17
    def run(self):
        while True:
            nextTask = self.taskQueue.get()
            if nextTask is None:
                printto(self.stream, self.name + " process has stopped.")
                self.exitQueue.put("exit")
                break

            try:
                recs = []
                if not self.firstJobTaken:
                    printto(self.stream, self.name + " process commenced a new task ... ")
                    self.firstJobTaken = True
                for record, qsRec in zip(nextTask[0], nextTask[1]):
                    qsRec['queryid'] = record.id
                    recs.append(_matchClosestPrimer(qsRec, record, self.actualQstart, self.trim5end,
                                                    self.trim3end, self.end5offset, self.fr4cut, self.maxPrimer5Length,
                                                    self.maxPrimer3Length, self.primer5sequences,
                                                    self.primer3sequences))
                self.resultsQueue.put(recs)
                self.procCounter.increment(len(recs))
            except Exception as e:
                printto(self.stream, "An error as occurred while processing " + self.name + " with error {}".format(
                    str(e)
                ), LEVEL.EXCEPT)
                self.resultsQueue.put(None)
                continue
        return
Example 18
def _collectPrimerResults(columns, queue, totalTasks, noSeqs, stream=None):
    processed = 0
    cloneAnnot = []
    totalUnexpected5 = totalUnexpected3 = 0
    while totalTasks:
        result = queue.get()
        totalTasks -= 1
        if result is None:
            continue
        for entry, unexpected5, unexpected3 in result:
            totalUnexpected5 += unexpected5
            totalUnexpected3 += unexpected3
            # put them as a list (in the ordering specified by 'columns')
            cloneAnnot.append([entry[col] for col in columns])
        processed = len(cloneAnnot)
        if processed % 50000 == 0:
            printto(
                stream, "\t{:,}/{:,} records have been collected ... ".format(
                    processed, noSeqs))
            sys.stdout.flush()

    printto(
        stream, "\t{:,}/{:,} records have been collected ... ".format(
            processed, noSeqs))
    printto(
        stream,
        "\tThere were {} unexpected 5' alignments and {} unexpected 3' alignments"
        .format(totalUnexpected5, totalUnexpected3), LEVEL.WARN)
    return cloneAnnot
Example 19
def fastq2fasta(fastqFile, outputDir, stream=None):
    """
    Converts a fastq file into fasta file. Fastq can be compressed if it was provided as such
    :param fastqFile: (un)compressed fastq file. If compressed, will leave original compressed untouched
    :param outputDir: Where to produce the new fasta file
    :param stream: debugging stream
    :return: fasta filename
    """
    # FASTQ to FASTA
    # awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' my.fastq > my.fasta
    filename = os.path.basename(fastqFile)
    seqOut = os.path.join(outputDir, "seq")

    if not os.path.isdir(seqOut):
        os.makedirs(seqOut)

    # rename all fastq files to fasta, including gzipped files
    if filename.endswith(".gz"):
        filename = os.path.join(
            seqOut, filename.replace(filename.split('.')[-2] + ".gz", 'fasta'))
        fastqFile = gunzip(fastqFile)
    else:
        filename = os.path.join(
            seqOut, filename.replace(filename.split('.')[-1], 'fasta'))

    if exists(filename):
        printto(stream, "\tThe FASTA file was found!", LEVEL.WARN)
        return filename

    printto(
        stream, "\t" + os.path.basename(fastqFile) +
        " is being converted into FASTA ...")
    SeqIO.convert(fastqFile, 'fastq', filename, 'fasta')

    # not all systems have AWK by default (cough, windows)
    # command = ("awk 'NR % 4 == 1 {sub(\"@\", \"\", $0) ; print \">\" $0} NR % 4 == 2 "
    #            "{print $0}' " + fastqFile + " > " + filename
    #            )
    # os.system(command)
    return filename
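
A short sketch (hypothetical paths): gzipped input is unzipped via gunzip first, and the original archive is left untouched.

fastaFile = fastq2fasta("reads/sample1.fastq.gz", "output")
print("FASTA written to " + fastaFile)  # output/seq/sample1.fasta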
Example 20
def alignListOfSeqs(signals, outDir, threads, name, stream=None):
    """
    perform multiple sequence alignment using CLUSTAL

    :param signals:
    :param outDir:
    :param threads:
    :param name:
    :param stream:
    :return:
    """
    L = list(map(len, signals))

    printto(
        stream,
        "\t\t{} sequences are being aligned using CLUSTAL-OMEGA (L in [{}, {}])... "
        .format(len(L), min(L), max(L)))
    tempSeq = os.path.join(outDir, "csl_temp_seq_" + name + ".fasta")
    tempAlign = tempSeq.replace('.fasta', '.aln')

    seqs = []

    for i in range(len(signals)):
        seqs.append(SeqRecord(Seq(signals[i]), id='seq' + str(i)))
    SeqIO.write(seqs, tempSeq, 'fasta')

    clustal = ShortOpts(CLUSTALOMEGA, i=quote(tempSeq), o=quote(tempAlign))\
        .append("--threads={} --outfmt=clustal".format(threads))

    # printto(stream, "Executing: " + str(clustal))
    # throw away stderr and stdout
    clustal(stdout=None, stderr=None)

    alignment = AlignIO.read(tempAlign, 'clustal')
    alignedSeq = []
    for rec in alignment:
        alignedSeq.append(str(rec.seq))
    os.remove(tempSeq)
    os.remove(tempAlign)
    return alignedSeq
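
A hedged sketch (assumes clustalo is installed and reachable through the CLUSTALOMEGA constant used above; the signals are hypothetical):

signals = ["ATGGTGCACC", "ATGGTTCACC", "ATGGTACAC"]  # hypothetical upstream signals
aligned = alignListOfSeqs(signals, outDir=".", threads=2, name="demo")
for row in aligned:
    print(row)  # gap-padded rows, all of equal length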
Example 21
def generateMotifs(seqGroups, align, outputPrefix, transSeq=False,
                   extendAlphabet=False, clusterMotifs=False, protein=False, threads=2, stream=None):
    from TAMO.MotifTools import Motif
    ighvMotifs = []
    if clusterMotifs and 'gene' in outputPrefix:
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)
    printto(stream, '\t\tPWMs, consensus and logos are being generated for {} motifs ... '.format(len(seqGroups)))
    pwmFile = open(outputPrefix + '_pwm.txt', 'w')
    consensusFile = open(outputPrefix + '_consensus.txt', 'w')
    logosFolder = outputPrefix + '_logos'

    if not os.path.exists(logosFolder):
        os.makedirs(logosFolder)

    # create the sequence alphabet: DNA or Protein
    alphabet = createAlphabet(align, transSeq, extendAlphabet, protein)
    groups = sorted(seqGroups.keys())

    for group in groups:
        filename = os.path.join(logosFolder, group.replace('/', '') + '.png')
        seqs = seqGroups[group]
        m = generateMotif(seqs, group, alphabet, filename, align, transSeq, protein, outDir=logosFolder,
                          threads=threads, stream=stream)
        if m is None:
            # motif file found, no further work required
            pwmFile.close()
            consensusFile.close()
            return
        motifSeqs = m.instances
        pwm = m.counts.normalize(pseudocounts=None)  # {'A':0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6}
        consensusMax = str(m.consensus)

        pwmFile.write('#{} {} sequences\n'.format(group, len(motifSeqs)))
        pwmFile.write(str(pwm))
        consensusFile.write('>{} max_count\n'.format(group))
        consensusFile.write(consensusMax + '\n')
        # print(str(m.anticonsensus))  # smallest values in the columns
        if not transSeq and not align and not protein:
            consensusIupac = str(m.degenerate_consensus)
            # print(consensusIupac)  # IUPAC ambiguous nucleotides
            consensusFile.write('>{} degenerate\n'.format(group))
            consensusFile.write(consensusIupac + '\n')

        pwmFile.flush()
        consensusFile.flush()
        gc.collect()
        if clusterMotifs and len(motifSeqs) > 10:
            motif = Motif(map(lambda x: str(x), motifSeqs),
                          backgroundD={'A': 0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6}, id=group)
            motif.addpseudocounts(0.1)
            ighvMotifs.append(motif)
            
    pwmFile.close()
    consensusFile.close()      
    gc.collect()
    printto(stream, "\tPosition weight matrices are written to " + os.path.basename(outputPrefix + '_pwm.txt'))
    printto(stream, "\tConsensus sequences are written to " + os.path.basename(outputPrefix + '_consensus.txt'))
    if clusterMotifs:
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)
Example 22
def saveNewickdendrogram(newickClusterFile, tree, stream, title="", logger=None):
    """
    :param newickClusterFile:
    :param tree:  UPGMA object
    :param stream:
    :param title:
    :param logger:
    :return:
    """
    from TAMO.Clustering.UPGMA import create_tree_phylip
    desc = '' if not title else " for {} ".format(title)

    # get phylip newick syntax
    phylipTree = create_tree_phylip(tree)
    with open(newickClusterFile, 'w') as newickfp:
        newickfp.write(phylipTree)

    printto(logger, "Newick dendrogram{}written to ".format(desc) + os.path.basename(newickClusterFile))

    # show ascii art
    phylipTree = Phylo.read(newickClusterFile, format='newick')

    try:
        print("\n\nASCII phylip tree{}:\n".format(desc), file=stream)
        Phylo.draw_ascii(phylipTree, file=stream)
    except ZeroDivisionError:
        # if the weights are 0
        print("\t Not drawn because of 0 weights", file=stream)

    # plot dendrogram in matplotlib
    phylipTree.ladderize()
    fig, axes = plt.subplots(figsize=(8, 5))
    Phylo.draw(phylipTree, do_show=False, axes=axes, show_confidence=True)
    axes.set_title(title)
    fig.savefig(newickClusterFile.replace('.dnd', '.png'), dpi=300)
    plt.close()
Example 23
def printRefineFlags(flags, records, refineFlagNames, refineFlagMsgs, stream=None):
    # print statistics and a few of the flagged clones
    for f in refineFlagNames:
        if len(flags[f]) > 0:
            printto(stream, refineFlagMsgs[f].format(len(flags[f])), LEVEL.INFO)
            examples = random.choice(range(len(flags[f])), min(3, len(flags[f])), replace=False)
            for i in examples:
                printto(stream, ">" + flags[f][i], LEVEL.INFO)
                printto(stream, str(records[flags[f][i]].seq), LEVEL.INFO)
Example 24
def collectRefineResults(resultsQueue, totalTasks, noSeqs, refineFlagNames, stream=None):
    total = 0
    cloneAnnot = []
    transSeqs = []
    frameworkLengths = defaultdict(_defaultCounter)

    flags = {}
    for f in refineFlagNames:
        flags[f] = []

    while totalTasks:
        result = resultsQueue.get()
        totalTasks -= 1
        if result is None:
            continue

        qsRecsOrdered, seqs, flagsi, recordLengths = result

        # convert dict to Counter object
        for keys, regions in recordLengths.items():
            for region in regions:
                frameworkLengths[keys][region] += Counter(recordLengths[keys][region])

        # update relevant annotation fields
        cloneAnnot += qsRecsOrdered
        transSeqs += seqs

        # update flags 
        for f in refineFlagNames:
            flags[f] += flagsi[f]
        total += len(qsRecsOrdered)

        if total % 50000 == 0:
            printto(stream, '\t{}/{} records have been collected ... '.format(total, noSeqs))

    printto(stream, '\t{}/{} records have been collected ... '.format(total, noSeqs))
    return cloneAnnot, transSeqs, flags, frameworkLengths
Example 25
    def run(self):
        printto(self.stream,
                self.name + " process is now ready to start a new job ...")
        while True:
            nextTask = self.tasksQueue.get()
            if nextTask is None:
                printto(self.stream, self.name + " process has stopped.")
                self.exitQueue.put("exit")
                break
            try:
                if self.simpleScan:
                    self.runSimple(nextTask)
                else:
                    self.runDetailed(nextTask)
            except Exception as e:
                printto(
                    self.stream, "An error occurred while processing " +
                    self.name + " error: {}".format(str(e)), LEVEL.ERR)
                self.resultsQueue.put(None)
                continue
        return
Example 26
def loadRestrictionSites(sitesFile, stream=None):
    """
    given a whitespace separated file containing 2 columns, return a dictionary of restriction enzyme names to
    a regex-translated sequence. Ignores all lines that start with "#"

    :param sitesFile: file with 2 cols, enzyme <ws> seq. Any line that *starts* with # will be ignored
    :param stream: logging stream
    :return: dictionary of enzyme to precompiled regex mapping, for example:
    {
        "ENZYME1": re.compile("AC[GT]..A")     # assuming ENZYME1's IUPAC sequence was "ACKNNA"
    }
    """
    with open(sitesFile) as fp:
        sites = {}
        for line in fp:
            line = line.strip()
            if line and not line.startswith("#"):
                try:
                    enzyme, seq = line.split()
                    if enzyme in sites:
                        printto(
                            stream, enzyme +
                            " is duplicated, the older enzyme sequence {} ".
                            format(sites[enzyme]) + "will be overridden.",
                            LEVEL.WARN)
                    sites[enzyme] = re.compile(
                        replaceIUPACLetters(str(seq).upper().strip()))
                except Exception as e:
                    printto(
                        stream,
                        "Offending line: {}, {}".format(line, line.split()),
                        LEVEL.EXCEPT)
                    raise e

    printto(stream, "Restricting sites have been loaded")
    return sites
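
A sketch of the expected file layout and the resulting mapping (sites.txt is hypothetical; the regex shown assumes replaceIUPACLetters expands K to [GT] and N to '.' as in the docstring example):

# sites.txt:
#   # enzyme    IUPAC site
#   EcoRI       GAATTC
#   ENZYME1     ACKNNA
sites = loadRestrictionSites("sites.txt")
print(sites["ENZYME1"].pattern)  # "AC[GT]..A"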
Example 27
def extractUpstreamSeqs(cloneAnnot,
                        recordFile,
                        upstream,
                        upstreamFile,
                        stream=None):
    """
    extract the upstream DNA sequences and write them into a FASTA file named upstreamFile
    :param cloneAnnot:
                cloneAnnot DataFrame

    :param recordFile:
                raw record file (string)

    :param upstream:
                list of 2 numbers, denoting [start, end] inclusive in 1-index. np.Inf is also allowed for end value

    :param upstreamFile:
                output FASTA filename

    :param stream:
                logging stream object

    :return:
                None
    """
    printto(stream, "\tExtracting the upstream sequences ... ")

    # alignments with - strand
    revAlign = 0
    # num. seqs with trimmed beginning (vstart > 3)
    trimmedBegin = 0
    # num. seqs with sequences shorter than expected upstream length (len(seq) < expectLength)
    # @see expectLength
    trimmedUpstream = 0
    # excluded sequences because end <= 1
    noSeq = 0
    # num. processed sequences
    procSeqs = 0
    # buffer to hold sequences before flushing into file
    recordsBuffer = []
    # max buffer size allowed
    maxBufferSize = int(10.0**5) / 2

    # expected upstream length = expectLength (end - start + 1) where start, end are both 1-indexed
    expectLength = upstream[1] - upstream[0] + 1
    queryIds = cloneAnnot.index

    # NOTE: SeqIO.index can only index string filenames and it has to be unzipped
    _, ext = os.path.splitext(os.path.basename(recordFile.rstrip(os.path.sep)))
    records = SeqIO.index(gunzip(recordFile), ext.lstrip('.'))

    with open(upstreamFile, 'w') as fp:
        for id_ in queryIds:
            record = records[id_]
            qsRec = cloneAnnot.loc[record.id]
            if qsRec.strand != 'forward':
                revAlign += 1
                record.seq = record.seq.reverse_complement()
            if qsRec.vstart <= 3:
                end = qsRec.vqstart - upstream[0] - qsRec.vstart + 1
                if end <= 1:
                    noSeq += 1
                else:
                    start = max(1,
                                qsRec.vqstart - upstream[1] - qsRec.vstart + 1)
                    record.seq = record.seq[int(start - 1):int(end)]
                    if expectLength != Inf and len(record.seq) < expectLength:
                        trimmedUpstream += 1
                    record.id = record.id + _UPSTREAM_SEQ_FILE_SEP + qsRec.vgene
                    record.description = ""
                    recordsBuffer.append(record)
                    procSeqs += 1
                    if procSeqs % maxBufferSize == 0:
                        printto(
                            stream,
                            '{}/{} sequences have been processed ... '.format(
                                procSeqs, len(queryIds)))
                        SeqIO.write(recordsBuffer, fp, 'fasta')
                        recordsBuffer = []
            else:
                trimmedBegin += 1

        # flush remaining sequences
        if len(recordsBuffer) > 0:
            printto(
                stream, '{}/{} sequences have been processed ... '.format(
                    procSeqs, len(queryIds)))
            SeqIO.write(recordsBuffer, fp, 'fasta')

    if revAlign > 0:
        printto(
            stream, "\t\t\t{} sequences are in reversed alignment ... ".format(
                revAlign), LEVEL.INFO)

    if trimmedBegin > 0:
        printto(
            stream,
            "\t\t\tThe query sequence is not aligned within 3bp of the IGV start "
            "position ... {} found and excluded!".format(trimmedBegin),
            LEVEL.WARN)

    if trimmedUpstream > 0:
        printto(
            stream,
            "\t\t\tUpstream sequences shorter than the expected length are detected ... {} found"
            .format(trimmedUpstream), LEVEL.WARN)

    if noSeq > 0:
        printto(
            stream,
            "\t\t\tNo upstream sequence can be extracted (too short) for {} sequences."
            .format(noSeq), LEVEL.WARN)
    gc.collect()
Example 28
def findUpstreamMotifs(upstreamFile,
                       sampleName,
                       outAuxDir,
                       outResDir,
                       expectLength,
                       level,
                       startCodon=True,
                       type='secsig',
                       clusterMotifs=False,
                       threads=2,
                       stream=None):
    """
    finds and visualizes motifs from the sequences provided in upstreamFile

    :param upstreamFile: string
                    path to FASTA file containing upstream sequences

    :param sampleName: string
                    name to refer the sample as

    :param outAuxDir: string
                    path to aux directory

    :param outResDir: string
                    path to result directory

    :param expectLength: tuple or list
                    index-able of length 2 denoting start and end.
                    If start == end, this implies that the analysis should
                    be conducted ONLY on sequences with length == start == end, the rest are ignored.

    :param level: string
                    one of 'gene', 'family' or 'variant'

    :param startCodon: bool
                    whether or not to segregate sequences with start codon

    :param type: string
                    one of upstream analysis types: '5utr' or 'secsig'

    :param clusterMotifs: bool
                    whether or not to cluster sequences using TAMO

    :param threads: int
                    number of threads to use

    :param stream: stream
                    logging stream
    :return: None
    """
    from abseqPy.IgRepAuxiliary.seqUtils import generateMotifs

    if level == 'variant':
        # single argument identity function
        compressor = lambda signals: signals
    elif level == 'gene':
        compressor = compressSeqGeneLevel
    elif level == 'family':
        compressor = compressSeqFamilyLevel
    else:
        raise ValueError(
            "Unknown level {} requested, accepted values are family, gene, or variant"
            .format(level))

    if type not in ['secsig', '5utr']:
        raise ValueError(
            "Unknown parameter type={}, expected one of 'secsig', '5utr'".
            format(type))

    # output files always have this format: <sampleName>_<type>_<exp[0]>_<exp[1]>_*
    OUTPUT_FILE_PACKET = (sampleName, type, expectLength[0], expectLength[1])

    # only analyze motifs of secretion signals that have exactly length == expectLength[0] == expectLength[1]
    EXACT_LENGTH = expectLength[0] == expectLength[1]

    validSeqFile = os.path.join(
        outAuxDir, _VALID_SEQ_FASTA_TEMPLATE.format(*OUTPUT_FILE_PACKET))
    faultySeqFile = os.path.join(
        outAuxDir, _FAULTY_SEQ_FASTA_TEMPLATE.format(*OUTPUT_FILE_PACKET))
    noStartCodonFile = os.path.join(
        outAuxDir, _STARTCOD_SEQ_FASTA_TEMPLATE.format(*OUTPUT_FILE_PACKET))

    allFiles = [validSeqFile, faultySeqFile, noStartCodonFile]

    if all(map(os.path.exists, allFiles)):
        printto(
            stream,
            "Sequences were already analyzed at {}, loading from files instead ... ".format(
                ' '.join(allFiles)), LEVEL.WARN)

        ighvSignals, faultySeq, noStartCodonSeq = _loadIGVSeqsFromFasta(validSeqFile),\
                                                  _loadIGVSeqsFromFasta(faultySeqFile),\
                                                  _loadIGVSeqsFromFasta(noStartCodonFile)
    else:
        printto(stream, "Sequences are being analyzed ... ")
        ighvSignals, faultySeq, noStartCodonSeq = collectUpstreamSeqs(
            upstreamFile,
            sampleName,
            expectLength,
            outResDir,
            outAuxDir,
            startCodon,
            type,
            stream=stream)

    ighvSignals = compressor(ighvSignals)
    generateMotifs(ighvSignals,
                   align=(expectLength[0] < expectLength[1]),
                   outputPrefix=os.path.join(
                       outResDir, ("{}_{}_{:.0f}_{:.0f}_dna_" +
                                   level).format(*OUTPUT_FILE_PACKET)),
                   clusterMotifs=clusterMotifs,
                   threads=threads,
                   stream=stream)

    if EXACT_LENGTH and type == 'secsig':
        faultySeq = compressor(faultySeq)
        generateMotifs(faultySeq,
                       align=True,
                       outputPrefix=os.path.join(
                           outResDir, ("{}_{}_{:.0f}_{:.0f}_faulty_" +
                                       level).format(*OUTPUT_FILE_PACKET)),
                       transSeq=False,
                       extendAlphabet=True,
                       clusterMotifs=clusterMotifs,
                       threads=threads,
                       stream=stream)
        noStartCodonSeq = compressor(noStartCodonSeq)
        generateMotifs(noStartCodonSeq,
                       align=True,
                       outputPrefix=os.path.join(
                           outResDir, ("{}_{}_{:.0f}_{:.0f}_untranslated_" +
                                       level).format(*OUTPUT_FILE_PACKET)),
                       transSeq=False,
                       extendAlphabet=True,
                       clusterMotifs=clusterMotifs,
                       threads=threads,
                       stream=stream)
        generateMotifs(ighvSignals,
                       align=False,
                       outputPrefix=os.path.join(
                           outResDir, ("{}_{}_{:.0f}_{:.0f}_protein_" +
                                       level).format(*OUTPUT_FILE_PACKET)),
                       transSeq=True,
                       clusterMotifs=clusterMotifs,
                       threads=threads,
                       stream=stream)
Example 29
def collectUpstreamSeqs(upstreamFile,
                        sampleName,
                        expectLength,
                        outResDir,
                        outAuxDir,
                        startCodon=True,
                        type='secsig',
                        plotDist=True,
                        stream=None):
    """
    segregates and plots upstream file sequences. They are segregated as sequences with no start codon,
    faulty sequences (stop codon post translation if type == secsig or X or N nucleotides in the sequence),
    and valid sequences.

    :param upstreamFile: string
                        upstream FASTA file

    :param sampleName: string
                        name of sample

    :param expectLength: tuple or list
                        index-able of length 2 denoting start and end

    :param outResDir: string
                        name of result output directory

    :param outAuxDir: string
                        name of auxiliary output directory

    :param startCodon: bool
                        whether or not to care about start codons during segregation

    :param type: string
                        either 'secsig' or '5utr'

    :param plotDist: bool
                        whether or not to also save a txt and png file denoting the distribution of segregated sequences

    :param stream: stream
                        debugging stream

    :return: tuple
                        (ighvValidSignals : dict, faultySeqs : dict and noStartCodonSeqs: dict)
    """
    if type not in ['secsig', '5utr']:
        raise ValueError(
            "Unknown parameter type={}, expected one of 'secsig', '5utr'".
            format(type))

    printto(
        stream,
        "\tSequences between {} and {} are being extracted ... ".format(
            expectLength[0], expectLength[1]))

    START_CODON = "ATG"

    # valid sequences
    ighvSignals = defaultdict(list)
    ighvSignalsCounts = defaultdict(int)

    # no start codons
    ighvSignalsNoATG = defaultdict(list)
    noStartCodonCounts = defaultdict(int)

    # faulty translations
    faultyTrans = defaultdict(list)
    faultyTransCounts = defaultdict(int)

    ignoredSeqs = 0

    records = SeqIO.index(gunzip(upstreamFile), 'fasta')
    for id_ in records:
        rec = records[id_]
        ighv = rec.id.split(_UPSTREAM_SEQ_FILE_SEP)[1]
        seq = rec.seq
        if expectLength[0] <= len(rec) <= expectLength[1]:
            if not startCodon or START_CODON in seq:

                if type == 'secsig':
                    seq = seq[:len(seq) -
                              (len(seq) % 3)].translate(to_stop=False)[1:]

                if 'X' in seq or '*' in seq:
                    faultyTrans[ighv].append(rec)
                    faultyTransCounts[ighv] += 1
                elif 'N' not in rec.seq:
                    ighvSignals[ighv].append(rec)
                    ighvSignalsCounts[ighv] += 1
                else:
                    printto(stream,
                            "Ignored: " + str(rec.seq) + ' ' + str(seq))
                    if type == 'secsig':
                        faultyTrans[ighv].append(rec)
                        faultyTransCounts[ighv] += 1
            elif startCodon:
                # START_CODON not in seq
                ighvSignalsNoATG[ighv].append(rec)
                noStartCodonCounts[ighv] += 1
        else:
            ignoredSeqs += 1

    if ignoredSeqs:
        printto(
            stream,
            "\tThere are {} sequences that were ignored because the length of the provided upstream"
            "sequences were not {} <= length(upstream_seqs) <= {}".format(
                ignoredSeqs, *expectLength), LEVEL.WARN)

    if sum(ighvSignalsCounts.values()):
        flattenRecs = list(itertools.chain.from_iterable(ighvSignals.values()))
        assert len(flattenRecs) == sum(ighvSignalsCounts.values())
        title = 'Valid Secretion Signals' if type == 'secsig' else "Valid 5'-UTRs"
        printto(
            stream, "\tThere are {} {} within expected "
            "length ({} to {}) and startCodon={}".format(
                sum(ighvSignalsCounts.values()), title, expectLength[0],
                expectLength[1], startCodon), LEVEL.INFO)
        validSeqFile = os.path.join(
            outAuxDir,
            _VALID_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength))
        SeqIO.write(flattenRecs, validSeqFile, 'fasta')
        if plotDist:
            writeCountsCategoriesToFile(
                ighvSignalsCounts, sampleName,
                os.path.join(
                    outResDir, "{}_{}_{:.0f}_{:.0f}_valid_".format(
                        sampleName, type, expectLength[0], expectLength[1])),
                title)
    if sum(faultyTransCounts.values()):
        # must be a list: len() below would fail on a bare itertools.chain object
        flattenRecs = list(itertools.chain.from_iterable(faultyTrans.values()))
        assert len(flattenRecs) == sum(faultyTransCounts.values())
        faultySeqFile = os.path.join(
            outAuxDir,
            _FAULTY_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength))
        SeqIO.write(flattenRecs, faultySeqFile, 'fasta')
        if plotDist:
            writeCountsCategoriesToFile(
                faultyTransCounts, sampleName,
                os.path.join(
                    outResDir, "{}_{}_{:.0f}_{:.0f}_faulty_".format(
                        sampleName, type, *expectLength)),
                'Faulty Translations')
        printto(
            stream, "\tTotal faulty secretion signals is {} (excluded)".format(
                len(flattenRecs)), LEVEL.INFO)
        for i in random.choice(range(len(flattenRecs)),
                               min(5, len(flattenRecs)),
                               replace=False):
            sequence = flattenRecs[i].seq
            printto(
                stream, "\t{}\n\tTranslated:{}".format(
                    sequence, sequence[:len(sequence) -
                                       (len(sequence) % 3)].translate()))

    if sum(noStartCodonCounts.values()):
        flattenRecs = list(
            itertools.chain.from_iterable(ighvSignalsNoATG.values()))
        assert len(flattenRecs) == sum(noStartCodonCounts.values())
        noStartCodonFile = os.path.join(
            outAuxDir,
            _STARTCOD_SEQ_FASTA_TEMPLATE.format(sampleName, type,
                                                *expectLength))
        SeqIO.write(flattenRecs, noStartCodonFile, 'fasta')
        if plotDist:
            writeCountsCategoriesToFile(
                noStartCodonCounts, sampleName,
                os.path.join(
                    outResDir, "{}_{}_{:.0f}_{:.0f}_no_atg_".format(
                        sampleName, type, *expectLength)),
                "Upstream sequences without start codon")
        printto(
            stream,
            "\tThere is no ATG codon in {} sequences (excluded)".format(
                len(flattenRecs)), LEVEL.INFO)
        for i in random.choice(range(len(flattenRecs)),
                               min(5, len(flattenRecs)),
                               replace=False):
            printto(stream, "\t{}".format(flattenRecs[i].seq))

    # the output of each ighv key's value should be a list of strings, not SeqRecord object
    for k in ighvSignals:
        ighvSignals[k] = map(lambda x: str(x.seq), ighvSignals[k])
    for k in faultyTrans:
        faultyTrans[k] = map(lambda x: str(x.seq), faultyTrans[k])
    for k in ighvSignalsNoATG:
        ighvSignalsNoATG[k] = map(lambda x: str(x.seq), ighvSignalsNoATG[k])

    return ighvSignals, faultyTrans, ighvSignalsNoATG
Example 30
def extractCDRInfo(blastOutput, chain, stream=None):
    # Extract the top hits
    printto(stream,
            '\tExtracting top hit tables ... ' + os.path.basename(blastOutput))
    # process igblast output and extract top hit
    cloneAnnot = []
    filteredIDs = []
    line = ""

    warning = False
    # RE: parsing IGBLAST:
    # VDJ junction details MAY give N/A instead of just missing:
    # eg:
    # V-(D)-J junction details based on top germline gene matches
    # (V end, V-D junction, D region, D-J junction, J start).  Note that possible
    # overlapping nucleotides at VDJ junction (i.e, nucleotides that could
    # be assigned to either rearranging gene) are indicated in parentheses (i.e., (TACT)) but are
    # not included under the V, D, or J gene itself
    # GGGTC  TGTTCACGAGGGCATCTGTGTCCTGTTTTTAGGTTCTCCTCCC  TTTTGAC  N/A  N/A

    # it also has a variable number of hits (depending on presence of region)
    # EG:
    # V-(D)-J junction details based on top germline gene matches (V end, V-J junction, J start).  Note that possible overlapping nucleotides at VDJ junction (i.e, nucleotides that could be assigned to either rearranging gene) are indicated in parentheses (i.e., (TACT)) but are not included under the V, D, or J gene itself
    # CCTCT  N/A  GGTGT

    with open(blastOutput) as blast:
        while True:
            try:
                if not line.startswith('# Query'):
                    line = blast.readline()
                    if not line:
                        break
                    continue
                cloneRecord = createCloneRecord(chain)
                cloneRecord['queryid'] = line.split()[2].strip()
                # parse  V-(D)-J rearrangement
                line = blast.readline()
                while (line and not line.startswith('# Query')
                       and not line.startswith('# V-(D)-J rearrangement')):
                    line = blast.readline()
                if not line:
                    filteredIDs.append(cloneRecord['queryid'])
                    break
                if line.startswith('# Query'):
                    filteredIDs.append(cloneRecord['queryid'])
                    continue
                line = blast.readline().strip().split('\t')
                cloneRecord['strand'] = 'forward' if line[-1] == '+' else 'reversed'
                #                 print line, cloneRecord['strand']
                #                 sys.exit()

                # XXX: the `len(line) == 8` case can also occur for light chains when a
                # rogue D-gene hit is reported; such rows follow the heavy chain's indexing
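                # rearrangement summary columns, as implied by the indexing below
                # (the unused field 7 is IgBLAST's 'productive' flag):
                #   heavy (8 fields): V, D, J, chain type, stop codon, V-J frame, productive, strand
                #   light (7 fields): V, J, chain type, stop codon, V-J frame, productive, strand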
                if (chain == 'hv') or len(line) == 8:
                    cloneRecord['stopcodon'] = line[4]
                    cloneRecord['v-jframe'] = line[5]
                    cloneRecord['vgene'] = line[0].split(',')[0]
                    cloneRecord['dgene'] = line[1].split(',')[0]
                    cloneRecord['jgene'] = line[2].split(',')[0]
                    cloneRecord['chain'] = line[3]
                else:
                    cloneRecord['stopcodon'] = line[3]
                    cloneRecord['v-jframe'] = line[4]
                    cloneRecord['vgene'] = line[0].split(',')[0]
                    cloneRecord['jgene'] = line[1].split(',')[0]
                    cloneRecord['chain'] = line[2]
                line = ' '.join(line)

                # Parse Sub-region analysis and ignore it if there's no CDR3 hit by IGBLAST
                while line and \
                        not line.startswith("# Alignment") and \
                        not line.startswith("# Sub-region") and \
                        not line.startswith("# Query"):
                    line = blast.readline()

                # EOF
                if not line:
                    filteredIDs.append(cloneRecord['queryid'])
                    break

                # there's no # Sub-region, nor is there # Alignment.
                if line.startswith("# Query"):
                    filteredIDs.append(cloneRecord['queryid'])
                    continue

                # this implies that IGBLAST successfully classified a CDR3 sequence
                if line.startswith("# Sub-region"):
                    line = blast.readline()
                    subregionData = line.split()
                    assert subregionData[0] == 'CDR3'
                    if len(subregionData) >= 3 and subregionData[-1].isdigit() \
                            and subregionData[-2].isdigit():
                        cloneRecord['cdr3.start'] = to_int(subregionData[-2])
                        cloneRecord['cdr3.end'] = to_int(subregionData[-1])
                        # the true FR3 end is at position cdr3.start - 1
                        # (the alignment table only tells us the FR3 germline end);
                        # fr3.start always begins in the germline, so there is no separate
                        # field for it and fr3.start == fr3g.start is assumed
                        cloneRecord['fr3.end'] = cloneRecord['cdr3.start'] - 1

                # parse Alignment Summary between query and top germline V gene
                while (line and not line.startswith('# Query')
                       and not line.startswith("# Alignment")):
                    line = blast.readline()
                if not line:
                    filteredIDs.append(cloneRecord['queryid'])
                    break
                if line.startswith('# Query'):
                    filteredIDs.append(cloneRecord['queryid'])
                    continue
                line = blast.readline()
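                # alignment summary rows look like (hypothetical values):
                #   FR3-IMGT              240  287  48  46  2  0  95.8
                #   CDR3-IMGT (germline)  288  296  9   9   0  0  100.0
                # columns after the label: from, to, length, matches, mismatches, gaps, % identity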
                for i in range(1, 4):
                    # FR3/CDR3 fields carry a 'g' (germline) suffix in the record keys
                    suffix = 'g' if i == 3 else ''
                    if line.lower().startswith('fr' + str(i)):
                        line = line.split()
                        cloneRecord['fr%d.start' % i] = to_int(line[1])
                        cloneRecord['fr%d%s.end' % (i, suffix)] = to_int(line[2])
                        cloneRecord['fr%d%s.mismatches' % (i, suffix)] = to_int(line[5])
                        cloneRecord['fr%d%s.gaps' % (i, suffix)] = to_int(line[6])
                        line = blast.readline()
                    if line.lower().startswith('cdr' + str(i)):
                        # IgBLAST appends a parenthesised note to the CDR3 germline row,
                        # depending on the domain system: '(germline)' for imgt,
                        # '(V gene only)' for kabat
                        line = line.replace('(germline)', '').replace('(V gene only)', '').split()
                        cloneRecord['cdr%d%s.start' % (i, suffix)] = to_int(line[1])
                        cloneRecord['cdr%d%s.end' % (i, suffix)] = to_int(line[2])
                        cloneRecord['cdr%d%s.mismatches' % (i, suffix)] = to_int(line[5])
                        cloneRecord['cdr%d%s.gaps' % (i, suffix)] = to_int(line[6])
                        line = blast.readline()

                # if IgBLAST did not identify the CDR3 region, we can't derive the true FR3 end,
                # so we fall back to the FR3 germline end. Since cdr3.start and cdr3.end aren't
                # really used until the refinement process, no fallback is needed for them.
                if np.isnan(cloneRecord['fr3.end']):
                    cloneRecord['fr3.end'] = cloneRecord['fr3g.end']

                # parse alignment information between query and V, D and J genes
                while (line and not line.startswith('# Query')
                       and not line.startswith("# Fields")):
                    line = blast.readline()
                if not line:
                    filteredIDs.append(cloneRecord['queryid'])
                    break
                if line.startswith('# Query'):
                    filteredIDs.append(cloneRecord['queryid'])
                    continue
                line = blast.readline()
                noHits = to_int(line.split()[1])
                if noHits == 0:
                    filteredIDs.append(cloneRecord['queryid'])
                    continue
                # retrieve the top hit: parse the top V gene info first
                line = blast.readline()
                if not line.startswith("V"):
                    filteredIDs.append(cloneRecord['queryid'])
                    continue
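                # assumed hit-table column layout (tabular IgBLAST output with a
                # leading V/D/J segment tag):
                #   0: segment  1: query id  2: subject id  3: % identity  4: align length
                #   5: mismatches  6: gap opens  7: gaps  8: q.start  9: q.end
                #   10: s.start  11: s.end  12: evalue  13: bit score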
                hit = line.split()
                score = float(hit[-1])
                align = to_int(hit[4])
                sStart = to_int(hit[10])
                cloneRecord['identity'] = float(hit[3])
                cloneRecord['alignlen'] = align
                cloneRecord['bitscore'] = score
                cloneRecord['vqstart'] = to_int(hit[8])
                cloneRecord['vstart'] = sStart
                cloneRecord['vmismatches'] = to_int(hit[5])
                cloneRecord['vgaps'] = to_int(hit[7])
                # parse the top D gene info
                line = blast.readline()
                while (line and not line.startswith("# Query")
                       and not line.startswith("D")
                       and not line.startswith("J")):
                    line = blast.readline()
                if not line:
                    cloneAnnot.append(
                        convertCloneRecordToOrderedList(cloneRecord, chain))
                    break
                if line.startswith('# Query'):
                    cloneAnnot.append(
                        convertCloneRecordToOrderedList(cloneRecord, chain))
                    continue
                if line.startswith("D"):
                    hit = line.split()
                    cloneRecord['dqstart'] = to_int(hit[8])
                    cloneRecord['dqend'] = to_int(hit[9])
                    cloneRecord['dstart'] = to_int(hit[10])
                    cloneRecord['dmismatches'] = to_int(hit[5])
                    cloneRecord['dgaps'] = to_int(hit[7])
                # parse the top J gene info
                while (line and not line.startswith("# Query")
                       and not line.startswith("J")):
                    line = blast.readline()
                if not line:
                    cloneAnnot.append(
                        convertCloneRecordToOrderedList(cloneRecord, chain))
                    break
                if line.startswith('# Query'):
                    cloneAnnot.append(
                        convertCloneRecordToOrderedList(cloneRecord, chain))
                    continue
                if line.startswith("J"):
                    hit = line.split()
                    cloneRecord['jqstart'] = to_int(hit[8])
                    cloneRecord['jqend'] = to_int(hit[9])
                    cloneRecord['jstart'] = to_int(hit[10])
                    # jend is a little special, we need it for FR4 end deduction
                    cloneRecord['jend'] = to_int(hit[11])
                    cloneRecord['jmismatches'] = to_int(hit[5])
                    cloneRecord['jgaps'] = to_int(hit[7])
                cloneAnnot.append(
                    convertCloneRecordToOrderedList(cloneRecord, chain))
            except Exception:
                warning = True
                # reset the line so the outer loop advances to the next record
                # instead of re-parsing the same '# Query' line forever
                line = ""
                continue
    if len(cloneAnnot) > 0:
        # productive = no stop and in-frame
        # v-jframe: in-frame, out-of-frame, N/A (no J gene)
        # stopcodon: yes, no
        cloneAnnot = DataFrame(cloneAnnot, columns=getAnnotationFields(chain))
        cloneAnnot.set_index('queryid', drop=True, inplace=True)
    else:
        cloneAnnot = DataFrame()
    if warning:
        printto(
            stream, "WARNING: something went wrong while parsing {}".format(
                blastOutput), LEVEL.WARN)
    return cloneAnnot, filteredIDs
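
# Usage sketch (hypothetical file name; extractCDRInfo is defined above):
if __name__ == "__main__":
    cloneAnnot, filteredIDs = extractCDRInfo("sample_igblast.out", chain='hv')
    if not cloneAnnot.empty:
        print(cloneAnnot[['vgene', 'jgene', 'v-jframe', 'stopcodon']].head())
    print("{} reads could not be fully parsed".format(len(filteredIDs)))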