Example #1
import array
import os
import string
import subprocess

import pysam

import bed  # BED interval helpers used below (bed.merge / bed.write)

def run(cfg):
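    '''
    Prepare an Ion Torrent BAM for TVC and run variant calling.
    Steps as implemented below: re-attach the UMI and the cutadapt-trimmed 5'/3'
    sequence to each aligned read as soft clips, carry the BaseCaller flow-space
    tags over to the TVC input BAM, run TVC, post-process the VCF with vcflib
    (break multi-allelics, keep DP/AF/AD/VF genotype fields, reduce to primitives),
    drop low-allele-fraction primitives, and write a +/- 10 bp ROI BED for smCounter.
    cfg must provide: readSet, uBam, numCores, samtoolsMem, samtoolsDir, vcflibDir,
    roiBedFile, torrentBinDir, genomeFile.
    '''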
    # get params
    print("tvc: start...")
    readSet = cfg.readSet
    uBam = cfg.uBam
    numCpus = cfg.numCores
    samtoolsMem = cfg.samtoolsMem
    samtoolsDir = cfg.samtoolsDir
    vcflibDir = cfg.vcflibDir
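    # optional BAM tags carried over from the raw BaseCaller read to the TVC input
    # read: Ion Torrent flow-space signal tags plus the read group (RG)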
    flowTagsNeeded = set(("ZP","ZA","ZG","ZB","ZC","ZM","ZF","RG"))

    # open raw read file *.basecaller.bam
    bam = pysam.AlignmentFile(uBam, "rb", check_sq = False)

    # save BAM header in a string for later
    bamHeaderRaw = bam.text

    # open output file, and cutadapt 3' trim file
    fileout = open(readSet + ".tvc.flowtags.txt","w")
    fileIn3 = open(readSet + ".cutadapt.3.R1.txt","r")

    # create a dict of read id -> tag
    umi_dict = {}
    tag_name = "mi"
    with open(readSet +  ".umi.tag.txt","r") as IN:
        for line in IN:
            read_id, umi, umi_qual = line.strip('\n').split('\t')
            umi_dict[read_id] = umi

    print "\nDone creating readID -> UMI  dict\n"

    # merge 5' trim, 3' trim, and raw read flow tags
    for line in open(readSet + ".cutadapt.5.R1.txt","r"):
        vals5 = line.strip().split("\t")
        readId = vals5[0]
        umi = umi_dict["@"+readId]
        # spin the 3' trim file, and the raw read file forward
        while True:
            line = fileIn3.readline()
            vals3 = line.strip().split("\t")
            readId3 = vals3[0]
            read = next(bam)
            if read.query_name != readId3:
                raise Exception("3' cutadapt trim info file not in same order as raw read file")
            if readId3 == readId:
                break

        # debug check
        if readId3 != readId:
            raise Exception("3' and 5' cutadapt trim files out of sync")

        # skip to next read if 5' adapter not found
        if int(vals5[1]) == -1:
            continue

        # debug check
        if int(vals3[1]) == -1:
            raise Exception("trim of both 5' and 3' adapters expected")

        # pad to 11 fields - trailing empty fields are dropped by the strip/split above
        while len(vals3) < 11:
            vals3.append("")
        while len(vals5) < 11:
            vals5.append("")

        # avoid hassles of pysam type conversions for optional tags, by using SAM text format (might be slow)
        samVals = read.tostring(bam).split("\t")

        # get barcode region quality from raw read
        umiC = samVals[ 9][0:12]
        umiQ = samVals[10][0:12]
        if umiC != umi:
            raise Exception("unexpected barcode sync")

        # get trimmed sequences
        seq3  = vals3[5] + vals3[6]             # 6 should be empty string
        seq5  = umi  + vals5[4] + vals5[5]  # 4 should be empty string

        # get trimmed qual vals
        qual3 = vals3[9] + vals3[10]           # 10 should be empty string
        qual5 = umiQ + vals5[8] + vals5[9] #  8 should be empty string

        # debug check on read length
        readLenRaw = len(samVals[9])
        readLenTrm = len(seq5) + len(vals5[6]) + len(seq3)
        if readLenTrm != readLenRaw:
            raise Exception("Length of trimmed read not equal to raw read!")

        # get raw read flow signal tags
        outvec = [readId, seq5, qual5, seq3, qual3]
        for tag in samVals[11:]:
            if tag[0:2] in flowTagsNeeded:
                outvec.append(tag)

        # write to disk
        fileout.write("|".join(outvec))
        fileout.write("\n")

    # done
    fileout.close()
    fileIn3.close()
    bam.close()

    # sort the trimmed seq / flow tag file by read id
    cmd = "sort -k1,1 -t\| --parallel={0} {1}.tvc.flowtags.txt > {1}.tvc.flowtags.sorted.txt".format(numCpus,readSet)
    subprocess.check_call(cmd, shell=True)
    os.remove("{}.tvc.flowtags.txt".format(readSet))

    # sort oligoClip file by read id
    cmd = samtoolsDir + "samtools sort -n -m " + samtoolsMem + " -@" + numCpus \
    + " -T " + readSet \
    + " -o " + readSet + ".tvc.temp.bam " \
             + readSet + ".bam " \
    + " > "  + readSet + ".tvc.sort.log 2>&1 "
    subprocess.check_call(cmd, shell=True)

    # set up reverse complement translation table
    dnaComplementTranslation = string.maketrans("ATGC", "TACG")

    # open readId-sorted main BAM file, build header for TVC output bam
    bamIn = pysam.AlignmentFile(readSet + ".tvc.temp.bam", "rb")

    # dump BAM header to a SAM file, because the older pysam version in use cannot take header text lines directly
    headerTagsNeeded = set(["CO", "RG", "PG"])
    fileOut = open(readSet + ".tvc.header.sam", "w")
    for line in bamIn.text.split("\n"): # init with TMAP tags
        if len(line) > 3 and line[1:3] != "RG":
            fileOut.write(line)
            fileOut.write("\n")
    for line in bamHeaderRaw.split("\n"): # add Ion BaseCaller flow tags
        if line[1:3] in headerTagsNeeded:
            fileOut.write(line)
            fileOut.write("\n")
    fileOut.close()
    samHeaderOnly = pysam.AlignmentFile(readSet + ".tvc.header.sam", "r")

    # open output BAM file
    bamOut = pysam.AlignmentFile(readSet + ".tvc.bam", "wb", template=samHeaderOnly)
    samHeaderOnly.close()
    os.remove(readSet + ".tvc.header.sam")

    # make TVC input file - add hard clipped regions back as soft-clipped alignments
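    # e.g. (hypothetical) a read aligned as 100M with 17 bases trimmed on the 5' side
    # (12 bp UMI + 5 bp adapter) and 8 bases trimmed on the 3' side comes back out as
    # 17S100M8S, with the trimmed sequence and qualities restored around the alignment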
    fileIn = open(readSet + ".tvc.flowtags.sorted.txt","r")
    for read in bamIn:

        # keep only the R2 (primer side) records; skip the synthetic mate
        if not read.is_read2:
            continue

        # change back to single end
        read.is_read1 = False
        read.is_read2 = False
        read.is_paired = False
        read.mate_is_reverse = False
        read.mate_is_unmapped = False

        # get read id as it appears in the flowtag file
        readIdBam = read.query_name

        # spin the flowtag file forward (not all reads in the bam)
        readId = None
        while True:
            line = fileIn.readline()
            vals = line.strip().split("|")
            (readId, seq5, qual5, seq3, qual3) = vals[0:5]
            if readId == readIdBam:
                break

        # debug check
        if readId is None:
            raise Exception("missing read id in TVC flowtag merge")

        # handle negative strand alignment
        if read.is_reverse:
            tmp  = seq5
            seq5 = seq3
            seq3 = tmp
            seq5 = seq5[::-1]
            seq5 = seq5.translate(dnaComplementTranslation)
            seq3 = seq3[::-1]
            seq3 = seq3.translate(dnaComplementTranslation)
            tmp   = qual5[::-1]
            qual5 = qual3[::-1]
            qual3 = tmp

        # copy the cigar
        cigar = list(read.cigar)

        # add 5' trim back on
        (op, bases) = cigar[0]
        if op == 4:
            cigar[0] = (op, bases + len(seq5))
        else:
            cigar.insert(0,(4, len(seq5)))

        # add 3' trim back on
        (op, bases) = cigar[-1]
        if op == 4:
            cigar[-1] = (op, bases + len(seq3))
        else:
            cigar.append((4, len(seq3)))

        # save cigar edits
        read.cigar = cigar

        # pysam requires saving qual values first
        qual = read.qual

        # fix up the seq
        read.query_sequence = seq5 + read.query_sequence + seq3

        # fix up the quality
        read.qual = qual5 + qual + qual3

        # add flow quality tags
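        # each entry is SAM tag text, e.g. "ZM:B:s,56,0,104" (hypothetical values);
        # the type field selects the conversion applied before set_tag()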
        for tag in vals[5:]:
            (tagName,tagType,tagVal) = tag.split(":")
            if tagType == "Z":
                pass
            elif tagType == "i":
                tagVal = int(tagVal)
            elif tagType == "B":
                if tagVal.startswith("f,"):
                    tagVal = array.array("f",[float(x) for x in tagVal[2:].split(",")])
                elif tagVal.startswith("i,"):
                    tagVal = array.array("i",[int(x)   for x in tagVal[2:].split(",")])
                elif tagVal.startswith("s,"):
                    tagVal = array.array("h",[int(x)   for x in tagVal[2:].split(",")])
                else:
                    raise Exception("tvc: unexpected array subtype in flow tag: " + tag)
            else:
                raise Exception("tvc: unexpected tag type in flow tag: " + tag)
            read.set_tag(tagName,tagVal)

        # output modified read
        bamOut.write(read)

    # done
    fileIn.close()
    bamIn.close()
    bamOut.close()
    os.remove(readSet + ".tvc.temp.bam")

    # sort final TVC input bam
    cmd = samtoolsDir + "samtools sort -m " + samtoolsMem + " -@" + numCpus \
    + " -T " + readSet \
    + " -o " + readSet + ".tvc.sorted.bam " \
             + readSet + ".tvc.bam " \
    + " > "  + readSet + ".tvc.sort.log 2>&1 "
    subprocess.check_call(cmd, shell=True)

    # index final TVC input bam
    cmd = samtoolsDir + "samtools index " + readSet + ".tvc.sorted.bam"
    subprocess.check_call(cmd, shell=True)

    # run TVC
    roiBedFile = cfg.roiBedFile
    torrentBinDir     = cfg.torrentBinDir
    torrentGenomeFile = cfg.genomeFile
    torrentVcfFile = readSet + ".tvc.vcf"
    cmd = os.path.join(torrentBinDir , "tvc") + " --output-dir _TVC_ " \
     + " -n " + numCpus \
     + " -b " + readSet + ".tvc.sorted.bam" \
     + " -t " + roiBedFile \
     + " -r " + torrentGenomeFile \
     + " -o " + torrentVcfFile \
     + " --snp-min-allele-freq 0.005" \
     + " --snp-min-cov-each-strand 0 " \
     + " --snp-min-coverage 3" \
     + " --snp-min-var-coverage 2" \
     + " --snp-min-variant-score 6" \
     + " --snp-strand-bias 1" \
     + " --snp-strand-bias-pval 0" \
     + " --mnp-min-allele-freq 0.005" \
     + " --mnp-min-cov-each-strand 0" \
     + " --mnp-min-coverage 3" \
     + " --mnp-min-var-coverage 2" \
     + " --mnp-min-variant-score 6" \
     + " --mnp-strand-bias 1" \
     + " --mnp-strand-bias-pval 0" \
     + " --indel-min-allele-freq 0.05" \
     + " --indel-min-cov-each-strand 0" \
     + " --indel-min-coverage 3" \
     + " --indel-min-var-coverage 2" \
     + " --indel-min-variant-score 10" \
     + " --indel-strand-bias 1" \
     + " --indel-strand-bias-pval 0" \
     + " > " + readSet + ".tvc.log 2>&1"
    print("tvc: command line is " + cmd)
    subprocess.check_call(cmd, shell=True)
    print("tvc: done running TVC")

    # move TVC VCF to current directory
    os.rename("_TVC_/" + torrentVcfFile, torrentVcfFile)

    # run vcflib commands: split multi-allelics, drop the GT tag, reduce to primitives
    cmd = "{0}vcfbreakmulti {1}.tvc.vcf | " \
        + "{0}vcfkeepgeno - DP AF AD VF | " \
        + "{0}vcfallelicprimitives --tag-parsed AP > {1}.tvc.primitives.vcf 2> {1}.tvc.vcflib.log"
    cmd = cmd.format(vcflibDir,readSet)
    subprocess.check_call(cmd,shell=True)

    # (1) drop TVC primitive variants that have allele fraction below 0.05
    # (2) make BED file for smCounter - regions +/- 10 bp from a TVC primitive variant
    bedTvc = []
    fileout   = open(readSet + ".tvc.primitives.temp.vcf", "w")
    for line in open(readSet + ".tvc.primitives.vcf", "r"):

        # echo VCF header
        if line.startswith("#"):
            fileout.write(line)
            continue

        # parse line
        chrom, pos, varId, ref, alt, qual, filt, info, fmt, sampleId = line.strip().split("\t")

        # make sure data is as expected
        if alt.find(",") >= 0:
            raise Exception("tvc: not expecting multi-allelic variant in primitives file")

        # get left location, zero-based
        locL = int(pos) - 1

        # look for indel, include right flanking base
        altLen = len(alt)
        refLen = len(ref)
        if altLen == refLen:  # SNP or MNP
            locR = locL + refLen
            isIndel = False
        else:  # INDEL
            locR = locL + refLen + 1
            isIndel = True
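        # e.g. pos=100 ref=A alt=T gives [99,100); pos=100 ref=A alt=AT (insertion)
        # gives [99,101), so the right flanking base is included for indels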

        # get the AF tag - assumed always present
        alleleFraction = None
        for tag in info.split(";"):
            if tag.find("=") > 0:
                tagName, tagVal = tag.split("=")
                if tagName == "AF":
                    if tagVal.find(",") >= 0:
                        raise Exception("tvc: not expecting TVC primitives to be multi-allelic")
                    alleleFraction = float(tagVal)
        if alleleFraction is None:
            raise Exception("tvc: AF tag missing from TVC primitives VCF record")

        # drop TVC primitive variants with low allele fraction (stricter cutoff for indels)
        if (isIndel and alleleFraction < 0.05) or alleleFraction < 0.005:
            continue

        # echo line to new TVC VCF primitives file
        fileout.write(line)

        # save region, with 10 bp flanking
        locL = max(0,locL - 10)
        locR += 10
        if chrom == "chrM" and locR > 16569:  # horrific hack for chrM NC_012920 reference
            locR = 16569
        if locL < locR:
            bedTvc.append((chrom,locL, locR))

    # close filtered TVC VCF primitives file, rename for later use
    fileout.close()
    os.rename(readSet + ".tvc.primitives.temp.vcf", readSet + ".tvc.primitives.vcf")

    # merge BED and write to disk
    bedTvc = bed.merge(bedTvc)
    bed.write(bedTvc, readSet + ".tvc_roi.bed")
    print("tvc: done running TVC and making smCounter ROI bed")
Example #2
import bed  # BED interval helpers used below (bed.subtract / bed.merge)

def geneCov(gene, genePrimers, fragLen):
    '''
    get gene coverage using exon models and a max fragment length
    based on step01.py by John Dicarlo
    '''
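    # expected input shapes (from the code below):
    #   gene:        dict rnaId -> list of (geneName, strand, chrom, exonStart, exonEnd)
    #   genePrimers: iterable of (chrom, locDna5, locDna3, strand, primer), strand 0 = forward
    #   fragLen:     maximum fragment length in bases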

    # init BED coverage accumulators
    bedCovOneGene = []
    bedTrackSet = set()
    bedWarnings = []

    # loop over RNAs
    for rnaId in gene:
        rnaLen = 0
        bedExons = []

        # get exons
        firstExon = True
        for (geneName, strand, chrom, exonStart, exonEnd) in gene[rnaId]:
            exonStart = int(exonStart)
            exonEnd = int(exonEnd)

            if firstExon:
                geneLocL = exonStart
                geneLocR = exonEnd
                firstExon = False
            else:
                geneLocL = min(exonStart, geneLocL)
                geneLocR = max(exonEnd, geneLocR)

            rnaLen += exonEnd - exonStart
            bedExons.append((chrom, exonStart, exonEnd))

        bedExons.sort()

        # init coverage for this RNA
        bedCovOneRna = []

        # loop over primers, make RNA coverage BED tracks
        for (chrom, locDna5, locDna3, strand, primer) in genePrimers:
            locDna5 = int(locDna5)
            locDna3 = int(locDna3)
            strand = int(strand)

            # get primer RNA loc3
            exonsLen = 0
            locRna3 = None
            for (chrom, locL, locR) in bedExons:
                if locL <= locDna3 < locR:
                    locRna3 = exonsLen + locDna3 - locL
                    break
                exonsLen += (locR - locL)
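            # e.g. (hypothetical) exons [(chr1,100,200),(chr1,300,350)] and locDna3=310:
            # the first exon contributes 100 bases, so locRna3 = 100 + (310 - 300) = 110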
            # skip this primer if its 3' end does not fall within the RNA's exons
            if locRna3 is None:
                continue

            # get loc5 on RNA
            primerLen = len(primer)
            if strand == 0:
                locRna5 = locRna3 - primerLen + 1
            else:
                locRna5 = locRna3 + primerLen - 1

            # get DNA position of end of fragment
            if strand == 0:
                locRnaEnd = min(locRna5 + fragLen - 1, rnaLen - 1)
            else:
                locRnaEnd = max(locRna5 - fragLen + 1, 0)
            locL_ = 0
            locR_ = 0
            locDnaEnd = None
            for (chrom, locL, locR) in bedExons:
                locR_ += (locR - locL)
                if locL_ <= locRnaEnd < locR_:
                    locDnaEnd = locL + locRnaEnd - locL_
                    break
                locL_ = locR_
            if locDnaEnd is None:
                raise Exception("fragment end position not found within exons")

            # frag coverage region
            if strand == 0:
                bedDelete = [(chrom, geneLocL, locDna3 + 1),
                             (chrom, locDnaEnd + 1, geneLocR)]
            else:
                bedDelete = [(chrom, geneLocL, locDnaEnd),
                             (chrom, locDna3, geneLocR)]
            bedCov = bed.subtract(bedExons, bedDelete)

            # save coverage across whole RNA
            bedCovOneRna.extend(bedCov)

            # make subtraction bed for full frag, including primer
            if strand == 0:
                bedDelete = [(chrom, geneLocL, locDna5),
                             (chrom, locDnaEnd + 1, geneLocR)]
            else:
                bedDelete = [(chrom, geneLocL, locDnaEnd),
                             (chrom, locDna5 + 1, geneLocR)]

            # do bed subtraction to get enrichment frag
            bedFrag = bed.subtract(bedExons, bedDelete)
            bedFrag = bed.merge(bedFrag)  # should not do anything

            # get size of enrichment frag (might be less than fragLen at ends of RNA)
            bpFrag = sum((x[2] - x[1] for x in bedFrag))

            # convert bedFrag to a one-row bed
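            # (the fields assembled below follow the 12-column BED layout: chrom, start,
            # end, name, score, strand, thickStart, thickEnd, itemRgb, blockCount,
            # blockSizes, blockStarts)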
            bedLocL = bedFrag[0][1]
            bedLocR = bedFrag[-1][2]
            if strand == 0:
                bedStrand = "+"
                bedThickStart = locDna3 + 1
                bedThickStop = bedLocR
            else:
                bedStrand = "-"
                bedThickStart = bedLocL
                bedThickStop = locDna3
            if bedThickStart >= bedThickStop:
                bedWarnings.append(
                    (chrom, locDna5, locDna3, strand, primer, geneName, rnaId,
                     bedThickStart, bedThickStop))
            numBlocks = len(bedFrag)
            blockSizes = ",".join([str(x[2] - x[1]) for x in bedFrag])
            blockStarts = ",".join([str(x[1] - bedLocL) for x in bedFrag])
            bedScore = 0
            bedOne = (chrom, bedLocL, bedLocR, geneName, bedScore, bedStrand,
                      bedThickStart, bedThickStop, 0, numBlocks, blockSizes,
                      blockStarts)
            # bedOne = (chrom, bedLocL, bedLocR, bpFrag, bedScore, bedStrand, bedThickStart, bedThickStop, 0, numBlocks, blockSizes, blockStarts)
            bedTrackSet.add(bedOne)

        # update BED for all RNAs coverage
        bedCovOneGene.extend(bedCovOneRna)

    # post processing
    bedCovOneGene.sort()
    bedTrackSet = list(bedTrackSet)
    bedTrackSet.sort()

    return (bedCovOneGene, bedTrackSet, bedWarnings)