Example #1
import array
import os
import string
import subprocess

import pysam

import bed  # BED interval helpers used below (bed.merge / bed.write)

def run(cfg):
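    '''
    Prepare an Ion Torrent BAM for TVC and run variant calling.
    Steps as implemented below: re-attach the UMI and the cutadapt-trimmed 5'/3'
    sequence to each aligned read as soft clips, carry the BaseCaller flow-space
    tags over to the TVC input BAM, run TVC, post-process the VCF with vcflib
    (break multi-allelics, keep DP/AF/AD/VF genotype fields, reduce to primitives),
    drop low-allele-fraction primitives, and write a +/- 10 bp ROI BED for smCounter.
    cfg must provide: readSet, uBam, numCores, samtoolsMem, samtoolsDir, vcflibDir,
    roiBedFile, torrentBinDir, genomeFile.
    '''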
    # get params
    print("tvc: start...")
    readSet = cfg.readSet
    uBam = cfg.uBam
    numCpus = cfg.numCores
    samtoolsMem = cfg.samtoolsMem
    samtoolsDir = cfg.samtoolsDir
    vcflibDir = cfg.vcflibDir
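    # optional BAM tags carried over from the raw BaseCaller read to the TVC input
    # read: Ion Torrent flow-space signal tags plus the read group (RG)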
    flowTagsNeeded = set(("ZP","ZA","ZG","ZB","ZC","ZM","ZF","RG"))

    # open raw read file *.basecaller.bam
    bam = pysam.AlignmentFile(uBam, "rb", check_sq = False)

    # save BAM header in a string for later
    bamHeaderRaw = bam.text

    # open output file, and cutadapt 3' trim file
    fileout = open(readSet + ".tvc.flowtags.txt","w")
    fileIn3 = open(readSet + ".cutadapt.3.R1.txt","r")

    # create a dict of read id -> tag
    umi_dict = {}
    tag_name = "mi"
    with open(readSet +  ".umi.tag.txt","r") as IN:
        for line in IN:
            read_id, umi, umi_qual = line.strip('\n').split('\t')
            umi_dict[read_id] = umi

    print "\nDone creating readID -> UMI  dict\n"

    # merge 5' trim, 3' trim, and raw read flow tags
    for line in open(readSet + ".cutadapt.5.R1.txt","r"):
        vals5 = line.strip().split("\t")
        readId = vals5[0]
        umi = umi_dict["@"+readId]
        # spin the 3' trim file, and the raw read file forward
        while True:
            line = fileIn3.readline()
            vals3 = line.strip().split("\t")
            readId3 = vals3[0]
            read = next(bam)
            if read.query_name != readId3:
                raise Exception("3' cutadapt trim info file not in same order as raw read file")
            if readId3 == readId:
                break

        # debug check
        if readId3 != readId:
            raise Exception("3' and 5' cutadapt trim files out of sync")

        # skip to next read if 5' adapter not found
        if int(vals5[1]) == -1:
            continue

        # debug check
        if int(vals3[1]) == -1:
            raise Exception("trim of both 5' and 3' adapters expected")

        # pad to 11 fields - trailing empty fields are dropped by the strip/split above
        while len(vals3) < 11:
            vals3.append("")
        while len(vals5) < 11:
            vals5.append("")

        # avoid hassles of pysam type conversions for optional tags, by using SAM text format (might be slow)
        samVals = read.tostring(bam).split("\t")

        # get barcode region quality from raw read
        umiC = samVals[ 9][0:12]
        umiQ = samVals[10][0:12]
        if umiC != umi:
            raise Exception("unexpected barcode sync")

        # get trimmed sequences
        seq3  = vals3[5] + vals3[6]             # 6 should be empty string
        seq5  = umi  + vals5[4] + vals5[5]  # 4 should be empty string

        # get trimmed qual vals
        qual3 = vals3[9] + vals3[10]           # 10 should be empty string
        qual5 = umiQ + vals5[8] + vals5[9] #  8 should be empty string

        # debug check on read length
        readLenRaw = len(samVals[9])
        readLenTrm = len(seq5) + len(vals5[6]) + len(seq3)
        if readLenTrm != readLenRaw:
            raise Exception("Length of trimmed read not equal to raw read!")

        # get raw read flow signal tags
        outvec = [readId, seq5, qual5, seq3, qual3]
        for tag in samVals[11:]:
            if tag[0:2] in flowTagsNeeded:
                outvec.append(tag)

        # write to disk
        fileout.write("|".join(outvec))
        fileout.write("\n")

    # done
    fileout.close()
    fileIn3.close()
    bam.close()

    # sort the trimmed seq / flow tag file by read id
    cmd = "sort -k1,1 -t\| --parallel={0} {1}.tvc.flowtags.txt > {1}.tvc.flowtags.sorted.txt".format(numCpus,readSet)
    subprocess.check_call(cmd, shell=True)
    os.remove("{}.tvc.flowtags.txt".format(readSet))

    # sort oligoClip file by read id
    cmd = samtoolsDir + "samtools sort -n -m " + samtoolsMem + " -@" + numCpus \
    + " -T " + readSet \
    + " -o " + readSet + ".tvc.temp.bam " \
             + readSet + ".bam " \
    + " > "  + readSet + ".tvc.sort.log 2>&1 "
    subprocess.check_call(cmd, shell=True)

    # set up reverse complement translation table
    dnaComplementTranslation = string.maketrans("ATGC", "TACG")

    # open readId-sorted main BAM file, build header for TVC output bam
    bamIn = pysam.AlignmentFile(readSet + ".tvc.temp.bam", "rb")

    # dump BAM header to a SAM file, because the older pysam version in use cannot take header text lines directly
    headerTagsNeeded = set(["CO", "RG", "PG"])
    fileOut = open(readSet + ".tvc.header.sam", "w")
    for line in bamIn.text.split("\n"): # init with TMAP tags
        if len(line) > 3 and line[1:3] != "RG":
            fileOut.write(line)
            fileOut.write("\n")
    for line in bamHeaderRaw.split("\n"): # add Ion BaseCaller flow tags
        if line[1:3] in headerTagsNeeded:
            fileOut.write(line)
            fileOut.write("\n")
    fileOut.close()
    samHeaderOnly = pysam.AlignmentFile(readSet + ".tvc.header.sam", "r")

    # open output BAM file
    bamOut = pysam.AlignmentFile(readSet + ".tvc.bam", "wb", template=samHeaderOnly)
    samHeaderOnly.close()
    os.remove(readSet + ".tvc.header.sam")

    # make TVC input file - add hard clipped regions back as soft-clipped alignments
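    # e.g. (hypothetical) a read aligned as 100M with 17 bases trimmed on the 5' side
    # (12 bp UMI + 5 bp adapter) and 8 bases trimmed on the 3' side comes back out as
    # 17S100M8S, with the trimmed sequence and qualities restored around the alignment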
    fileIn = open(readSet + ".tvc.flowtags.sorted.txt","r")
    for read in bamIn:

        # keep only the R2 (primer side) records; skip the synthetic mate
        if not read.is_read2:
            continue

        # change back to single end
        read.is_read1 = False
        read.is_read2 = False
        read.is_paired = False
        read.mate_is_reverse = False
        read.mate_is_unmapped = False

        # get read id as it appears in the flowtag file
        readIdBam = read.query_name

        # spin the flowtag file forward (not all reads in the bam)
        readId = None
        while True:
            line = fileIn.readline()
            vals = line.strip().split("|")
            (readId, seq5, qual5, seq3, qual3) = vals[0:5]
            if readId == readIdBam:
                break

        # debug check
        if readId is None:
            raise Exception("missing read id in TVC flowtag merge")

        # handle negative strand alignment
        if read.is_reverse:
            tmp  = seq5
            seq5 = seq3
            seq3 = tmp
            seq5 = seq5[::-1]
            seq5 = seq5.translate(dnaComplementTranslation)
            seq3 = seq3[::-1]
            seq3 = seq3.translate(dnaComplementTranslation)
            tmp   = qual5[::-1]
            qual5 = qual3[::-1]
            qual3 = tmp

        # copy the cigar
        cigar = list(read.cigar)

        # add 5' trim back on
        (op, bases) = cigar[0]
        if op == 4:
            cigar[0] = (op, bases + len(seq5))
        else:
            cigar.insert(0,(4, len(seq5)))

        # add 3' trim back on
        (op, bases) = cigar[-1]
        if op == 4:
            cigar[-1] = (op, bases + len(seq3))
        else:
            cigar.append((4, len(seq3)))

        # save cigar edits
        read.cigar = cigar

        # pysam requires saving qual values first
        qual = read.qual

        # fix up the seq
        read.query_sequence = seq5 + read.query_sequence + seq3

        # fix up the quality
        read.qual = qual5 + qual + qual3

        # add flow quality tags
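        # each entry is SAM tag text, e.g. "ZM:B:s,56,0,104" (hypothetical values);
        # the type field selects the conversion applied before set_tag()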
        for tag in vals[5:]:
            (tagName,tagType,tagVal) = tag.split(":")
            if tagType == "Z":
                pass
            elif tagType == "i":
                tagVal = int(tagVal)
            elif tagType == "B":
                if tagVal.startswith("f,"):
                    tagVal = array.array("f",[float(x) for x in tagVal[2:].split(",")])
                elif tagVal.startswith("i,"):
                    tagVal = array.array("i",[int(x)   for x in tagVal[2:].split(",")])
                elif tagVal.startswith("s,"):
                    tagVal = array.array("h",[int(x)   for x in tagVal[2:].split(",")])
                else:
                    raise Exception("tvc: unexpected array subtype in flow tag: " + tag)
            else:
                raise Exception("tvc: unexpected tag type in flow tag: " + tag)
            read.set_tag(tagName,tagVal)

        # output modified read
        bamOut.write(read)

    # done
    fileIn.close()
    bamIn.close()
    bamOut.close()
    os.remove(readSet + ".tvc.temp.bam")

    # sort final TVC input bam
    cmd = samtoolsDir + "samtools sort -m " + samtoolsMem + " -@" + numCpus \
    + " -T " + readSet \
    + " -o " + readSet + ".tvc.sorted.bam " \
             + readSet + ".tvc.bam " \
    + " > "  + readSet + ".tvc.sort.log 2>&1 "
    subprocess.check_call(cmd, shell=True)

    # index final TVC input bam
    cmd = samtoolsDir + "samtools index " + readSet + ".tvc.sorted.bam"
    subprocess.check_call(cmd, shell=True)

    # run TVC
    roiBedFile = cfg.roiBedFile
    torrentBinDir     = cfg.torrentBinDir
    torrentGenomeFile = cfg.genomeFile
    torrentVcfFile = readSet + ".tvc.vcf"
    cmd = os.path.join(torrentBinDir , "tvc") + " --output-dir _TVC_ " \
     + " -n " + numCpus \
     + " -b " + readSet + ".tvc.sorted.bam" \
     + " -t " + roiBedFile \
     + " -r " + torrentGenomeFile \
     + " -o " + torrentVcfFile \
     + " --snp-min-allele-freq 0.005" \
     + " --snp-min-cov-each-strand 0 " \
     + " --snp-min-coverage 3" \
     + " --snp-min-var-coverage 2" \
     + " --snp-min-variant-score 6" \
     + " --snp-strand-bias 1" \
     + " --snp-strand-bias-pval 0" \
     + " --mnp-min-allele-freq 0.005" \
     + " --mnp-min-cov-each-strand 0" \
     + " --mnp-min-coverage 3" \
     + " --mnp-min-var-coverage 2" \
     + " --mnp-min-variant-score 6" \
     + " --mnp-strand-bias 1" \
     + " --mnp-strand-bias-pval 0" \
     + " --indel-min-allele-freq 0.05" \
     + " --indel-min-cov-each-strand 0" \
     + " --indel-min-coverage 3" \
     + " --indel-min-var-coverage 2" \
     + " --indel-min-variant-score 10" \
     + " --indel-strand-bias 1" \
     + " --indel-strand-bias-pval 0" \
     + " > " + readSet + ".tvc.log 2>&1"
    print("tvc: command line is " + cmd)
    subprocess.check_call(cmd, shell=True)
    print("tvc: done running TVC")

    # move TVC VCF to current directory
    os.rename("_TVC_/" + torrentVcfFile, torrentVcfFile)

    # run vcflib commands: split multi-allelics, drop the GT tag, reduce to primitives
    cmd = "{0}vcfbreakmulti {1}.tvc.vcf | " \
        + "{0}vcfkeepgeno - DP AF AD VF | " \
        + "{0}vcfallelicprimitives --tag-parsed AP > {1}.tvc.primitives.vcf 2> {1}.tvc.vcflib.log"
    cmd = cmd.format(vcflibDir,readSet)
    subprocess.check_call(cmd,shell=True)

    # (1) drop TVC primitive variants that have allele fraction below 0.05
    # (2) make BED file for smCounter - regions +/- 10 bp from a TVC primitive variant
    bedTvc = []
    fileout   = open(readSet + ".tvc.primitives.temp.vcf", "w")
    for line in open(readSet + ".tvc.primitives.vcf", "r"):

        # echo VCF header
        if line.startswith("#"):
            fileout.write(line)
            continue

        # parse line
        chrom, pos, varId, ref, alt, qual, filt, info, fmt, sampleId = line.strip().split("\t")

        # make sure data is as expected
        if alt.find(",") >= 0:
            raise Exception("tvc: not expecting multi-allelic variant in primitives file")

        # get left location, zero-based
        locL = int(pos) - 1

        # look for indel, include right flanking base
        altLen = len(alt)
        refLen = len(ref)
        if altLen == refLen:  # SNP or MNP
            locR = locL + refLen
            isIndel = False
        else:  # INDEL
            locR = locL + refLen + 1
            isIndel = True
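        # e.g. pos=100 ref=A alt=T gives [99,100); pos=100 ref=A alt=AT (insertion)
        # gives [99,101), so the right flanking base is included for indels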

        # get the AF tag - assumed always present
        alleleFraction = None
        for tag in info.split(";"):
            if tag.find("=") > 0:
                tagName, tagVal = tag.split("=")
                if tagName == "AF":
                    if tagVal.find(",") >= 0:
                        raise Exception("tvc: not expecting TVC primitives to be multi-allelic")
                    alleleFraction = float(tagVal)
        if alleleFraction is None:
            raise Exception("tvc: AF tag missing from TVC primitives VCF record")

        # drop TVC primitive variants with low allele fraction (stricter cutoff for indels)
        if (isIndel and alleleFraction < 0.05) or alleleFraction < 0.005:
            continue

        # echo line to new TVC VCF primitives file
        fileout.write(line)

        # save region, with 10 bp flanking
        locL = max(0,locL - 10)
        locR += 10
        if chrom == "chrM" and locR > 16569:  # horrific hack for chrM NC_012920 reference
            locR = 16569
        if locL < locR:
            bedTvc.append((chrom,locL, locR))

    # close filtered TVC VCF primitives file, rename for later use
    fileout.close()
    os.rename(readSet + ".tvc.primitives.temp.vcf", readSet + ".tvc.primitives.vcf")

    # merge BED and write to disk
    bedTvc = bed.merge(bedTvc)
    bed.write(bedTvc, readSet + ".tvc_roi.bed")
    print("tvc: done running TVC and making smCounter ROI bed")
Example #2
import bed  # BED interval helpers used below (bed.subtract / bed.merge)

def geneCov(gene, genePrimers, fragLen):
    '''
    get gene coverage using exon models and a max fragment length
    based on step01.py by John Dicarlo
    '''
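    # expected input shapes (from the code below):
    #   gene:        dict rnaId -> list of (geneName, strand, chrom, exonStart, exonEnd)
    #   genePrimers: iterable of (chrom, locDna5, locDna3, strand, primer), strand 0 = forward
    #   fragLen:     maximum fragment length in bases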

    # init BED coverage accumulators
    bedCovOneGene = []
    bedTrackSet = set()
    bedWarnings = []

    # loop over RNAs
    for rnaId in gene:
        rnaLen = 0
        bedExons = []

        # get exons
        firstExon = True
        for (geneName, strand, chrom, exonStart, exonEnd) in gene[rnaId]:
            exonStart = int(exonStart)
            exonEnd = int(exonEnd)

            if firstExon:
                geneLocL = exonStart
                geneLocR = exonEnd
                firstExon = False
            else:
                geneLocL = min(exonStart, geneLocL)
                geneLocR = max(exonEnd, geneLocR)

            rnaLen += exonEnd - exonStart
            bedExons.append((chrom, exonStart, exonEnd))

        bedExons.sort()

        # init coverage for this RNA
        bedCovOneRna = []

        # loop over primers, make RNA coverage BED tracks
        for (chrom, locDna5, locDna3, strand, primer) in genePrimers:
            locDna5 = int(locDna5)
            locDna3 = int(locDna3)
            strand = int(strand)

            # get primer RNA loc3
            exonsLen = 0
            locRna3 = None
            for (chrom, locL, locR) in bedExons:
                if locL <= locDna3 < locR:
                    locRna3 = exonsLen + locDna3 - locL
                    break
                exonsLen += (locR - locL)
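            # e.g. (hypothetical) exons [(chr1,100,200),(chr1,300,350)] and locDna3=310:
            # the first exon contributes 100 bases, so locRna3 = 100 + (310 - 300) = 110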
            # skip this primer if its 3' end does not fall within the RNA's exons
            if locRna3 is None:
                continue

            # get loc5 on RNA
            primerLen = len(primer)
            if strand == 0:
                locRna5 = locRna3 - primerLen + 1
            else:
                locRna5 = locRna3 + primerLen - 1

            # get DNA position of end of fragment
            if strand == 0:
                locRnaEnd = min(locRna5 + fragLen - 1, rnaLen - 1)
            else:
                locRnaEnd = max(locRna5 - fragLen + 1, 0)
            locL_ = 0
            locR_ = 0
            locDnaEnd = None
            for (chrom, locL, locR) in bedExons:
                locR_ += (locR - locL)
                if locL_ <= locRnaEnd < locR_:
                    locDnaEnd = locL + locRnaEnd - locL_
                    break
                locL_ = locR_
            if locDnaEnd is None:
                raise Exception("fragment end position not found within exons")

            # frag coverage region
            if strand == 0:
                bedDelete = [(chrom, geneLocL, locDna3 + 1),
                             (chrom, locDnaEnd + 1, geneLocR)]
            else:
                bedDelete = [(chrom, geneLocL, locDnaEnd),
                             (chrom, locDna3, geneLocR)]
            bedCov = bed.subtract(bedExons, bedDelete)

            # save coverage across whole RNA
            bedCovOneRna.extend(bedCov)

            # make subtraction bed for full frag, including primer
            if strand == 0:
                bedDelete = [(chrom, geneLocL, locDna5),
                             (chrom, locDnaEnd + 1, geneLocR)]
            else:
                bedDelete = [(chrom, geneLocL, locDnaEnd),
                             (chrom, locDna5 + 1, geneLocR)]

            # do bed subtraction to get enrichment frag
            bedFrag = bed.subtract(bedExons, bedDelete)
            bedFrag = bed.merge(bedFrag)  # should not do anything

            # get size of enrichment frag (might be less than fragLen at ends of RNA)
            bpFrag = sum((x[2] - x[1] for x in bedFrag))

            # convert bedFrag to a one-row bed
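            # (the fields assembled below follow the 12-column BED layout: chrom, start,
            # end, name, score, strand, thickStart, thickEnd, itemRgb, blockCount,
            # blockSizes, blockStarts)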
            bedLocL = bedFrag[0][1]
            bedLocR = bedFrag[-1][2]
            if strand == 0:
                bedStrand = "+"
                bedThickStart = locDna3 + 1
                bedThickStop = bedLocR
            else:
                bedStrand = "-"
                bedThickStart = bedLocL
                bedThickStop = locDna3
            if bedThickStart >= bedThickStop:
                bedWarnings.append(
                    (chrom, locDna5, locDna3, strand, primer, geneName, rnaId,
                     bedThickStart, bedThickStop))
            numBlocks = len(bedFrag)
            blockSizes = ",".join([str(x[2] - x[1]) for x in bedFrag])
            blockStarts = ",".join([str(x[1] - bedLocL) for x in bedFrag])
            bedScore = 0
            bedOne = (chrom, bedLocL, bedLocR, geneName, bedScore, bedStrand,
                      bedThickStart, bedThickStop, 0, numBlocks, blockSizes,
                      blockStarts)
            # bedOne = (chrom, bedLocL, bedLocR, bpFrag, bedScore, bedStrand, bedThickStart, bedThickStop, 0, numBlocks, blockSizes, blockStarts)
            bedTrackSet.add(bedOne)

        # update BED for all RNAs coverage
        bedCovOneGene.extend(bedCovOneRna)

    # post processing
    bedCovOneGene.sort()
    bedTrackSet = list(bedTrackSet)
    bedTrackSet.sort()

    return (bedCovOneGene, bedTrackSet, bedWarnings)