def main(args):
    """Merge the read-pair and BED files of two earlier runs into one output.

    Reads the config file found in each of the two input directories
    (``args.inDir1`` and ``args.inDir2``, both under ``args.outDirName``),
    merges their ``.readpairs.txt`` and ``.reads.bed`` files into the
    ``args.outBaseName`` directory and writes a combined config file there.

    Args:
        args: parsed command-line options. This function reads
            ``outDirName``, ``outBaseName``, ``inDir1``, ``inDir2``,
            ``configFileName`` and ``overwrite``.
    """
    outPrefix = args.outDirName + "/" + args.outBaseName
    inPrefix1 = args.outDirName + "/" + args.inDir1
    inPrefix2 = args.outDirName + "/" + args.inDir2

    # debugging info
    # NOTE(review): basicConfig opens the log file right away, i.e. before
    # prepOutDir() below has run -- presumably the logs/ directory already
    # exists at this point; confirm.
    logfile = "%s/logs/%d.%s.mergepairs.log" % (outPrefix, os.getpid(), args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile, level=logging.DEBUG)

    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s"
                 % (args.inDir1, args.inDir2, args.outBaseName, args.configFileName))

    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPrefix1 + "/" + args.configFileName
    configPath2 = inPrefix2 + "/" + args.configFileName
    pickreads.checkfile(configPath1)
    pickreads.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, args.outDirName)

    # Only the first input's insert size / read length are used for the
    # merge distance.
    # NOTE(review): assumes both inputs share these settings -- confirm.
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])

    # merge readfiles
    outReadFileName = outPrefix + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPrefix1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPrefix2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)" % (readFileName1, readFileName2))
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist)

    # merge bedfiles
    outBedFileName = outPrefix + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPrefix1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPrefix2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)" % (bedFileName1, bedFileName2))
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist)

    # write new config file
    # The first input's config is updated in place and used as the merged
    # config; configDict1 is not needed for anything else after this point.
    configPath = outPrefix + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    # the single-input key is replaced by the numbered keys set above
    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as configFile:
        for key, value in configDict.items():
            configFile.write("%s=%s\n" % (key, value))
Exemple #2
0
def main(args):
    """Merge the read-pair files of every sample named in a sample list.

    Every non-blank, non-comment line of ``args.sampleListFile`` must hold
    three whitespace-separated fields (bam file, sample sub-directory,
    reference genome); only the sub-directory is used here.  The config file
    of each sample is read, the per-sample ``.readpairs.txt`` files are
    merged into the ``args.outBaseName`` directory and a combined config
    file (based on the last sample's config) is written there.

    Args:
        args: parsed command-line options. This function reads
            ``outDirName``, ``outBaseName``, ``sampleListFile``,
            ``configFileName`` and ``overwrite``.

    Raises:
        ValueError: if a sample line does not have exactly three fields or
            the sample list contains no samples at all.
    """
    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    pickreads.checkfile(args.sampleListFile)

    readFileNames = []
    bamFileNames = []
    insertSizes = []
    readLengths = []
    lastConfig = None

    # 'with' closes the sample list even when a sample fails validation
    with open(args.sampleListFile, 'r') as sampleList:
        for sampleLine in sampleList:
            entry = sampleLine.strip()
            # skip blank lines and comments
            if not entry or entry.startswith("#"):
                continue

            fields = entry.split()
            if len(fields) != 3:
                raise ValueError("malformed sample line (expected 3 fields): %r" % sampleLine)
            sampleSubDir = fields[1]

            peakparser.checkOutDir(sampleSubDir, args.outDirName)
            samplePrefix = args.outDirName + "/" + sampleSubDir
            sampleConfigPath = samplePrefix + "/" + args.configFileName

            pickreads.checkfile(sampleConfigPath)
            sampleConfig = peakparser.readConfig(sampleConfigPath, sampleSubDir, args.outDirName)
            lastConfig = sampleConfig

            insertSizes.append(int(sampleConfig['insertSize']))
            readLengths.append(int(sampleConfig['readLength']))

            readFileNames.append(samplePrefix + "/" + sampleSubDir + ".readpairs.txt")
            bamFileNames.append(sampleConfig['bamFileName'])

    if lastConfig is None:
        # without this check the failure would be an opaque max() error
        raise ValueError("no samples found in sample list: %s" % args.sampleListFile)

    # use the largest insert size / read length seen across all samples
    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outPrefix = args.outDirName + "/" + args.outBaseName
    outReadFileName = outPrefix + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist)

    # write new config file
    # The last sample's config is updated in place and used as the merged
    # config.
    configPath = outPrefix + "/" + args.configFileName
    configDict = lastConfig

    # one numbered key per input bam file, starting at bamFileName0
    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName

    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    # the single-input key is replaced by the numbered keys set above
    del configDict['bamFileName']

    with open(configPath, 'w') as configFile:
        for key, value in configDict.items():
            configFile.write("%s=%s\n" % (key, value))
Exemple #3
0
def main(args):
    """Group the read pairs of one run into peaks per gene and report them.

    Reads the run's config file, parses every line of the read-pair file
    named there into a ``pairinfo.Pair``, collects the pairs into
    ``pairinfo.Peak`` objects (one ``PeakBuilder`` per gene name, a new peak
    whenever the peak index changes) and hands the result to
    ``longOutput``.

    Args:
        args: parsed command-line options. This function reads
            ``outDirName``, ``outBaseName``, ``configFileName`` and
            ``pgTabixFile``.
    """
    checkOutDir(args.outBaseName, args.outDirName)
    outPrefix = args.outDirName + "/" + args.outBaseName
    configPath = outPrefix + "/" + args.configFileName
    pickreads.checkfile(configPath)
    configDict = readConfig(configPath, args.outBaseName, args.outDirName)

    # debugging info
    logfile = "%s/logs/%d.%s.peakparser.log" % (outPrefix, os.getpid(), args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)

    # log parameters (the second value is the read-pair file name)
    logging.info("\noutBaseName=%s\nreadFileName=%s\nexonTabixFile=%s" %
                 (args.outBaseName, configDict['readFileName'],
                  configDict['exonTabixFile']))

    pickreads.checkfile(configDict['readFileName'])
    pickreads.checkfile(configDict['exonTabixFile'])
    pickreads.checkfile(args.pgTabixFile)

    # NOTE(review): tabixFile is not used again in this function; the open is
    # kept because it may serve as an early check of the exon index -- confirm.
    tabixFile = pysam.Tabixfile(configDict['exonTabixFile'], 'r')
    pgTabix = pysam.Tabixfile(args.pgTabixFile, 'r')

    # lookup table of the contig names present in the pg tabix file
    pgTabixContigs = dict.fromkeys(pgTabix.contigs, 1)

    # structure for keeping element classes separate
    geneNameDict = {}

    # 'with' closes the read-pair file even if a line fails to parse
    with open(configDict['readFileName']) as readFile:
        for line in readFile:
            if not line.strip():
                # tolerate blank lines instead of failing on fields[1]
                continue

            fields = line.split("\t")
            chrom = fields[0]
            readPos = int(fields[1])
            readStrand = fields[2]
            mateElt = fields[3]
            mateChrom = fields[4]
            mateStrand = fields[5]
            matePos = int(fields[6])
            eltStart = int(fields[7])
            eltEnd = int(fields[8])
            eltStrand = fields[9]
            eltFullLen = int(fields[10])
            genomeName = fields[11]
            peakIndex = fields[12]
            geneName = fields[13].strip()

            if geneName not in geneNameDict:
                geneNameDict[geneName] = PeakBuilder(geneName)
            builder = geneNameDict[geneName]

            pair = pairinfo.Pair(chrom, readPos, readStrand, mateElt, mateChrom,
                                 mateStrand, matePos, eltStart, eltEnd, eltStrand,
                                 eltFullLen, genomeName, peakIndex, geneName)

            # a change of peak index closes the current peak and starts a new one
            if builder.lastPeakNum != peakIndex:
                builder.peaks.append(builder.peak)
                builder.peak = pairinfo.Peak()
                builder.lastPeakNum = peakIndex
            builder.peak.addpair(pair)

    # flush the peak still under construction for every gene
    for builder in geneNameDict.values():
        builder.peaks.append(builder.peak)

    logging.info("starting long output...")
    longOutput(geneNameDict, args.outBaseName, args.outDirName, pgTabix,
               pgTabixContigs)
Exemple #4
0
def main(args):
    """Merge the read-pair and BED files of two earlier runs into one output.

    Reads the config file found in each of the two input directories
    (``args.inDir1`` and ``args.inDir2``, both under ``args.outDirName``),
    merges their ``.readpairs.txt`` and ``.reads.bed`` files into the
    ``args.outBaseName`` directory and writes a combined config file there.

    Args:
        args: parsed command-line options. This function reads
            ``outDirName``, ``outBaseName``, ``inDir1``, ``inDir2``,
            ``configFileName`` and ``overwrite``.
    """
    outPrefix = args.outDirName + "/" + args.outBaseName
    inPrefix1 = args.outDirName + "/" + args.inDir1
    inPrefix2 = args.outDirName + "/" + args.inDir2

    # debugging info
    # NOTE(review): basicConfig opens the log file right away, i.e. before
    # prepOutDir() below has run -- presumably the logs/ directory already
    # exists at this point; confirm.
    logfile = "%s/logs/%d.%s.mergepairs.log" % (outPrefix, os.getpid(),
                                                args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)

    logging.info(
        "\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s" %
        (args.inDir1, args.inDir2, args.outBaseName, args.configFileName))

    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPrefix1 + "/" + args.configFileName
    configPath2 = inPrefix2 + "/" + args.configFileName
    pickreads.checkfile(configPath1)
    pickreads.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1,
                                        args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2,
                                        args.outDirName)

    # Only the first input's insert size / read length are used for the
    # merge distance.
    # NOTE(review): assumes both inputs share these settings -- confirm.
    maxDist = int(
        configDict1['insertSize']) + 2 * int(configDict1['readLength'])

    # merge readfiles
    outReadFileName = outPrefix + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPrefix1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPrefix2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)" % (readFileName1, readFileName2))
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist)

    # merge bedfiles
    outBedFileName = outPrefix + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPrefix1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPrefix2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)" % (bedFileName1, bedFileName2))
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist)

    # write new config file
    # The first input's config is updated in place and used as the merged
    # config; configDict1 is not needed for anything else after this point.
    configPath = outPrefix + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    # the single-input key is replaced by the numbered keys set above
    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as configFile:
        for key, value in configDict.items():
            configFile.write("%s=%s\n" % (key, value))
def main(args):
    """Group the read pairs of one run into peaks per gene and report them.

    Reads the run's config file, parses every line of the read-pair file
    named there into a ``pairinfo.Pair``, collects the pairs into
    ``pairinfo.Peak`` objects (one ``PeakBuilder`` per gene name, a new peak
    whenever the peak index changes) and hands the result to
    ``longOutput``.

    Args:
        args: parsed command-line options. This function reads
            ``outDirName``, ``outBaseName``, ``configFileName`` and
            ``pgTabixFile``.
    """
    checkOutDir(args.outBaseName,args.outDirName)
    outPrefix = args.outDirName + "/" + args.outBaseName
    configPath = outPrefix + "/" + args.configFileName
    pickreads.checkfile(configPath)
    configDict = readConfig(configPath,args.outBaseName,args.outDirName)

    # debugging info
    logfile = "%s/logs/%d.%s.peakparser.log" % (outPrefix,os.getpid(),args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s',filename=logfile,level=logging.DEBUG)

    # log parameters (the second value is the read-pair file name)
    logging.info("\noutBaseName=%s\nreadFileName=%s\nexonTabixFile=%s"
                 % (args.outBaseName,configDict['readFileName'],configDict['exonTabixFile']))

    pickreads.checkfile(configDict['readFileName'])
    pickreads.checkfile(configDict['exonTabixFile'])
    pickreads.checkfile(args.pgTabixFile)

    # NOTE(review): tabixFile is not used again in this function; the open is
    # kept because it may serve as an early check of the exon index -- confirm.
    tabixFile = pysam.Tabixfile(configDict['exonTabixFile'], 'r')
    pgTabix = pysam.Tabixfile(args.pgTabixFile, 'r')

    # lookup table of the contig names present in the pg tabix file
    pgTabixContigs = dict.fromkeys(pgTabix.contigs, 1)

    # structure for keeping element classes separate
    geneNameDict = {}

    # 'with' closes the read-pair file even if a line fails to parse
    with open(configDict['readFileName']) as readFile:
        for line in readFile:
            if not line.strip():
                # tolerate blank lines instead of failing on fields[1]
                continue

            fields = line.split("\t")
            chrom      = fields[0]
            readPos    = int(fields[1])
            readStrand = fields[2]
            mateElt    = fields[3]
            mateChrom  = fields[4]
            mateStrand = fields[5]
            matePos    = int(fields[6])
            eltStart   = int(fields[7])
            eltEnd     = int(fields[8])
            eltStrand  = fields[9]
            eltFullLen = int(fields[10])
            genomeName = fields[11]
            peakIndex  = fields[12]
            geneName   = fields[13].strip()

            if geneName not in geneNameDict:
                geneNameDict[geneName] = PeakBuilder(geneName)
            builder = geneNameDict[geneName]

            pair = pairinfo.Pair(chrom,readPos,readStrand,mateElt,mateChrom,
                                 mateStrand,matePos,eltStart,eltEnd,eltStrand,
                                 eltFullLen,genomeName,peakIndex,geneName)

            # a change of peak index closes the current peak and starts a new one
            if builder.lastPeakNum != peakIndex:
                builder.peaks.append(builder.peak)
                builder.peak = pairinfo.Peak()
                builder.lastPeakNum = peakIndex
            builder.peak.addpair(pair)

    # flush the peak still under construction for every gene
    for builder in geneNameDict.values():
        builder.peaks.append(builder.peak)

    logging.info("starting long output...")
    longOutput(geneNameDict,args.outBaseName,args.outDirName,pgTabix,pgTabixContigs)