def main(args):
    """Merge the read-pair and BED files of two input runs into a new run.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``inDir1``, ``inDir2``, ``configFileName`` and ``overwrite``.

    Both inputs are expected under ``<outDirName>/<inDirN>/``.  The merged
    ``.readpairs.txt`` and ``.reads.bed`` files, plus a new config file, are
    written under ``<outDirName>/<outBaseName>/``.  The merge distance and
    the element-length table are taken from the first input's config only.
    """
    outDir = args.outDirName + "/" + args.outBaseName
    inPath1 = args.outDirName + "/" + args.inDir1
    inPath2 = args.outDirName + "/" + args.inDir2

    # debugging info
    # NOTE(review): the log file is configured before prepOutDir() runs, so
    # the logs/ directory presumably has to exist already -- confirm.
    logfile = outDir + "/logs/%d" % os.getpid() + "." + args.outBaseName + ".mergepairs.log"
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)

    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s",
                 args.inDir1, args.inDir2, args.outBaseName, args.configFileName)

    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPath1 + "/" + args.configFileName
    configPath2 = inPath2 + "/" + args.configFileName
    discordant.checkfile(configPath1)
    discordant.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, args.outDirName)

    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])
    eltLenDict = discordant.makeEltLenDict(configDict1['eltLenFileName'])

    # merge readfiles
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPath1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPath2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)", readFileName1, readFileName2)
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist, eltLenDict)

    # merge bedfiles
    outBedFileName = outDir + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPath1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPath2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)", bedFileName1, bedFileName2)
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist, eltLenDict)

    # write new config file: the first input's config is reused (and mutated
    # in place), with the single 'bamFileName' replaced by one entry per input
    configPath = outDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write(k + "=" + v + "\n")
Exemple #2
0
def main(args):
    """Merge the read-pair files of every sample in a sample list file.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``overwrite``, ``sampleListFile`` and ``configFileName``.

    Each non-comment line of the sample list must hold three
    whitespace-separated columns (bam, sample sub-directory, reference
    genome); only the sub-directory is used here.  Lines starting with ``#``
    and blank lines are skipped.  The merged config is based on the last
    sample's config, with one ``bamFileName<i>`` entry per sample.

    Raises:
        ValueError: if a sample line does not have exactly three columns, or
            if the list contains no samples at all.
    """
    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    pickreads.checkfile(args.sampleListFile)

    readFileNames = []
    bamFileNames = []
    insertSizes = []
    readLengths = []
    lastConfig = None

    with open(args.sampleListFile, 'r') as sampleList:
        for sampleLine in sampleList:
            fields = sampleLine.strip()
            # skip comments and blank lines (a trailing empty line would
            # otherwise break the three-column unpack below)
            if sampleLine.startswith("#") or not fields:
                continue

            (_sampleBam, sampleSubDir, _refGenome) = fields.split()
            peakparser.checkOutDir(sampleSubDir, args.outDirName)
            sampleDir = args.outDirName + "/" + sampleSubDir
            configPath = sampleDir + "/" + args.configFileName

            pickreads.checkfile(configPath)
            configDict = peakparser.readConfig(configPath, sampleSubDir, args.outDirName)
            lastConfig = configDict

            insertSizes.append(int(configDict['insertSize']))
            readLengths.append(int(configDict['readLength']))

            readFileNames.append(sampleDir + "/" + sampleSubDir + ".readpairs.txt")
            bamFileNames.append(configDict['bamFileName'])

    if lastConfig is None:
        # same exception type max() would raise on an empty list, but with a
        # message that points at the actual problem
        raise ValueError("no samples found in " + args.sampleListFile)

    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outDir = args.outDirName + "/" + args.outBaseName
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist)

    # write new config file, starting from the last sample's config
    configPath = outDir + "/" + args.configFileName
    configDict = lastConfig

    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName

    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    del configDict['bamFileName']

    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write(k + "=" + v + "\n")
Exemple #3
0
def main(args):
    """Categorise valid insertions from the debug file and annotate each set.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName``, ``annotDir``, ``eltFile``, ``allowOverlap``,
    ``minPeakSize`` and ``printout``.

    A debug-file line is considered when it contains ``valid=1`` and either
    contains ``olClN=0`` or ``args.allowOverlap`` is truthy.  Its
    space-separated ``key=value`` columns are kept when ``np`` is at least
    ``args.minPeakSize`` and ``pME`` is listed in ``args.eltFile``.
    Insertions with more than one source go to the germline set; single-source
    ones are split on whether the source name contains ``CANCER`` or
    ``NORMAL``, everything else goes to "other".  Each set is passed to
    ``annotatePos`` with its own output path.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    discordant.checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    discordant.checkfile(debugFile)

    # can filter on retroelement subfamilies (one name per line)
    discordant.checkfile(args.eltFile)
    with open(args.eltFile, 'r') as f:
        eltNames = set(elt.strip() for elt in f)

    minPeakSize = int(args.minPeakSize)

    germlineIns = []
    cancerIns = []
    normalIns = []
    otherIns = []

    with open(debugFile, 'r') as f:
        for line in f:  # loop over lines
            if 'valid=1' not in line:
                continue
            if not ('olClN=0' in line or args.allowOverlap):
                continue

            insdata = {}
            for c in line.rstrip().split(' '):  # loop over columns
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < minPeakSize or insdata['pME'] not in eltNames:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.setClassIndex(insdata['index'])
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']

            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif re.search('CANCER', insSum.sources[0]):
                cancerIns.append(insSum)
            elif re.search('NORMAL', insSum.sources[0]):
                normalIns.append(insSum)
            else:
                otherIns.append(insSum)

    outputs = (
        (germlineIns, "/germline.tab.txt"),
        (cancerIns, "/canceronly.tab.txt"),
        (normalIns, "/normalonly.tab.txt"),
        (otherIns, "/other.tab.txt"),
    )
    for insertions, suffix in outputs:
        annotatePos(insertions, refGenome, args.annotDir, outDir + suffix, args.printout)
def main(args):
    """Categorise valid insertions from the debug file and annotate each set.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName``, ``annotDir``, ``printout`` and, optionally,
    ``minPeakSize`` (defaults to 8 when the attribute is absent).

    A debug-file line is considered when it contains ``valid=1``.  Its
    space-separated ``key=value`` columns are kept when ``np`` is at least
    the minimum peak size and ``nEx`` is greater than 1.  Insertions with
    more than one source go to the germline set; single-source ones are split
    on whether the source name contains ``CANCER`` or ``NORMAL``, everything
    else is uncategorized.  Each set is passed to ``annotatePos`` with its
    own output path.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    checkfile(debugFile)

    # threshold used to be hard-coded; 8 remains the default when the caller
    # does not supply args.minPeakSize
    minPeakSize = int(getattr(args, 'minPeakSize', 8))

    germlineIns = []
    cancerIns = []
    normalIns = []
    uncatIns = []

    with open(debugFile, 'r') as f:
        for line in f:  # loop over lines
            if 'valid=1' not in line:
                continue

            insdata = {}
            for c in line.rstrip().split(' '):  # loop over columns
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < minPeakSize or int(insdata['nEx']) <= 1:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.index = insdata['index']
            insSum.priGene = insdata['pGene']
            insSum.exonStr = insdata['exT']
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']
            insSum.pseudo = insdata['pgO']

            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif re.search('CANCER', insSum.sources[0]):
                cancerIns.append(insSum)
            elif re.search('NORMAL', insSum.sources[0]):
                normalIns.append(insSum)
            else:
                uncatIns.append(insSum)

    outputs = (
        (germlineIns, "/germline.tab.txt"),
        (cancerIns, "/canceronly.tab.txt"),
        (normalIns, "/normalonly.tab.txt"),
        (uncatIns, "/uncategorized.tab.txt"),
    )
    for insertions, suffix in outputs:
        annotatePos(insertions, refGenome, outDir + suffix, args.annotDir, args.printout)
Exemple #5
0
def main(args):
    """Merge the read-pair and BED files of two input runs into a new run.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``inDir1``, ``inDir2``, ``configFileName`` and ``overwrite``.

    Both inputs are expected under ``<outDirName>/<inDirN>/``.  The merged
    ``.readpairs.txt`` and ``.reads.bed`` files, plus a new config file, are
    written under ``<outDirName>/<outBaseName>/``.  The merge distance is
    derived from the first input's config only.
    """
    outDir = args.outDirName + "/" + args.outBaseName
    inPath1 = args.outDirName + "/" + args.inDir1
    inPath2 = args.outDirName + "/" + args.inDir2

    # debugging info
    # NOTE(review): the log file is configured before prepOutDir() runs, so
    # the logs/ directory presumably has to exist already -- confirm.
    logfile = outDir + "/logs/%d" % os.getpid() + "." + args.outBaseName + ".mergepairs.log"
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)

    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s",
                 args.inDir1, args.inDir2, args.outBaseName, args.configFileName)

    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPath1 + "/" + args.configFileName
    configPath2 = inPath2 + "/" + args.configFileName
    pickreads.checkfile(configPath1)
    pickreads.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, args.outDirName)

    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])

    # merge readfiles
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPath1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPath2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)", readFileName1, readFileName2)
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist)

    # merge bedfiles
    outBedFileName = outDir + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPath1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPath2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)", bedFileName1, bedFileName2)
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist)

    # write new config file: the first input's config is reused (and mutated
    # in place), with the single 'bamFileName' replaced by one entry per input
    configPath = outDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write(k + "=" + v + "\n")
Exemple #6
0
def main(args):
    """Fetch cancer/normal read clusters for every call and write breakpoints.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName``, ``mrnaFastaFile``, ``refGenomeFile``, ``maxReadLen``,
    ``zeroChar``, ``minClipQual`` and ``usechr``.

    The cancer and normal BAM files are chosen from the config's
    ``bamFileName1``/``bamFileName2`` according to ``getTypeFromTCGA``; when
    the types are equal or either is ``None``, file 1 is used as normal and
    file 2 as cancer.  For each line of the four ``*.tab.txt`` call files the
    region (columns 0-2) and gene (column 7) are fetched from both BAMs,
    merged, and the result is appended to the matching ``*breaks.tab.txt``.

    Raises:
        NameError: if one BAM is typed CANCER while the other is not NORMAL.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    # load hash of TranscriptSeq objects (genename --> multiple transcripts)
    # will need to pass to partialMapTx() later
    sys.stderr.write("loading " + args.mrnaFastaFile + "...\n")
    checkfile(args.mrnaFastaFile)
    txs = fastahash(args.mrnaFastaFile)

    cancerBamFile = ''
    normalBamFile = ''
    refGenomeFile = args.refGenomeFile

    cancerCallsFile = outDir + "/canceronly.tab.txt"
    normalCallsFile = outDir + "/normalonly.tab.txt"
    germCallsFile = outDir + "/germline.tab.txt"
    otherCallsFile = outDir + "/uncategorized.tab.txt"

    # fix if unmerged: a single-sample config only has 'bamFileName'
    if 'bamFileName1' not in configDict:
        configDict['bamFileName1'] = configDict['bamFileName']
        configDict['bamFileName2'] = configDict['bamFileName']

    bamType1 = getTypeFromTCGA(configDict['bamFileName1'])
    bamType2 = getTypeFromTCGA(configDict['bamFileName2'])

    print("bamfile1=%s bamFile2=%s bamType1=%s bamType2=%s" %
          (configDict['bamFileName1'], configDict['bamFileName2'], bamType1, bamType2))

    if bamType1 != bamType2 and bamType1 is not None and bamType2 is not None:
        if bamType1 == 'CANCER':
            if bamType2 != 'NORMAL':
                raise NameError('bam1 is cancer but bam2 is not normal')
            cancerBamFile = configDict['bamFileName1']
            normalBamFile = configDict['bamFileName2']
        if bamType2 == 'CANCER':
            if bamType1 != 'NORMAL':
                raise NameError('bam2 is cancer but bam1 is not normal')
            cancerBamFile = configDict['bamFileName2']
            normalBamFile = configDict['bamFileName1']
    else:
        # single-argument call form prints the same text under Python 2 and 3
        print('cannot determine bamfile cancer/normal from filenames in config.txt, defaulting to normal.')
        normalBamFile = configDict['bamFileName1']
        cancerBamFile = configDict['bamFileName2']

    checkfile(cancerBamFile)
    checkfile(normalBamFile)
    checkfile(normalCallsFile)
    checkfile(cancerCallsFile)
    checkfile(germCallsFile)
    checkfile(otherCallsFile)
    checkfile(refGenomeFile)

    cancerBam = pysam.Samfile(cancerBamFile, 'rb')  # rb = read, binary
    normalBam = pysam.Samfile(normalBamFile, 'rb')  # rb = read, binary
    cancerCalls = open(cancerCallsFile, 'r')
    normalCalls = open(normalCallsFile, 'r')
    germCalls = open(germCallsFile, 'r')
    otherCalls = open(otherCallsFile, 'r')
    refGenome = pysam.Fastafile(refGenomeFile)

    cancerBreaksOut = open(outDir + "/cancerbreaks.tab.txt", 'w')
    normalBreaksOut = open(outDir + "/normalbreaks.tab.txt", 'w')
    germBreaksOut = open(outDir + "/germlinebreaks.tab.txt", 'w')
    otherBreaksOut = open(outDir + "/uncategorizedbreaks.tab.txt", 'w')

    callSetListInFiles = (cancerCalls, normalCalls, germCalls, otherCalls)
    callSetListOutFiles = (cancerBreaksOut, normalBreaksOut, germBreaksOut,
                           otherBreaksOut)

    try:
        maxReadLen = int(args.maxReadLen)
        minClipQual = int(args.minClipQual)

        for callsIn, breaksOut in zip(callSetListInFiles, callSetListOutFiles):
            for line in callsIn:
                col = line.strip().split("\t")
                chrom = col[0]
                start = int(col[1])
                end = int(col[2])
                gene = col[7]

                cancerCluster = fetchRegion(cancerBam, refGenome, maxReadLen,
                                            chrom, start, end, gene,
                                            args.zeroChar, minClipQual,
                                            args.usechr)
                cancerCluster.type = 'CANCER'

                normalCluster = fetchRegion(normalBam, refGenome, maxReadLen,
                                            chrom, start, end, gene,
                                            args.zeroChar, minClipQual,
                                            args.usechr)
                normalCluster.type = 'NORMAL'

                mergeCluster = mergeClusters(cancerCluster, normalCluster, txs)
                clusterout = mergeCluster.outstring()
                infodumpout = mergeCluster.infodump()

                breaksOut.write(line.strip("\n") + "\t" + clusterout + "\n" +
                                infodumpout + "\n")
    finally:
        # close every call/output file even if processing fails part-way
        for handle in callSetListInFiles + callSetListOutFiles:
            handle.close()
Exemple #7
0
def main(args):
    """Fetch cancer/normal read clusters for every call and write breakpoints.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName``, ``refGenomeFile``, ``refFastaDir``, ``maxReadLen``,
    ``zeroChar``, ``minClipQual`` and ``usechr``.

    The cancer and normal BAM files are chosen from the config's
    ``bamFileName1``/``bamFileName2`` according to ``getTypeFromTCGA``; when
    the types are equal or either is ``None``, file 1 is used as normal and
    file 2 as cancer.  For each line of the four ``*.tab.txt`` call files the
    region (columns 0-2) is fetched from both BAMs, merged, and the result is
    appended to the matching ``*breaks.tab.txt``.

    Raises:
        NameError: if one BAM is typed CANCER while the other is not NORMAL.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    cancerBamFile = ''
    normalBamFile = ''
    refGenomeFile = args.refGenomeFile

    cancerCallsFile = outDir + "/canceronly.tab.txt"
    normalCallsFile = outDir + "/normalonly.tab.txt"
    germCallsFile = outDir + "/germline.tab.txt"
    otherCallsFile = outDir + "/other.tab.txt"

    # fix if unmerged: a single-sample config only has 'bamFileName'
    if 'bamFileName1' not in configDict:
        configDict['bamFileName1'] = configDict['bamFileName']
        configDict['bamFileName2'] = configDict['bamFileName']

    bamType1 = getTypeFromTCGA(configDict['bamFileName1'])
    bamType2 = getTypeFromTCGA(configDict['bamFileName2'])

    print("bamfile1=%s bamFile2=%s bamType1=%s bamType2=%s"
          % (configDict['bamFileName1'], configDict['bamFileName2'], bamType1, bamType2))

    if bamType1 != bamType2 and bamType1 is not None and bamType2 is not None:
        if bamType1 == 'CANCER':
            if bamType2 != 'NORMAL':
                raise NameError('bam1 is cancer but bam2 is not normal')
            cancerBamFile = configDict['bamFileName1']
            normalBamFile = configDict['bamFileName2']
        if bamType2 == 'CANCER':
            if bamType1 != 'NORMAL':
                raise NameError('bam2 is cancer but bam1 is not normal')
            cancerBamFile = configDict['bamFileName2']
            normalBamFile = configDict['bamFileName1']
    else:
        # single-argument call form prints the same text under Python 2 and 3
        print('cannot determine bamfile cancer/normal from filenames in config.txt, defaulting to normal.')
        normalBamFile = configDict['bamFileName1']
        cancerBamFile = configDict['bamFileName2']

    checkfile(cancerBamFile)
    checkfile(normalBamFile)
    checkfile(normalCallsFile)
    checkfile(cancerCallsFile)
    checkfile(germCallsFile)
    checkfile(otherCallsFile)
    checkfile(refGenomeFile)

    cancerBam = pysam.Samfile(cancerBamFile, 'rb')  # rb = read, binary
    normalBam = pysam.Samfile(normalBamFile, 'rb')  # rb = read, binary
    cancerCalls = open(cancerCallsFile, 'r')
    normalCalls = open(normalCallsFile, 'r')
    germCalls = open(germCallsFile, 'r')
    otherCalls = open(otherCallsFile, 'r')
    refGenome = pysam.Fastafile(refGenomeFile)

    cancerBreaksOut = open(outDir + "/cancerbreaks.tab.txt", 'w')
    normalBreaksOut = open(outDir + "/normalbreaks.tab.txt", 'w')
    germBreaksOut = open(outDir + "/germlinebreaks.tab.txt", 'w')
    otherBreaksOut = open(outDir + "/otherbreaks.tab.txt", 'w')

    callSetListInFiles = (cancerCalls, normalCalls, germCalls, otherCalls)
    callSetListOutFiles = (cancerBreaksOut, normalBreaksOut, germBreaksOut, otherBreaksOut)

    try:
        maxReadLen = int(args.maxReadLen)
        minClipQual = int(args.minClipQual)

        for callsIn, breaksOut in zip(callSetListInFiles, callSetListOutFiles):
            for line in callsIn:
                col = line.strip().split("\t")
                chrom = col[0]
                start = int(col[1])
                end = int(col[2])

                cancerCluster = fetchRegion(cancerBam, refGenome, maxReadLen,
                                            chrom, start, end, args.zeroChar,
                                            minClipQual, args.usechr)
                cancerCluster.type = 'CANCER'

                normalCluster = fetchRegion(normalBam, refGenome, maxReadLen,
                                            chrom, start, end, args.zeroChar,
                                            minClipQual, args.usechr)
                normalCluster.type = 'NORMAL'

                mergeCluster = mergeClusters(cancerCluster, normalCluster, args.refFastaDir)
                clusterout = mergeCluster.outstring()
                infodumpout = mergeCluster.infodump()

                breaksOut.write(line.strip("\n") + "\t" + clusterout + "\n" + infodumpout + "\n")
    finally:
        # close every call/output file even if processing fails part-way
        for handle in callSetListInFiles + callSetListOutFiles:
            handle.close()
def main(args):
    """Fetch read clusters from every configured BAM and write breakpoints.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName``, ``refGenomeFile``, ``refFastaDir``, ``maxReadLen``,
    ``zeroChar``, ``minClipQual`` and ``usechr``.

    Every config key starting with ``bamFileName`` names a BAM file; the
    matching sample name is looked up under ``sampleName`` plus the same
    suffix.  For each line of the four ``*.tab.txt`` call files the region
    (columns 0-2) is fetched from each BAM in turn, the clusters are merged
    left to right, and the result is appended to the matching
    ``*breaks.tab.txt``.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    refGenomeFile = args.refGenomeFile

    cancerCallsFile = outDir + "/canceronly.tab.txt"
    normalCallsFile = outDir + "/normalonly.tab.txt"
    germCallsFile = outDir + "/germline.tab.txt"
    otherCallsFile = outDir + "/other.tab.txt"

    # pair each bamFileName<suffix> with its sampleName<suffix>
    bamFiles = []
    sampleNames = []
    for key in configDict:
        if key.startswith('bamFileName'):
            bamFiles.append(configDict[key])
            samplekey = "sampleName" + key.replace('bamFileName', '')
            sampleNames.append(configDict[samplekey])

    # single-argument call form prints the same text under Python 2 and 3
    print("finding breakpoints from bam files:")
    for bamFile, sampleName in zip(bamFiles, sampleNames):
        print("%s %s" % (bamFile, sampleName))

    for bamFile in bamFiles:
        checkfile(bamFile)
    checkfile(normalCallsFile)
    checkfile(cancerCallsFile)
    checkfile(germCallsFile)
    checkfile(otherCallsFile)
    checkfile(refGenomeFile)

    bamPysams = [pysam.Samfile(bamFile, 'rb') for bamFile in bamFiles]

    cancerCalls = open(cancerCallsFile, 'r')
    normalCalls = open(normalCallsFile, 'r')
    germCalls = open(germCallsFile, 'r')
    otherCalls = open(otherCallsFile, 'r')
    refGenome = pysam.Fastafile(refGenomeFile)

    cancerBreaksOut = open(outDir + "/cancerbreaks.tab.txt", 'w')
    normalBreaksOut = open(outDir + "/normalbreaks.tab.txt", 'w')
    germBreaksOut = open(outDir + "/germlinebreaks.tab.txt", 'w')
    otherBreaksOut = open(outDir + "/otherbreaks.tab.txt", 'w')

    callSetListInFiles = (cancerCalls, normalCalls, germCalls, otherCalls)
    callSetListOutFiles = (cancerBreaksOut, normalBreaksOut, germBreaksOut, otherBreaksOut)

    try:
        maxReadLen = int(args.maxReadLen)
        minClipQual = int(args.minClipQual)

        for callsIn, breaksOut in zip(callSetListInFiles, callSetListOutFiles):
            for line in callsIn:
                col = line.strip().split("\t")
                chrom = col[0]
                start = int(col[1])
                end = int(col[2])

                # the first BAM seeds the merged cluster ...
                mergedCluster = fetchRegion(bamPysams[0], refGenome, maxReadLen,
                                            chrom, start, end, args.zeroChar,
                                            minClipQual, args.usechr)
                mergedCluster.type = sampleNames[0]

                # ... and every further BAM is folded into it
                for bamPysam, sampleName in zip(bamPysams[1:], sampleNames[1:]):
                    nextCluster = fetchRegion(bamPysam, refGenome, maxReadLen,
                                              chrom, start, end, args.zeroChar,
                                              minClipQual, args.usechr)
                    nextCluster.type = sampleName
                    mergedCluster = mergeClusters(mergedCluster, nextCluster, args.refFastaDir)

                clusterout = mergedCluster.outstring()
                infodumpout = mergedCluster.infodump()

                breaksOut.write(line.strip("\n") + "\t" + clusterout + "\n" + infodumpout + "\n")
    finally:
        # close every call/output file even if processing fails part-way
        for handle in callSetListInFiles + callSetListOutFiles:
            handle.close()
Exemple #9
0
def main(args):
    """Categorise valid insertions from the debug file and annotate each set.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName``, ``annotDir``, ``minPeakSize`` and ``printout``.

    A debug-file line is considered when it contains ``valid=1``.  Its
    space-separated ``key=value`` columns are kept when ``np`` is at least
    ``args.minPeakSize`` and ``nEx`` is greater than 1.  Insertions with more
    than one source go to the germline set; single-source ones are split on
    whether the source name contains ``CANCER`` or ``NORMAL``, everything
    else is uncategorized.  Each set is passed to ``annotatePos`` with its
    own output path.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)

    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    checkfile(debugFile)

    # converted once here instead of once per debug-file line
    minPeakSize = int(args.minPeakSize)

    germlineIns = []
    cancerIns = []
    normalIns = []
    uncatIns = []

    with open(debugFile, 'r') as f:
        for line in f:  # loop over lines
            if 'valid=1' not in line:
                continue

            insdata = {}
            for c in line.rstrip().split(' '):  # loop over columns
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < minPeakSize or int(insdata['nEx']) <= 1:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.index = insdata['index']
            insSum.priGene = insdata['pGene']
            insSum.exonStr = insdata['exT']
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']
            insSum.pseudo = insdata['pgO']

            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif re.search('CANCER', insSum.sources[0]):
                cancerIns.append(insSum)
            elif re.search('NORMAL', insSum.sources[0]):
                normalIns.append(insSum)
            else:
                uncatIns.append(insSum)

    outputs = (
        (germlineIns, "/germline.tab.txt"),
        (cancerIns, "/canceronly.tab.txt"),
        (normalIns, "/normalonly.tab.txt"),
        (uncatIns, "/uncategorized.tab.txt"),
    )
    for insertions, suffix in outputs:
        annotatePos(insertions, refGenome, outDir + suffix, args.annotDir,
                    args.printout)
Exemple #10
0
def main(args):
    """Merge the read-pair files of every sample in a sample list.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``overwrite``, ``configFileName`` and either ``sampleList`` (an iterable
    of sample lines) or, when that attribute is absent, ``sampleListFile``
    (path of a file holding those lines).

    Each non-comment line must hold four whitespace-separated columns (bam,
    sample sub-directory, reference genome, group name); only the
    sub-directory is used here.  Lines starting with ``#`` and blank lines
    are skipped.  The merged config is based on the last sample's config,
    with one ``bamFileName<i>`` and ``sampleName<i>`` entry per sample.

    Raises:
        ValueError: if a sample line does not have exactly four columns, or
            if the list contains no samples at all.
    """
    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    try:
        sampleList = args.sampleList
    except AttributeError:
        discordant.checkfile(args.sampleListFile)
        # read the lines up front so the file handle is closed right away
        with open(args.sampleListFile, 'r') as sampleListHandle:
            sampleList = sampleListHandle.readlines()

    readFileNames = []
    bamFileNames = []
    sampleNames = []
    insertSizes = []
    readLengths = []
    eltLenDict = None
    lastConfig = None

    for sampleLine in sampleList:
        fields = sampleLine.strip()
        # skip comments and blank lines (a trailing empty line would
        # otherwise break the four-column unpack below)
        if sampleLine.startswith("#") or not fields:
            continue

        (_sampleBam, sampleSubDir, _refGenome, _groupName) = fields.split()
        peakparser.checkOutDir(sampleSubDir, args.outDirName)
        sampleDir = args.outDirName + "/" + sampleSubDir
        configPath = sampleDir + "/" + args.configFileName

        discordant.checkfile(configPath)
        configDict = peakparser.readConfig(configPath, sampleSubDir, args.outDirName)
        # overwritten on every iteration: only the last sample's table is
        # passed to the merge below
        eltLenDict = discordant.makeEltLenDict(configDict['eltLenFileName'])
        lastConfig = configDict

        insertSizes.append(int(configDict['insertSize']))
        readLengths.append(int(configDict['readLength']))

        readFileNames.append(sampleDir + "/" + sampleSubDir + ".readpairs.txt")
        bamFileNames.append(configDict['bamFileName'])
        sampleNames.append(sampleSubDir)

    if lastConfig is None:
        # same exception type max() would raise on an empty list, but with a
        # message that points at the actual problem
        raise ValueError("no samples found in sample list")

    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outDir = args.outDirName + "/" + args.outBaseName
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist, eltLenDict)

    # write new config file, starting from the last sample's config
    configPath = outDir + "/" + args.configFileName
    configDict = lastConfig

    assert len(bamFileNames) == len(sampleNames)

    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName

    for snum, sampleName in enumerate(sampleNames):
        configDict["sampleName" + str(snum)] = sampleName

    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    del configDict['bamFileName']

    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write(k + "=" + v + "\n")