Esempio n. 1
0
def main():
    """Write a tab-separated summary of the border analysis for every input file.

    For each file in ``conf.inputFolder`` the simulation parameters are parsed
    from the file name, the stored border dictionary is loaded with ``load``,
    and ``borderAnalysis.analyizeBorder`` is run twice (once with its default
    arguments, once with ``True`` as second argument).  One row per file is
    written to ``Results.txt`` inside ``conf.resultFolder``.

    Columns: TEvo, NFam, NFusions, then (ratio, count, total) for the first
    analysis and (ratio, count, total) for the second one (``*_f`` columns).
    """
    filenames = os.listdir(conf.inputFolder)
    util.generateDirectories(conf.resultFolder)
    with open(os.path.join(conf.resultFolder, "Results.txt"), "w") as wf:
        wf.write("TEvo\tNFam\tNFusions\ts%\ts\tlen\ts%_f\ts_f\tlen_f\n")
        for filename in filenames:
            # File name layout (indices after splitting on "_"):
            # 0 1    2    3    4    5 6        7 8    9   10   11 12
            # M_mjtt_SeqL_1000_NFam_2_NFusions_2_TEvo_1.5_NGen_5_ BorderInformation
            parsed = filename.split("_")
            NFam = parsed[5]
            NFusions = parsed[7]
            TEvo = parsed[9]

            # Close the input file deterministically instead of leaking the handle.
            with open(os.path.join(conf.inputFolder, filename), "rb") as rf:
                borderDict = load(rf)

            successRatio, success, totalBorders = borderAnalysis.analyizeBorder(
                borderDict)
            # NOTE(review): the meaning of the second argument is defined in
            # borderAnalysis; its results feed the "*_f" columns.
            successRatio2, success2, totalBorders2 = borderAnalysis.analyizeBorder(
                borderDict, True)

            # The "s" / "s_f" columns hold the success counts.  The previous
            # code wrote the ratio twice and never emitted the counts.
            row = [
                TEvo, NFam, NFusions,
                successRatio, success, totalBorders,
                successRatio2, success2, totalBorders2,
            ]
            wf.write("\t".join(str(value) for value in row) + "\n")
def main():
    """Convert every ADDA result file into a pickled border dictionary.

    For each file name in ``conf.addaResultFolder``:

    * a nid -> pid mapping with the same file name is loaded from
      ``conf.nidToPidFolder``;
    * the ADDA file is read as tab-separated ``nid, start, end`` integer
      columns;
    * the entries are grouped per pid as ``[1, start, end]`` lists and dumped
      to ``<conf.resultFolder>/<file name>.cpickle``.

    Raises:
        KeyError: if an ADDA line references a nid missing from the mapping.
        ValueError / IndexError: if a line does not hold three integer columns.
    """
    borderFiles = os.listdir(conf.addaResultFolder)
    # The output folder does not depend on the file being processed, so it is
    # prepared once up front.
    util.generateDirectories(conf.resultFolder)

    for borderFile in borderFiles:
        nidToPidPath = os.path.join(conf.nidToPidFolder, borderFile)
        # Use a context manager so the mapping file is closed right away.
        with open(nidToPidPath, "rb") as mappingFile:
            nidToPidDict = load(mappingFile)

        resultDict = {}
        addaPath = os.path.join(conf.addaResultFolder, borderFile)
        with open(addaPath, "r") as f:
            for line in f:
                arr = line.split("\t")
                nid = int(arr[0].strip())
                s = int(arr[1].strip())
                e = int(arr[2].strip())
                pid = nidToPidDict[nid]
                # adda did not provide domain family info, so the family slot
                # is filled with the constant 1.
                resultDict.setdefault(pid, []).append([1, s, e])

        with open(os.path.join(conf.resultFolder, borderFile + ".cpickle"),
                  "wb") as f:
            dump(resultDict, f)
Esempio n. 3
0
def main():
    """Run ``runAlg`` on every sequence file found in ``conf.inputFolder``.

    Before processing, the runtime folder is created and ``conf.runTimeFile``
    is truncated so that it starts out empty for this run.
    """
    seqFiles = os.listdir(conf.inputFolder)
    # generate runtime output file
    util.generateDirectories(conf.runTimefolder)
    # Truncate the runtime file; the context manager closes the handle
    # explicitly instead of relying on garbage collection.
    with open(conf.runTimeFile, "w"):
        pass
    for i, seqFile in enumerate(seqFiles):
        # The progress counter is zero-based, as in the original output.
        util.printL("\nAnalyzing " + seqFile + "(" + str(i) + "/" +
                    str(len(seqFiles)) + ")\n")
        runAlg(seqFile)
Esempio n. 4
0
def main():
    """Process every file of ``conf.inputFolder`` with ``runAlg``.

    The runtime folder is created and ``conf.runTimeFile`` is reset to contain
    only its column header before the first file is handled.
    """
    inputNames = os.listdir(conf.inputFolder)

    # Start a fresh runtime table containing just the header row.
    util.generateDirectories(conf.runTimefolder)
    with open(conf.runTimeFile, "w") as runtimeTable:
        runtimeTable.write("numBLAST_Align\tgraphSize\ttime\tfilename\n")

    total = len(inputNames)
    for position, name in enumerate(inputNames):
        # Zero-based progress indicator followed by the actual work.
        util.printL("\nAnalyzing {0}({1}/{2})\n".format(name, position, total))
        runAlg(name)
def filterInput():
    """Copy the input files to ``conf.filteredDir`` without self-pairs.

    Each file of ``conf.inputFolder`` is rewritten under the same name in
    ``conf.filteredDir``; lines whose first two tab-separated fields are equal
    are dropped, all other lines are copied unchanged.
    """
    util.generateDirectories(conf.filteredDir)
    for name in os.listdir(conf.inputFolder):
        targetPath = os.path.join(conf.filteredDir, name)
        sourcePath = os.path.join(conf.inputFolder, name)
        # The destination is opened first, then the source (same order as
        # before), so failure behaviour is unchanged.
        with open(targetPath, "w") as target:
            with open(sourcePath, "r") as source:
                for record in source:
                    fields = record.split("\t")
                    if fields[0] == fields[1]:
                        # first and second column identical -> skip the line
                        continue
                    target.write(record)
def main():
    """Filter the input and run the MosaicFinder executable on every file.

    ``filterInput`` is executed first; afterwards one process of
    ``conf.mosaicFinderDir`` is launched per file in ``conf.filteredDir`` with
    the fixed options ``-e 1e-05 -p 30`` and the result written under the same
    file name in ``conf.resultFolder``.

    The processes are still started concurrently, but the function now waits
    for all of them before returning.  Previously it returned immediately, so
    callers could read incomplete result files and the children were never
    reaped.

    Returns:
        int: always 0 (exit statuses of the children are not inspected).
    """
    filterInput()

    filenames = os.listdir(conf.filteredDir)

    util.generateDirectories(conf.resultFolder)

    procs = []
    for filename in filenames:
        fileDir = os.path.join(conf.filteredDir, filename)

        cmd = [conf.mosaicFinderDir, "-e", "1e-05", "-p", "30",
               "-i", fileDir,
               "-o", os.path.join(conf.resultFolder, filename)]

        procs.append(subprocess.Popen(cmd))

    # Block until every launched process has finished.
    for proc in procs:
        proc.wait()

    return 0
def compareSplitPoints():
    """Evaluate the split points of every border file and tabulate the results.

    Each file in ``conf.bordersFolder`` is loaded with ``load`` and passed to
    ``func.compareSplitPoints``.  The three returned values are written, one
    row per file, to ``conf.splitResultFile`` together with the test-case name
    (the file name with ``conf.bordersAppend`` removed).
    """
    borderFiles = os.listdir(conf.bordersFolder)

    util.generateDirectories(conf.resultFolder)
    with open(conf.splitResultFile, "w") as resultFile:
        resultFile.write(
            "testcase\tavgDist\twrongNumBordersOnSplitProt\tmultBorderOnFamilyProtein\n"
        )
        for borderFile in borderFiles:
            borderDir = os.path.join(conf.bordersFolder, borderFile)
            # Close the border file as soon as it is loaded instead of
            # leaking the handle.
            with open(borderDir, "rb") as bf:
                borderDict = load(bf)

            # Disabled in the original: loading of the matching fusion info.
            # fusionInfoFile = borderFile.replace(conf.bordersAppend, conf.fusionInfoAppend)
            # fusionInfoDir = os.path.join(conf.fusionInfoFolder, fusionInfoFile)
            #
            # fusionDict = readFusionInfo(fusionInfoDir)

            avgDist, wrongNumBordersOnSplitProt, multBorderOnFamilyProtein = func.compareSplitPoints(
                borderDict)

            testcase = borderFile.replace(conf.bordersAppend, "")
            resultFile.write(testcase + "\t" + str(avgDist) + "\t" +
                             str(wrongNumBordersOnSplitProt) + "\t" +
                             str(multBorderOnFamilyProtein) + "\n")
Esempio n. 8
0
def runAlg(filename):
    """Run the module-detection pipeline on one sequence file.

    Steps:

    1. Unless ``conf.skipBLAST`` is set: store the protein-length dictionary,
       build the BLAST database and run the all-to-all BLASTp.
    2. Timed section: build the graph from the all-to-all results, generate
       putative modules, remove super modules and rename the modules.
    3. Write ``<filename>_Modules.txt`` and ``<filename>_detailedResults.txt``
       to ``conf.textResultsFolder`` and append a row (number of BLAST lines,
       elapsed time, file name) to ``conf.runTimeFile``.
    4. Dump the final module/border information to
       ``<filename>_myBorders.cpickle`` in ``conf.pickledResultsFolder``.
    5. If ``conf.deleteFolders`` is set, remove the intermediate folders.

    Args:
        filename: name of the sequence file inside ``conf.fastaFolder``.

    NOTE: FASTA generation and the Pfam side-by-side comparison were disabled
    (commented out) in the original implementation and are not performed.
    """
    if not conf.skipBLAST:
        # generate protein lengths
        plenFolder = conf.proteinLenFolder
        util.generateDirectories(plenFolder)
        plenDict = blast.generateProtLenDict(conf.fastaFolder, filename)
        # Context manager guarantees the pickle is flushed and closed.
        with open(os.path.join(plenFolder, filename), "wb") as plenFile:
            dump(plenDict, plenFile)

        # create blast databases
        print(" Conducting BLASTp all-to-all...")
        blast.makeblastdb(conf.fastaFolder, filename)

        # conduct all to all BLASTp
        alltoallFolder = conf.alltoallFolder
        util.generateDirectories(alltoallFolder)
        blast.alltoallBlastP(conf.fastaFolder, filename,
                             os.path.join(alltoallFolder, filename))

    # The algorithm proper starts here, and so does the timing.
    print(" Conducting my algorithm...")
    startTime = time.time()
    # build HSPIntGraph
    seqSimGraph, numBlastLines, numIntEdge = buildGraph.build_graph(
        filename, conf.alltoallFolder)

    # identify protein module borders: putative domains
    numModules, moduleFamilyInfo = findBorders.generatePutativeModules(
        seqSimGraph)
    putativeResult = vis.visualizeModuleFamilyInfo(moduleFamilyInfo)

    # moduleFamilyInfo is re-rendered after each of the next two calls, so a
    # textual snapshot is captured per stage.
    findBorders.removeSuperModules(moduleFamilyInfo)
    moduleResult = vis.visualizeModuleFamilyInfo(moduleFamilyInfo)

    # rename modules to have lower numbers
    numModulesAfterFilter = findBorders.renameModules(moduleFamilyInfo)
    moduleResultRenamed = vis.visualizeModuleFamilyInfo(moduleFamilyInfo)

    # calculate elapsed time
    timediff = time.time() - startTime

    # output the results
    util.generateDirectories(conf.textResultsFolder)

    consizePath = os.path.join(conf.textResultsFolder,
                               filename + "_Modules.txt")
    with open(consizePath, "w") as f:
        f.write(moduleResultRenamed)

    detailedPath = os.path.join(conf.textResultsFolder,
                                filename + "_detailedResults.txt")
    with open(detailedPath, "w") as f:
        f.write("number of Blast Edges: " + str(numBlastLines) + "\n")
        # Fix: terminate this line; it used to run into the next entry.
        f.write("time elapsed: " + str(timediff) + "\n")
        f.write("number of IntervalEdges added: " + str(numIntEdge) + "\n")
        f.write("Putative Modules: " + str(numModules) + "\n" +
                putativeResult + "\n")
        f.write("RemoveSuperModules: \n" + moduleResult + "\n")
        f.write("Final Module Definition: " + str(numModulesAfterFilter) +
                "\n" + moduleResultRenamed + "\n")
    util.printL("Completed in " + str(timediff) + " seconds\n")

    # Write down the timing results
    with open(conf.runTimeFile, "a") as f:
        f.write(
            str(numBlastLines) + "\t" + str(timediff) + "\t" + filename + "\n")

    # dump the border files for future comparison
    util.generateDirectories(conf.pickledResultsFolder)
    myBordersPath = os.path.join(conf.pickledResultsFolder,
                                 filename + "_myBorders.cpickle")
    with open(myBordersPath, "wb") as bordersFile:
        dump(moduleFamilyInfo, bordersFile)

    # remove extra folders to save disk space
    if conf.deleteFolders:
        shutil.rmtree(conf.proteinLenFolder)
        shutil.rmtree(conf.blastdbFolder)
        shutil.rmtree(conf.alltoallFolder)
def convertResultToCPickleDict():
    util.generateDirectories(conf.cPickleDir)
    filenames = os.listdir(conf.resultFolder)
    for filename in filenames:
        with open(os.path.join(conf.resultFolder, filename), "r") as f:
            for line in f:
def main():
    """Run the BLAST -> ADDA pipeline on every file in ``conf.seqFolder``.

    If ``conf.recompileADDA`` is set, ADDA is rebuilt first (``make clean``
    followed by ``make`` in ``conf.addaFolder``).  The timing file is then
    recreated with a header and the log file is truncated.  For each input
    file (in sorted order) the four pipeline stages are executed; only the
    ``rcm_module_tree`` and ``RunADDA`` stages are timed, and the elapsed time
    is appended to the timing file.
    """
    if conf.recompileADDA:
        print("make clean and making ADDA")
        # Clean first, then build.  Output is captured and discarded, and the
        # exit status is not checked (unchanged from the original behaviour).
        for cmd in (["make", "clean", "-C", conf.addaFolder],
                    ["make", "-C", conf.addaFolder]):
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            proc.communicate()

    # import all of the filenames
    inputfilenames = os.listdir(conf.seqFolder)
    inputfilenames.sort()

    # create the file for timing information
    util.generateDirectories(conf.timingFolder)
    timingFileDir = os.path.join(conf.timingFolder, conf.timingFile)
    with open(timingFileDir, "w") as f:
        f.write("Time\tSampleName\n")
    # Reset the log file; the context manager closes the handle explicitly
    # instead of leaving it to the garbage collector.
    with open(os.path.join(conf.logFolder, conf.logFile), "w"):
        pass

    for i, inputfilename in enumerate(inputfilenames):
        # The progress counter is zero-based, as in the original output.
        util.printL("\nProcessing " + inputfilename + " (" + str(i) + "/" +
                    str(len(inputfilenames)) + ")\n")

        util.printL("0_BlastAllToAll:\n")
        BlastAllToAll.main(inputfilename)

        util.printL("1_ConvertToADDA:\n")
        ConvertToADDA.main(inputfilename)

        # start timer: only the two stages below are measured
        startTime = time.time()
        util.printL("\n2_rcm_module_tree:\n")
        rcm_module_tree.main(inputfilename)
        util.printL("3_RunADDA:\n")
        RunADDA.main(inputfilename)

        # end timer
        timeDiff = time.time() - startTime
        util.printL("rcm and adda completed in " + str(timeDiff) +
                    " seconds.\n")

        with open(timingFileDir, "a") as f:
            f.write(str(timeDiff) + "\t" + inputfilename + "\n")
def _fusionConfidence(f1, f2, mf1s, mf2s):
    """Score one fused protein: 1 if both parents matched, .5 if one, else 0.

    ``mf1s`` and ``mf2s`` are the two lists of assigned families.  The list
    containing ``f1`` more often is compared against ``f1`` and the other one
    against ``f2`` (ties compare ``mf2s`` against ``f1``, as before).
    """
    # Orientation pass: these counts only decide which list belongs to which
    # parent.  They use their own names so they cannot leak into the match
    # test below (the old code reused the counters without resetting them).
    f1InFirst = sum(1 for f in mf1s if f == f1)
    f1InSecond = sum(1 for f in mf2s if f == f1)
    if f1InFirst > f1InSecond:
        mf1, mf2 = mf1s, mf2s
    else:
        mf1, mf2 = mf2s, mf1s

    matched1 = any(f == f1 for f in mf1)
    matched2 = any(f == f2 for f in mf2)

    if matched1 and matched2:
        return 1
    if not matched1 and not matched2:
        return 0
    return .5


def main():
    """Write the average fusion-detection confidence for every result file.

    For each file in ``conf.inputFolder`` the simulation parameters are parsed
    from the file name and ``readResultFile`` supplies ``fusedDict`` (pid ->
    ``(fEvent, f1, f2)``) and ``familyDict`` (indexed by ``fEvent``; entries 1
    and 2 are the two family lists).  Each fused protein gets a confidence of
    1, .5 or 0 and the per-file mean is written to ``Results.txt`` in
    ``conf.resultFolder`` with the columns TEvo, NFam, NFusions, avgConf,
    numProt.

    A file without fused proteins yields ``avgConf`` 0.0 (with ``numProt`` 0)
    instead of raising.
    """
    filenames = os.listdir(conf.inputFolder)
    util.generateDirectories(conf.resultFolder)
    with open(os.path.join(conf.resultFolder, "Results.txt"), "w") as wf:
        wf.write("TEvo\tNFam\tNFusions\tavgConf\tnumProt\n")
        for filename in filenames:
            # File name layout (indices after splitting on "_"):
            # 0 1    2    3    4    5 6        7 8    9   10   11 12
            # M_mjtt_SeqL_1000_NFam_2_NFusions_2_TEvo_1.5_NGen_5_ BorderInformation
            parsed = filename.split("_")
            NFam = parsed[5]
            NFusions = parsed[7]
            TEvo = parsed[9]

            fusedDict, familyDict = readResultFile(conf.inputFolder, filename)

            confidenceArr = []

            for pid in fusedDict:
                fEvent, f1, f2 = fusedDict[pid]

                mf1s = familyDict[fEvent][1]
                mf2s = familyDict[fEvent][2]
                totalAssigns = len(mf1s) + len(mf2s)
                # Debug trace of the inputs.
                print(pid)
                print(mf1s)
                print(mf2s)

                confi = _fusionConfidence(f1, f2, mf1s, mf2s)
                confidenceArr.append(confi)
                # Debug trace; the middle field is a legacy counter that was
                # always 0 and is kept so the output format is unchanged.
                print("%s %s %s" % (confi, 0, totalAssigns))

            if confidenceArr:
                avgConf = sum(confidenceArr) / float(len(confidenceArr))
            else:
                # No fused proteins: avoid reducing an empty sequence.
                avgConf = 0.0

            wf.write(
                str(TEvo) + "\t" + str(NFam) + "\t" + str(NFusions) + "\t")
            wf.write(str(avgConf) + "\t" + str(len(fusedDict)) + "\n")