# Example #1
def main(argList):
    # Parse the arguments, which either come from the command line or a list
    # provided by the Python code calling this function
    parser = CreateParser()
    (opts, args) = parser.parse_args(argList)

    print "Starting constrained multi-sample Steiner forest %s" % time.strftime(
        "%a, %d %b %Y %H:%M:%S", time.localtime())
    print "Multi-PCSF version %s" % __version__
    print "Parameters: %s" % opts

    # TODO Add error checking of inputs
    if opts.iterations < 1:
        raise RuntimeError("Must have at least 1 iteration")
    # TODO Should allow the option to run serially without the pool because a
    # pool with 1 worker is not efficient
    if opts.workers < 1:
        opts.workers = multiprocessing.cpu_count()

    # Assume negative prizes to implement the common set
    # and change if using positive common set prizes
    negativePrizes = True
    if "positive" in opts.artificialPrizes:
        negativePrizes = False

    # Assume unweighted prizes
    weightedPrizes = False
    if "Weighted" in opts.artificialPrizes:
        weightedPrizes = True

    # Assume batch mode
    batchMode = True
    if opts.iterMode == "random":
        batchMode = False

    # Load all of the proteins in the interactome, ignoring
    # genes.  The artificial prizes will be created for a subset of these nodes.
    allProts = LoadProteins(opts.interactomePath, opts.undirectedFile,
                            opts.directedFile, opts.tfdnaFile)

    # Load the negative prizes for the degree penalties or an empty dictionary
    # if they aren't being used
    directedFile = "None"
    if opts.directedFile != "None":
        directedFile = os.path.join(opts.interactomePath, opts.directedFile)
    degPenalties = NetworkUtil.DegreePenalties(
        opts.mu, os.path.join(opts.interactomePath, opts.undirectedFile),
        directedFile)

    # Create the initial stp files
    # New directory to hold the original data before the iterations begin
    # These stp files will be read and updated at subsequent iterations
    initPath = os.path.join(opts.resultPath, "initial")
    if not os.path.exists(initPath):
        os.makedirs(initPath)

    # Load the list of terminal files and the sample-to-group mapping
    terminalMap, sampleMap, countMap = LoadTerminalFiles(
        opts.terminalPath, opts.masterTerminalFile)
    # Store the groups in a fixed order
    groups = sorted(terminalMap.iterkeys())
    for group in groups:
        print "%d samples in group %s" % (countMap[group], group)

    # Create a pool for creating .stp files and learning Steiner forests in parallel
    # using the specified number of workers.  Use it to create the initial
    # .stp files.  Even when running the subsequent iterations in random sequential
    # order, create a pool to learn the initial trees and final pruned trees (if applicable).
    print "Creating a pool with %d workers" % opts.workers
    pool = multiprocessing.Pool(opts.workers)
    initialStpMap = dict()
    for group in groups:
        terminalFiles = terminalMap[group]
        sampleNames = sampleMap[group]
        # opts and initPath are invariant arguments for each sample
        zippedArgs = itertools.izip(itertools.repeat(opts),
                                    itertools.repeat(initPath), terminalFiles,
                                    sampleNames)
        initialStpMap[group] = pool.map(
            CreateStpHelper, zippedArgs)  # Blocks until all are finished

    # Store which proteins don't have prizes for each patient.
    # These are the nodes that could potentially be Steiner nodes for
    # each sample.  This can't be recovered from the stp files at later
    # iterations because both original prizes and artificial prizes will exist.
    # Also track how the dummy node will be connected
    # to the networks, either all prizes or all non-prizes (potential Steiner nodes)
    potentialSteinerMap = dict()
    dummyNeighborMap = dict()
    for group in groups:
        numSamples = countMap[group]
        sampleNames = sampleMap[group]
        initialStps = initialStpMap[group]
        potentialSteiner = []  # A list of sets
        dummyNeighborFiles = []  # A list of filenames
        for i in range(numSamples):
            dnFile = sampleNames[i] + "_dummyNeighbors.txt"
            dummyNeighborFiles.append(dnFile)
            potentialSteiner.append(
                DummyNeighbors(allProts, initPath, initialStps[i], dnFile,
                               opts.dummyNeighbors))
        potentialSteinerMap[group] = potentialSteiner
        dummyNeighborMap[group] = dummyNeighborFiles

    itrPath = os.path.join(opts.resultPath, "itr1")
    if not os.path.exists(itrPath):
        os.makedirs(itrPath)

    # Initialize the artificial prizes to be an empty dictionary so that
    # we learn the initial trees independently
    artificialPrizes = dict()
    # Write the unused itr1 artificial prizes so that the files exist for post-processing
    for group in groups:
        NetworkUtil.WriteDict(
            os.path.join(itrPath, "artificialPrizes_%s.txt" % group),
            artificialPrizes)
    print "%d artificial prizes at iteration 1" % len(artificialPrizes)

    # Add the degree penalties to the initial stp files.  Pass in the empty artificial prize
    # dictionary, which won't have an effect.
    for group in groups:
        sampleNames = sampleMap[group]
        numSamples = countMap[group]
        potentialSteiner = potentialSteinerMap[group]
        dummyNeighborFiles = dummyNeighborMap[group]
        for i in range(numSamples):
            # Copy the dummy neighbors, which must be in the same directory as the stp file
            UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i],
                      initPath, itrPath, sampleNames[i])
            shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                            os.path.join(itrPath, dummyNeighborFiles[i]))

    # Learn the first iteration Steiner forests in parallel
    # Run single-threaded belief propagation when using the worker pool
    lastForestMap = dict()
    for group in groups:
        numSamples = countMap[group]
        sampleNames = sampleMap[group]
        dummyNeighborFiles = dummyNeighborMap[group]
        zippedArgs = itertools.izip(itertools.repeat(opts),
                                    itertools.repeat(itrPath),
                                    itertools.repeat(itrPath), sampleNames,
                                    dummyNeighborFiles, itertools.repeat(1))
        pool.map(LearnSteinerHelper, zippedArgs)
        lastForests = [
        ]  # A list of sets, where each set contains the Steiner forest nodes
        for i in range(numSamples):
            lastForests.append(
                LoadForestNodes(
                    "%s/symbol_%s_%s_1.0_%d.txt" %
                    (itrPath, sampleNames[i], str(opts.W), opts.depth)))
        lastForestMap[group] = lastForests

    # Learn the forests at all remaining iterations and return the directory
    # that contains the forests from the last iteration.
    if opts.iterations > 1:
        if batchMode:
            itrPath = Batch(opts, pool, initPath, allProts, sampleMap,
                            potentialSteinerMap, dummyNeighborMap,
                            lastForestMap, countMap, weightedPrizes,
                            negativePrizes, degPenalties)
        else:
            itrPath = RandSequential(opts, initPath, allProts, sampleMap,
                                     potentialSteinerMap, dummyNeighborMap,
                                     lastForestMap, countMap, weightedPrizes,
                                     negativePrizes, degPenalties)

    # Prune Steiner nodes from the forests that are not used to reach any prizes and
    # are only present because they were in the common set.
    # This is not necessary if only 1 iteration was run because in that case there
    # is no common set.
    # It is also not necessary if negative prizes were used.
    if opts.iterations > 1 and (not negativePrizes):
        print "Learning final forests"
        print "Pruning forests from %s" % itrPath
        finalPath = os.path.join(opts.resultPath, "final")
        if not os.path.exists(finalPath):
            os.makedirs(finalPath)

        # Nothing is returned by these operations so they can be performed
        # simultaneously independent of the groupings
        sampleNames = FlattenDict(sampleMap, groups)
        dummyNeighborFiles = FlattenDict(dummyNeighborMap, groups)
        potentialSteiner = FlattenDict(potentialSteinerMap, groups)

        for i in range(len(sampleNames)):
            forestFile = "%s/symbol_%s_%s_1.0_%d.txt" % (
                itrPath, sampleNames[i], str(opts.W), opts.depth)
            FilterStpEdges(forestFile, initPath, finalPath, sampleNames[i],
                           degPenalties, potentialSteiner[i])
            shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                            os.path.join(finalPath, dummyNeighborFiles[i]))

        zippedArgs = itertools.izip(itertools.repeat(opts),
                                    itertools.repeat(finalPath),
                                    itertools.repeat(finalPath), sampleNames,
                                    dummyNeighborFiles, itertools.repeat(1))
        pool.map(LearnSteinerHelper, zippedArgs)

    print "Finishing constrained multi-sample Steiner forest %s" % time.strftime(
        "%a, %d %b %Y %H:%M:%S", time.localtime())

    pool.close()