Example #1
0
def RandSequential(opts, initPath, allProts, sampleMap, potentialSteinerMap,
                   dummyNeighborMap, lastForestMap, countMap, weightedPrizes,
                   negativePrizes, degPenalties):
    """Learn forests for iterations 2..opts.iterations, one sample at a time.

    At every iteration a directory ``<opts.resultPath>/itr<N>`` is created if
    needed.  Within each group the samples are visited in a freshly shuffled
    order (written to ``sampleOrder_<group>.txt``).  For each sample the
    artificial prizes are built from the forests of the *other* samples in
    the group, written to ``<sample>_artificialPrizes.txt``, applied through
    ``UpdateStp``, and a new forest is learned with ``LearnSteiner`` using
    ``opts.workers``.  The freshly learned forest immediately replaces the
    sample's entry in the group's forest list, so later samples in the same
    pass see it.

    The body is written in the syntax subset shared by Python 2 and 3
    (function-call ``print``, plain dict iteration, ``list(range(...))``).

    Args:
        opts: Parsed options; reads ``iterations``, ``resultPath``,
            ``lambda1``, ``lambda2``, ``workers``, ``W`` and ``depth``.
        initPath: Directory holding the initial stp and dummy neighbor files.
        allProts: Passed through to the prize-creation helpers.
        sampleMap: Maps group -> list of sample names.
        potentialSteinerMap: Maps group -> per-sample potential Steiner nodes.
        dummyNeighborMap: Maps group -> per-sample dummy neighbor filenames.
        lastForestMap: Maps group -> per-sample forests from the previous
            round.  The lists are updated in place.
        countMap: Maps group -> expected number of samples.
        weightedPrizes: If true use ``CreateWgtPrizes``, otherwise
            ``CreateUnwgtPrizes``.
        negativePrizes: Passed through to the prize-creation helpers.
        degPenalties: Passed through to ``UpdateStp``.

    Returns:
        The directory of the last iteration that ran, or ``initPath`` when
        ``opts.iterations`` is less than 2.

    Raises:
        RuntimeError: If the per-group lists do not all have
            ``countMap[group]`` entries.
    """
    print("Learning forests in random sequential mode")

    # Iterate (rounds 2+)
    itrPath = initPath
    for itr in range(2, opts.iterations + 1):
        itrPath = os.path.join(opts.resultPath, "itr%d" % itr)
        if not os.path.exists(itrPath):
            os.makedirs(itrPath)

        # Only constrain the Steiner forests to be similar to other samples in the same group
        for group in sampleMap:
            sampleNames = sampleMap[group]
            numSamples = countMap[group]
            potentialSteiner = potentialSteinerMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            lastForests = lastForestMap[group]

            perSampleLists = (sampleNames, potentialSteiner,
                              dummyNeighborFiles, lastForests)
            if any(len(entries) != numSamples for entries in perSampleLists):
                raise RuntimeError(
                    "Must have the same number of samples in group %s" % group)

            # Randomly choose the order in which to learn forests at this iteration.
            # list() is required because a Python 3 range cannot be shuffled.
            order = list(range(numSamples))
            random.shuffle(order)

            # Write the order to a file
            orderFile = os.path.join(itrPath, "sampleOrder_%s.txt" % group)
            with open(orderFile, "w") as f:
                for index in order:
                    f.write("%d\t%s\n" % (index, sampleNames[index]))

            # Iterate over all samples in the random order
            for index in order:
                sampleName = sampleNames[index]
                dummyNeighborFile = dummyNeighborFiles[index]

                # Create artificial prizes for this sample using all N-1 lastForests
                otherLastForests = [
                    forest for pos, forest in enumerate(lastForests)
                    if pos != index
                ]
                if weightedPrizes:
                    # lambda2 is used as the alpha parameter
                    artificialPrizes = CreateWgtPrizes(
                        allProts, otherLastForests, opts.lambda1,
                        opts.lambda2, negativePrizes)
                else:
                    # Use all N-1 other sets of potential Steiner nodes
                    otherPotentialSteiner = [
                        nodes for pos, nodes in enumerate(potentialSteiner)
                        if pos != index
                    ]
                    artificialPrizes = CreateUnwgtPrizes(
                        allProts, otherPotentialSteiner, otherLastForests,
                        opts.lambda1, opts.lambda2, negativePrizes)
                NetworkUtil.WriteDict(
                    os.path.join(itrPath,
                                 "%s_artificialPrizes.txt" % sampleName),
                    artificialPrizes)

                # Update the stp file based on the artificial prizes and degree
                # penalties and copy the dummy neighbors
                UpdateStp(artificialPrizes, degPenalties,
                          potentialSteiner[index], initPath, itrPath,
                          sampleName)
                shutil.copyfile(os.path.join(initPath, dummyNeighborFile),
                                os.path.join(itrPath, dummyNeighborFile))

                # Learn a new forest for this sample and update lastForests.
                # All samples (besides the first and last in the random order)
                # will use last forests that are a mix of forests from this
                # iteration and the previous iteration.
                LearnSteiner(opts, itrPath, itrPath, sampleName,
                             dummyNeighborFile, opts.workers)
                lastForests[index] = LoadForestNodes(
                    "%s/symbol_%s_%s_1.0_%d.txt" %
                    (itrPath, sampleName, str(opts.W), opts.depth))

            # Store all forests learned for this group at this iteration so
            # they can be retrieved at the next iteration
            lastForestMap[group] = lastForests

    return itrPath
Example #2
0
def Batch(opts, pool, initPath, allProts, sampleMap, potentialSteinerMap,
          dummyNeighborMap, lastForestMap, countMap, weightedPrizes,
          negativePrizes, degPenalties):
    """Learn forests for iterations 2..opts.iterations, one group batch at a time.

    At every iteration a directory ``<opts.resultPath>/itr<N>`` is created if
    needed.  For each group a single set of artificial prizes is computed
    from all of the group's forests of the previous round, written to
    ``artificialPrizes_<group>.txt``, and applied to every sample through
    ``UpdateStp``.  The forests of all samples in the group are then learned
    through ``pool.map(LearnSteinerHelper, ...)`` and reloaded into
    ``lastForestMap[group]``.

    The body is written in the syntax subset shared by Python 2 and 3
    (function-call ``print``, plain dict iteration, builtin ``zip``).

    Args:
        opts: Parsed options; reads ``iterations``, ``resultPath``,
            ``lambda1``, ``lambda2``, ``W`` and ``depth``.
        pool: Object whose ``map`` method runs ``LearnSteinerHelper`` over the
            per-sample argument tuples.
        initPath: Directory holding the initial stp and dummy neighbor files.
        allProts: Passed through to the prize-creation helpers.
        sampleMap: Maps group -> list of sample names.
        potentialSteinerMap: Maps group -> per-sample potential Steiner nodes.
        dummyNeighborMap: Maps group -> per-sample dummy neighbor filenames.
        lastForestMap: Maps group -> per-sample forests from the previous
            round.  Each group's entry is replaced with a new list.
        countMap: Maps group -> expected number of samples.
        weightedPrizes: If true use ``CreateWgtPrizes``, otherwise
            ``CreateUnwgtPrizes``.
        negativePrizes: Passed through to the prize-creation helpers.
        degPenalties: Passed through to ``UpdateStp``.

    Returns:
        The directory of the last iteration that ran, or ``initPath`` when
        ``opts.iterations`` is less than 2.

    Raises:
        RuntimeError: If the per-group lists do not all have
            ``countMap[group]`` entries.
    """
    print("Learning forests in parallel batch mode")

    # Iterate (rounds 2+)
    itrPath = initPath
    for itr in range(2, opts.iterations + 1):
        itrPath = os.path.join(opts.resultPath, "itr%d" % itr)
        if not os.path.exists(itrPath):
            os.makedirs(itrPath)

        # Only constrain the Steiner forests to be similar to other samples in the same group
        for group in sampleMap:
            sampleNames = sampleMap[group]
            numSamples = countMap[group]
            potentialSteiner = potentialSteinerMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            lastForests = lastForestMap[group]

            perSampleLists = (sampleNames, potentialSteiner,
                              dummyNeighborFiles, lastForests)
            if any(len(entries) != numSamples for entries in perSampleLists):
                raise RuntimeError(
                    "Must have the same number of samples in group %s" % group)

            # Update artificial prizes based on the forests from the previous iteration
            if weightedPrizes:
                # lambda2 is used as the alpha parameter
                artificialPrizes = CreateWgtPrizes(
                    allProts, lastForests, opts.lambda1, opts.lambda2,
                    negativePrizes)
            else:
                artificialPrizes = CreateUnwgtPrizes(
                    allProts, potentialSteiner, lastForests, opts.lambda1,
                    opts.lambda2, negativePrizes)
            NetworkUtil.WriteDict(
                os.path.join(itrPath, "artificialPrizes_%s.txt" % group),
                artificialPrizes)
            print("%d artificial prizes in group %s at iteration %d" %
                  (len(artificialPrizes), group, itr))

            # Update the stp files based on the new artificial prizes and degree penalties
            # and copy the dummy neighbor files, which need to be in itrPath
            for i in range(numSamples):
                UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i],
                          initPath, itrPath, sampleNames[i])
                shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                                os.path.join(itrPath, dummyNeighborFiles[i]))

            # Learn new Steiner forests in parallel.  zip stops at the end of
            # the finite per-sample lists; the repeat() iterators supply the
            # arguments that are the same for every sample.
            zippedArgs = zip(itertools.repeat(opts),
                             itertools.repeat(itrPath),
                             itertools.repeat(itrPath),
                             sampleNames, dummyNeighborFiles,
                             itertools.repeat(1))
            pool.map(LearnSteinerHelper, zippedArgs)

            # Reload the forests just written so the next iteration can use them
            lastForestMap[group] = [
                LoadForestNodes(
                    "%s/symbol_%s_%s_1.0_%d.txt" %
                    (itrPath, sampleNames[i], str(opts.W), opts.depth))
                for i in range(numSamples)
            ]

    return itrPath
Example #3
0
def main(argList):
    """Run the constrained multi-sample Steiner forest pipeline.

    Steps, in order:
      1. Parse ``argList`` with the parser from ``CreateParser`` and derive
         the prize mode (negative/positive, weighted/unweighted) and the
         iteration mode (batch/random) from the options.
      2. Load the interactome proteins, the degree penalties and the
         terminal files.
      3. Create the initial stp files under ``<resultPath>/initial`` with a
         worker pool, and record the potential Steiner nodes and dummy
         neighbor file of every sample.
      4. Learn the iteration-1 forests under ``<resultPath>/itr1`` with empty
         artificial prizes.
      5. If more than one iteration was requested, run ``Batch`` or
         ``RandSequential`` for the remaining iterations.
      6. If more than one iteration was run with positive prizes, filter the
         stp files into ``<resultPath>/final`` and learn the final forests.

    The worker pool is always released: it is terminated if any step raises,
    closed on success, and joined in both cases.

    Args:
        argList: List of argument strings handed to ``parser.parse_args``.

    Raises:
        RuntimeError: If ``opts.iterations`` is less than 1.
    """
    # Parse the arguments, which either come from the command line or a list
    # provided by the Python code calling this function
    parser = CreateParser()
    (opts, args) = parser.parse_args(argList)

    print("Starting constrained multi-sample Steiner forest %s" %
          time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
    print("Multi-PCSF version %s" % __version__)
    print("Parameters: %s" % opts)

    # TODO Add error checking of inputs
    if opts.iterations < 1:
        raise RuntimeError("Must have at least 1 iteration")
    # TODO Should allow the option to run serially without the pool because a
    # pool with 1 worker is not efficient
    if opts.workers < 1:
        opts.workers = multiprocessing.cpu_count()

    # Negative prizes implement the common set unless positive common set
    # prizes were requested
    negativePrizes = "positive" not in opts.artificialPrizes

    # Prizes are unweighted unless weighted prizes were requested
    weightedPrizes = "Weighted" in opts.artificialPrizes

    # Batch mode is used unless the random iteration mode was requested
    batchMode = opts.iterMode != "random"

    # Load all of the proteins in the interactome, ignoring
    # genes.  The artificial prizes will be created for a subset of these nodes.
    allProts = LoadProteins(opts.interactomePath, opts.undirectedFile,
                            opts.directedFile, opts.tfdnaFile)

    # Load the negative prizes for the degree penalties or an empty dictionary
    # if they aren't being used
    directedFile = "None"
    if opts.directedFile != "None":
        directedFile = os.path.join(opts.interactomePath, opts.directedFile)
    degPenalties = NetworkUtil.DegreePenalties(
        opts.mu, os.path.join(opts.interactomePath, opts.undirectedFile),
        directedFile)

    # Create the initial stp files
    # New directory to hold the original data before the iterations begin
    # These stp files will be read and updated at subsequent iterations
    initPath = os.path.join(opts.resultPath, "initial")
    if not os.path.exists(initPath):
        os.makedirs(initPath)

    # Load the list of terminal files and the sample-to-group mapping
    terminalMap, sampleMap, countMap = LoadTerminalFiles(
        opts.terminalPath, opts.masterTerminalFile)
    # Store the groups in a fixed order
    groups = sorted(terminalMap)
    for group in groups:
        print("%d samples in group %s" % (countMap[group], group))

    # Create a pool for creating .stp files and learning Steiner forests in parallel
    # using the specified number of workers.  Use it to create the initial
    # .stp files.  Even when running the subsequent iterations in random sequential
    # order, create a pool to learn the initial trees and final pruned trees (if applicable).
    print("Creating a pool with %d workers" % opts.workers)
    pool = multiprocessing.Pool(opts.workers)
    try:
        initialStpMap = dict()
        for group in groups:
            terminalFiles = terminalMap[group]
            sampleNames = sampleMap[group]
            # opts and initPath are invariant arguments for each sample
            zippedArgs = zip(itertools.repeat(opts),
                             itertools.repeat(initPath), terminalFiles,
                             sampleNames)
            initialStpMap[group] = pool.map(
                CreateStpHelper, zippedArgs)  # Blocks until all are finished

        # Store which proteins don't have prizes for each patient.
        # These are the nodes that could potentially be Steiner nodes for
        # each sample.  This can't be recovered from the stp files at later
        # iterations because both original prizes and artificial prizes will exist.
        # Also track how the dummy node will be connected
        # to the networks, either all prizes or all non-prizes (potential Steiner nodes)
        potentialSteinerMap = dict()
        dummyNeighborMap = dict()
        for group in groups:
            numSamples = countMap[group]
            sampleNames = sampleMap[group]
            initialStps = initialStpMap[group]
            potentialSteiner = []  # A list of sets
            dummyNeighborFiles = []  # A list of filenames
            for i in range(numSamples):
                dnFile = sampleNames[i] + "_dummyNeighbors.txt"
                dummyNeighborFiles.append(dnFile)
                potentialSteiner.append(
                    DummyNeighbors(allProts, initPath, initialStps[i], dnFile,
                                   opts.dummyNeighbors))
            potentialSteinerMap[group] = potentialSteiner
            dummyNeighborMap[group] = dummyNeighborFiles

        itrPath = os.path.join(opts.resultPath, "itr1")
        if not os.path.exists(itrPath):
            os.makedirs(itrPath)

        # Initialize the artificial prizes to be an empty dictionary so that
        # we learn the initial trees independently
        artificialPrizes = dict()
        # Write the unused itr1 artificial prizes so that the files exist for post-processing
        for group in groups:
            NetworkUtil.WriteDict(
                os.path.join(itrPath, "artificialPrizes_%s.txt" % group),
                artificialPrizes)
        print("%d artificial prizes at iteration 1" % len(artificialPrizes))

        # Add the degree penalties to the initial stp files.  Pass in the empty artificial prize
        # dictionary, which won't have an effect.
        for group in groups:
            sampleNames = sampleMap[group]
            numSamples = countMap[group]
            potentialSteiner = potentialSteinerMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            for i in range(numSamples):
                # Copy the dummy neighbors, which must be in the same directory as the stp file
                UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i],
                          initPath, itrPath, sampleNames[i])
                shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                                os.path.join(itrPath, dummyNeighborFiles[i]))

        # Learn the first iteration Steiner forests in parallel
        # Run single-threaded belief propagation when using the worker pool
        lastForestMap = dict()
        for group in groups:
            numSamples = countMap[group]
            sampleNames = sampleMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            zippedArgs = zip(itertools.repeat(opts),
                             itertools.repeat(itrPath),
                             itertools.repeat(itrPath), sampleNames,
                             dummyNeighborFiles, itertools.repeat(1))
            pool.map(LearnSteinerHelper, zippedArgs)
            # A list of sets, where each set contains the Steiner forest nodes
            lastForestMap[group] = [
                LoadForestNodes(
                    "%s/symbol_%s_%s_1.0_%d.txt" %
                    (itrPath, sampleNames[i], str(opts.W), opts.depth))
                for i in range(numSamples)
            ]

        # Learn the forests at all remaining iterations and return the directory
        # that contains the forests from the last iteration.
        if opts.iterations > 1:
            if batchMode:
                itrPath = Batch(opts, pool, initPath, allProts, sampleMap,
                                potentialSteinerMap, dummyNeighborMap,
                                lastForestMap, countMap, weightedPrizes,
                                negativePrizes, degPenalties)
            else:
                itrPath = RandSequential(opts, initPath, allProts, sampleMap,
                                         potentialSteinerMap, dummyNeighborMap,
                                         lastForestMap, countMap,
                                         weightedPrizes, negativePrizes,
                                         degPenalties)

        # Prune Steiner nodes from the forests that are not used to reach any prizes and
        # are only present because they were in the common set.
        # This is not necessary if only 1 iteration was run because in that case there
        # is no common set.
        # It is also not necessary if negative prizes were used.
        if opts.iterations > 1 and (not negativePrizes):
            print("Learning final forests")
            print("Pruning forests from %s" % itrPath)
            finalPath = os.path.join(opts.resultPath, "final")
            if not os.path.exists(finalPath):
                os.makedirs(finalPath)

            # Nothing is returned by these operations so they can be performed
            # simultaneously independent of the groupings
            sampleNames = FlattenDict(sampleMap, groups)
            dummyNeighborFiles = FlattenDict(dummyNeighborMap, groups)
            potentialSteiner = FlattenDict(potentialSteinerMap, groups)

            for i in range(len(sampleNames)):
                forestFile = "%s/symbol_%s_%s_1.0_%d.txt" % (
                    itrPath, sampleNames[i], str(opts.W), opts.depth)
                FilterStpEdges(forestFile, initPath, finalPath, sampleNames[i],
                               degPenalties, potentialSteiner[i])
                shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                                os.path.join(finalPath, dummyNeighborFiles[i]))

            zippedArgs = zip(itertools.repeat(opts),
                             itertools.repeat(finalPath),
                             itertools.repeat(finalPath), sampleNames,
                             dummyNeighborFiles, itertools.repeat(1))
            pool.map(LearnSteinerHelper, zippedArgs)

        print("Finishing constrained multi-sample Steiner forest %s" %
              time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
    except BaseException:
        # Something failed (or the run was interrupted): stop the workers
        # right away instead of leaving them behind, then re-raise.
        pool.terminate()
        raise
    else:
        # Normal completion: no more tasks will be submitted.
        pool.close()
    finally:
        # Wait for the worker processes to exit in both cases.
        pool.join()