def main(argList):
    # Parse the arguments, which either come from the command line or a list
    # provided by the Python code calling this function
    parser = CreateParser()
    (opts, args) = parser.parse_args(argList)

    print "Starting constrained multi-sample Steiner forest %s" % time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print "Multi-PCSF version %s" % __version__
    print "Parameters: %s" % opts

    # TODO Add error checking of inputs
    if opts.iterations < 1:
        raise RuntimeError("Must have at least 1 iteration")

    # TODO Should allow the option to run serially without the pool because a
    # pool with 1 worker is not efficient
    if opts.workers < 1:
        opts.workers = multiprocessing.cpu_count()

    # Assume negative prizes to implement the common set
    # and change if using positive common set prizes
    negativePrizes = True
    if "positive" in opts.artificialPrizes:
        negativePrizes = False

    # Assume unweighted prizes
    weightedPrizes = False
    if "Weighted" in opts.artificialPrizes:
        weightedPrizes = True

    # Assume batch mode
    batchMode = True
    if opts.iterMode == "random":
        batchMode = False

    # Load all of the proteins in the interactome, ignoring genes.
    # The artificial prizes will be created for a subset of these nodes.
    allProts = LoadProteins(opts.interactomePath, opts.undirectedFile,
                            opts.directedFile, opts.tfdnaFile)

    # Load the negative prizes for the degree penalties or an empty dictionary
    # if they aren't being used
    directedFile = "None"
    if opts.directedFile != "None":
        directedFile = os.path.join(opts.interactomePath, opts.directedFile)
    degPenalties = NetworkUtil.DegreePenalties(opts.mu,
                                               os.path.join(opts.interactomePath, opts.undirectedFile),
                                               directedFile)

    # Create the initial stp files
    # New directory to hold the original data before the iterations begin
    # These stp files will be read and updated at subsequent iterations
    initPath = os.path.join(opts.resultPath, "initial")
    if not os.path.exists(initPath):
        os.makedirs(initPath)

    # Load the list of terminal files and the sample-to-group mapping
    terminalMap, sampleMap, countMap = LoadTerminalFiles(opts.terminalPath,
                                                         opts.masterTerminalFile)
    # Store the groups in a fixed order
    groups = sorted(terminalMap.iterkeys())
    for group in groups:
        print "%d samples in group %s" % (countMap[group], group)

    # Create a pool for creating .stp files and learning Steiner forests in parallel
    # using the specified number of workers.  Use it to create the initial
    # .stp files.  Even when running the subsequent iterations in random sequential
    # order, create a pool to learn the initial trees and final pruned trees (if applicable).
    print "Creating a pool with %d workers" % opts.workers
    pool = multiprocessing.Pool(opts.workers)

    initialStpMap = dict()
    for group in groups:
        terminalFiles = terminalMap[group]
        sampleNames = sampleMap[group]
        # opts and initPath are invariant arguments for each sample
        zippedArgs = itertools.izip(itertools.repeat(opts),
                                    itertools.repeat(initPath),
                                    terminalFiles, sampleNames)
        initialStpMap[group] = pool.map(CreateStpHelper, zippedArgs)  # Blocks until all are finished

    # Store which proteins don't have prizes for each patient.
    # These are the nodes that could potentially be Steiner nodes for
    # each sample.  This can't be recovered from the stp files at later
    # iterations because both original prizes and artificial prizes will exist.
    # Also track how the dummy node will be connected to the networks,
    # either all prizes or all non-prizes (potential Steiner nodes)
    potentialSteinerMap = dict()
    dummyNeighborMap = dict()
    for group in groups:
        numSamples = countMap[group]
        sampleNames = sampleMap[group]
        initialStps = initialStpMap[group]
        potentialSteiner = []  # A list of sets
        dummyNeighborFiles = []  # A list of filenames
        for i in range(numSamples):
            dnFile = sampleNames[i] + "_dummyNeighbors.txt"
            dummyNeighborFiles.append(dnFile)
            potentialSteiner.append(DummyNeighbors(allProts, initPath,
                                                   initialStps[i], dnFile,
                                                   opts.dummyNeighbors))
        potentialSteinerMap[group] = potentialSteiner
        dummyNeighborMap[group] = dummyNeighborFiles

    itrPath = os.path.join(opts.resultPath, "itr1")
    if not os.path.exists(itrPath):
        os.makedirs(itrPath)

    # Initialize the artificial prizes to be an empty dictionary so that
    # we learn the initial trees independently
    artificialPrizes = dict()
    # Write the unused itr1 artificial prizes so that the files exist for post-processing
    for group in groups:
        NetworkUtil.WriteDict(os.path.join(itrPath, "artificialPrizes_%s.txt" % group),
                              artificialPrizes)
    print "%d artificial prizes at iteration 1" % len(artificialPrizes)

    # Add the degree penalties to the initial stp files.  Pass in the empty
    # artificial prize dictionary, which won't have an effect.
    for group in groups:
        sampleNames = sampleMap[group]
        numSamples = countMap[group]
        potentialSteiner = potentialSteinerMap[group]
        dummyNeighborFiles = dummyNeighborMap[group]
        for i in range(numSamples):
            # Copy the dummy neighbors, which must be in the same directory as the stp file
            UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i],
                      initPath, itrPath, sampleNames[i])
            shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                            os.path.join(itrPath, dummyNeighborFiles[i]))

    # Learn the first iteration Steiner forests in parallel
    # Run single-threaded belief propagation when using the worker pool
    lastForestMap = dict()
    for group in groups:
        numSamples = countMap[group]
        sampleNames = sampleMap[group]
        dummyNeighborFiles = dummyNeighborMap[group]
        zippedArgs = itertools.izip(itertools.repeat(opts),
                                    itertools.repeat(itrPath),
                                    itertools.repeat(itrPath),
                                    sampleNames, dummyNeighborFiles,
                                    itertools.repeat(1))
        pool.map(LearnSteinerHelper, zippedArgs)
        lastForests = []  # A list of sets, where each set contains the Steiner forest nodes
        for i in range(numSamples):
            lastForests.append(LoadForestNodes("%s/symbol_%s_%s_1.0_%d.txt" %
                                               (itrPath, sampleNames[i],
                                                str(opts.W), opts.depth)))
        lastForestMap[group] = lastForests

    # Learn the forests at all remaining iterations and return the directory
    # that contains the forests from the last iteration.
    if opts.iterations > 1:
        if batchMode:
            itrPath = Batch(opts, pool, initPath, allProts, sampleMap,
                            potentialSteinerMap, dummyNeighborMap,
                            lastForestMap, countMap, weightedPrizes,
                            negativePrizes, degPenalties)
        else:
            itrPath = RandSequential(opts, initPath, allProts, sampleMap,
                                     potentialSteinerMap, dummyNeighborMap,
                                     lastForestMap, countMap, weightedPrizes,
                                     negativePrizes, degPenalties)

    # Prune Steiner nodes from the forests that are not used to reach any prizes
    # and are only present because they were in the common set.
    # This is not necessary if only 1 iteration was run because in that case
    # there is no common set.
    # It is also not necessary if negative prizes were used.
    if opts.iterations > 1 and (not negativePrizes):
        print "Learning final forests"
        print "Pruning forests from %s" % itrPath
        finalPath = os.path.join(opts.resultPath, "final")
        if not os.path.exists(finalPath):
            os.makedirs(finalPath)

        # Nothing is returned by these operations so they can be performed
        # simultaneously, independent of the groupings
        sampleNames = FlattenDict(sampleMap, groups)
        dummyNeighborFiles = FlattenDict(dummyNeighborMap, groups)
        potentialSteiner = FlattenDict(potentialSteinerMap, groups)
        for i in range(len(sampleNames)):
            forestFile = "%s/symbol_%s_%s_1.0_%d.txt" % (itrPath, sampleNames[i],
                                                         str(opts.W), opts.depth)
            FilterStpEdges(forestFile, initPath, finalPath, sampleNames[i],
                           degPenalties, potentialSteiner[i])
            shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                            os.path.join(finalPath, dummyNeighborFiles[i]))
        zippedArgs = itertools.izip(itertools.repeat(opts),
                                    itertools.repeat(finalPath),
                                    itertools.repeat(finalPath),
                                    sampleNames, dummyNeighborFiles,
                                    itertools.repeat(1))
        pool.map(LearnSteinerHelper, zippedArgs)

    print "Finishing constrained multi-sample Steiner forest %s" % time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    pool.close()
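
# Hedged sketch (an assumption, not taken from the original source): the helper
# functions handed to pool.map above receive a single tuple assembled with
# itertools.izip/itertools.repeat, so each is assumed to unpack that tuple and
# dispatch to the real worker, roughly:
#
#     def CreateStpHelper(args):
#         # Hypothetical unpacking; the actual helper is defined elsewhere
#         opts, initPath, terminalFile, sampleName = args
#         return CreateStp(opts, initPath, terminalFile, sampleName)
#
# A minimal command-line entry point, assuming "import sys" appears at the top
# of the module and that no other __main__ guard is already defined:
if __name__ == "__main__":
    main(sys.argv[1:])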