def run(args):
    """Run CoMEt exhaustively and write the ranked gene sets to a TSV file.

    Loads the mutation data, converts it to the C extension's format, asks
    ``C.exhaustive`` for every gene set of size ``args.gene_set_size`` and
    writes the sets, ordered by decreasing weight, to
    ``<output_prefix>-k<k>-<weight_func>-exhaustive.tsv``.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``min_freq``, ``gene_set_size``,
            ``weight_func``, ``verbose`` and ``output_prefix``.

    Returns:
        A list of ``(genes, prob, weight)`` tuples in the same order as the
        rows of the output file; ``genes`` is a sorted list of gene names.
    """
    k = args.gene_set_size
    wf = args.weight_func
    # NOTE(review): 1.1 is larger than any probability, so this presumably
    # disables p-value filtering inside C.exhaustive -- confirm.
    pvalThresh = 1.1

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the CoMEt run and then run exhaustively
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # Gene names ordered by their C-side index, so genes[i] names index i.
    genes = sorted(geneToIndex, key=lambda g: geneToIndex[g])

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Release the factorial table even when the enumeration fails.
        C.free_factorials()

    # Parse the output: rank by weight, largest first
    solns, weights, tables, probs = results
    ranked = sorted(zip(solns, weights, tables, probs),
                    key=lambda row: row[1], reverse=True)
    rows = [(sorted(genes[g] for g in geneset), p, w)
            for geneset, w, _, p in ranked]

    # Output sets, probs and weights as TSV. The header lists exactly the
    # three columns that each row contains.
    outputFile = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outputFile, "w") as outfile:
        lines = ["#Gene set\tP-value\tWeight"]
        lines.extend("\t".join([", ".join(s), str(p), str(w)])
                     for s, p, w in rows)
        outfile.write("\n".join(lines))

    return rows
def run(args):
    """Run CoMEt exhaustively and write the ranked gene sets to a TSV file.

    Loads the mutation data, converts it to the C extension's format, asks
    ``C.exhaustive`` for every gene set of size ``args.gene_set_size`` and
    writes the sets, ordered by decreasing weight, to
    ``<output_prefix>-k<k>-<weight_func>-exhaustive.tsv``.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``min_freq``, ``gene_set_size``,
            ``weight_func``, ``verbose`` and ``output_prefix``.

    Returns:
        A list of ``(genes, prob, weight)`` tuples in the same order as the
        rows of the output file; ``genes`` is a sorted list of gene names.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    wf = args.weight_func
    # NOTE(review): 1.1 is larger than any probability, so this presumably
    # disables p-value filtering inside C.exhaustive -- confirm.
    pvalThresh = 1.1

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        # Single pre-formatted argument: prints the same text whether
        # print is a statement or a function.
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the CoMEt run and then run exhaustively
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # Gene names ordered by their C-side index, so genes[i] names index i.
    genes = sorted(geneToIndex, key=lambda g: geneToIndex[g])

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Release the factorial table even when the enumeration fails.
        C.free_factorials()

    # Parse the output. sorted() is used instead of zip(...).sort() because
    # zip() only returns a list on Python 2.
    solns, weights, tables, probs = results
    res = sorted(zip(solns, weights, tables, probs),
                 key=lambda arr: arr[1], reverse=True)  # by weight decreasing
    output_rows = [(sorted(genes[g] for g in geneset), p, w)
                   for geneset, w, _, p in res]

    # Output sets, probs and weights as TSV. The header lists exactly the
    # three columns that each row contains.
    with open("%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf),
              "w") as outfile:
        output = ["\t".join([", ".join(s), str(p), str(w)])
                  for s, p, w in output_rows]
        output.insert(0, "#Gene set\tP-value\tWeight")
        outfile.write("\n".join(output))

    # A concrete list (not a lazy zip object) so callers can index or
    # iterate it more than once.
    return output_rows
def run(args):
    """Run the CoMEt sampler and write the sampled collections to a TSV file.

    With ``args.num_initial > 1`` one chain is run per initial solution from
    ``initial_solns_generator``. The chains are extended (the total number of
    iterations grows by a factor of 1.5 each round) until
    ``C.discrete_convergence`` falls below ``args.total_distance_cutoff`` or
    the next length would exceed ``args.n_stop``; the chains are then merged.
    Otherwise a single chain of ``args.num_iterations`` iterations is run
    from an empty initial solution.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``min_freq``, ``num_initial``,
            ``gene_set_sizes``, ``num_iterations``, ``step_length``,
            ``n_stop``, ``accelerator``, ``nt``, ``binom_cut``, ``exact_cut``,
            ``subtype``, ``verbose``, ``seed``, ``precomputed_scores``,
            ``initial_soln``, ``total_distance_cutoff`` and ``output_prefix``.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    # Parse the arguments into shorter variable handles
    t = len(args.gene_set_sizes)  # number of pathways
    ks = args.gene_set_sizes  # size of each pathway
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor applied to N while chains have not converged

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n = mutations[0], mutations[1]

    if args.subtype:
        with open(args.subtype) as f:
            subSet = [l.rstrip() for l in f]
    else:
        subSet = list()

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C
        if args.precomputed_scores:
            load_precomputed_scores(args.precomputed_scores, mutations,
                                    subSet)

        # num_initial > 1: convergence pipeline; otherwise one run only
        if args.num_initial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet)
            runN = N
            while True:
                lastSolns = list()
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, True)
                    # Floor division keeps the integer value the Python 2
                    # "/" produced for this integer counter.
                    maxrss = resource.getrusage(
                        resource.RUSAGE_SELF).ru_maxrss // 1000
                    print("Mem usage:  %s" % (maxrss,))
                    merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print("%s %s" % (finalTv, N))

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Only the additional iterations are run next round; every
                # chain resumes from where it stopped.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = merge_results(totalOut)
            # Store and output parameters into .json
            printParameters(args, ks, finalTv)
        else:
            outresults, _ = comet(mutations, n, t, ks, N, s, list(), acc,
                                  subSet, nt, hybridCutoff, args.exact_cut,
                                  True)
            results = outresults
            runNum = 1
            printParameters(args, ks, 1)
    finally:
        # Release the factorial table even when sampling fails.
        C.free_factorials()

    # Output CoMEt results to TSV
    collections = sorted(results.keys(),
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    header = "#Freq\tTotal Weight\tTarget Weight\t"
    header += "\t".join([
        "Gene set %s (k=%s)\tProb %s\tWeight function %s"
        % (i, ks[i - 1], i, i) for i in range(1, len(ks) + 1)
    ])
    tbl = [header]
    for S in collections:
        data = results[S]
        row = [data["freq"], data["total_weight"],
               format(data["target_weight"], 'g')]
        for d in sorted(data["sets"], key=lambda d: d["W"]):
            row += [", ".join(sorted(d["genes"])), d["prob"], d["num_tbls"]]
        tbl.append("\t".join(map(str, row)))

    outputFile = "%s.tsv" % iter_num(args.output_prefix + '.sum',
                                     N * (runNum), ks, args.accelerator)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(tbl))

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
def run(args):
    """Run the CoMEt sampler and write the sampled collections to a TSV file.

    With ``args.num_initial > 1`` one chain is run per initial solution from
    ``initial_solns_generator``. The chains are extended (the total number of
    iterations grows by a factor of 1.5 each round) until
    ``C.discrete_convergence`` falls below ``args.total_distance_cutoff`` or
    the next length would exceed ``args.n_stop``; the chains are then merged.
    Otherwise a single chain of ``args.num_iterations`` iterations is run
    from an empty initial solution.

    In the output table the per-set ``num_tbls`` code is written as a letter
    via ``{0: 'E', 1: 'E', 2: 'B', 3: 'P'}``.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``min_freq``, ``num_initial``,
            ``gene_set_sizes``, ``num_iterations``, ``step_length``,
            ``n_stop``, ``accelerator``, ``nt``, ``binom_cut``, ``exact_cut``,
            ``subtype``, ``verbose``, ``seed``, ``precomputed_scores``,
            ``initial_soln``, ``total_distance_cutoff`` and ``output_prefix``.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    t = len(args.gene_set_sizes)  # number of pathways
    ks = args.gene_set_sizes  # size of each pathway
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor applied to N while chains have not converged

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]

    if args.subtype:
        with open(args.subtype) as f:
            subSet = [l.rstrip() for l in f]
    else:
        subSet = list()

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C
        if args.precomputed_scores:
            load_precomputed_scores(args.precomputed_scores, mutations,
                                    subSet)

        # num_initial > 1: convergence pipeline; otherwise one run only
        if args.num_initial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet)
            runN = N
            while True:
                lastSolns = list()
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, True)
                    # Floor division keeps the integer value the Python 2
                    # "/" produced for this integer counter.
                    maxrss = resource.getrusage(
                        resource.RUSAGE_SELF).ru_maxrss // 1000
                    print("Mem usage:  %s" % (maxrss,))
                    merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print("%s %s" % (finalTv, N))

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Only the additional iterations are run next round; every
                # chain resumes from where it stopped.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = merge_results(totalOut)
            # Store and output parameters into .json
            printParameters(args, ks, finalTv)
        else:
            outresults, _ = comet(mutations, n, t, ks, N, s, list(), acc,
                                  subSet, nt, hybridCutoff, args.exact_cut,
                                  True)
            results = outresults
            runNum = 1
            printParameters(args, ks, 1)
    finally:
        # Release the factorial table even when sampling fails.
        C.free_factorials()

    # Output CoMEt results to TSV
    collections = sorted(results.keys(),
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    # Letter written in the "Weight function" column for each num_tbls code.
    # Any code outside 0-3 raises KeyError below.
    weight_func_mapping = {0: 'E', 1: 'E', 2: 'B', 3: 'P'}
    header = "#Freq\tTotal Weight\tTarget Weight\t"
    header += "\t".join([
        "Gene set %s (k=%s)\tPhi %s\tWeight function %s"
        % (i, ks[i - 1], i, i) for i in range(1, len(ks) + 1)
    ])
    tbl = [header]
    for S in collections:
        data = results[S]
        row = [data["freq"], data["total_weight"],
               format(data["target_weight"], 'g')]
        for d in sorted(data["sets"], key=lambda d: d["W"]):
            row += [", ".join(sorted(d["genes"])), d["prob"],
                    weight_func_mapping[d["num_tbls"]]]
        tbl.append("\t".join(map(str, row)))

    outputFile = "%s.tsv" % iter_num(args.output_prefix + '.sum',
                                     N * (runNum), ks, args.accelerator)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(tbl))

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
def run(args):
    """Run CoMEt on the real data and on permuted data, then visualise.

    Steps, in order:

    1. Run CoMEt (``runComet``) on the real data with visualisation
       disabled, writing to ``<output_directory>/comet-results``.
    2. Build the bipartite mutation graph and create
       ``args.num_permutations`` permuted matrices, each with its own seed
       derived from ``args.seed``.
    3. Run CoMEt on every permuted matrix (in a process pool when
       ``args.parallel`` is set) and take the largest value found in the
       second column of the second line of each result ``.tsv``.
    4. Call ``C.output_comet_viz`` with the real mutation data, the real
       results table and that maximum statistic.

    Args:
        args: Parsed command-line namespace. Reads ``output_directory``,
            ``mutation_matrix``, ``patient_file``, ``gene_file``,
            ``min_freq``, ``subtype``, ``verbose``, ``keep_temp_files``,
            ``seed``, ``num_permutations``, ``Q`` and ``parallel``.
            ``sys.argv`` is also read to rebuild the CoMEt command lines.

    Returns:
        None.
    """
    import shutil
    import tempfile
    from itertools import islice

    argv = sys.argv

    def argsWithout(flags):
        # argv[i] is the token that precedes ``arg`` (the enumeration starts
        # at argv[1]), so each listed flag is dropped together with the
        # token that follows it.
        # NOTE(review): if --parallel / --keep_temp_files take no value, the
        # token after them is dropped as well -- confirm against the parser.
        return [arg for i, arg in enumerate(argv[1:])
                if arg not in flags and argv[i] not in flags]

    # Set up the arguments for a general CoMEt run on real data
    realOutputDir = "{}/comet-results".format(args.output_directory)
    realCometArgs = argsWithout(["-np", "--parallel", "--keep_temp_files",
                                 "-o"])
    realCometArgs += ["-o", realOutputDir, "--noviz"]
    # Perform a simple run without viz first.
    runComet(realCometArgs)

    # Load mutation data using Multi-Dendrix
    realMutations = C.load_mutation_data(args.mutation_matrix,
                                         args.patient_file, args.gene_file,
                                         args.min_freq, args.subtype)
    m, n, genes, patients, geneToCases, patientToGenes, _ = realMutations
    if args.verbose:
        print('* Mutation data: %s genes x %s patients' % (m, n))

    # Construct bipartite graph from mutation data
    if args.verbose:
        print("* Creating bipartite graph...")
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has', len(G.edges()), 'edges among',
              len(G.nodes()), 'nodes.')

    # Reset the arguments for a general CoMEt run on permuted matrices
    cometArgs = argsWithout(["-np", "--parallel", "--keep_temp_files", "-m",
                             "-o"])
    cometArgs.append('--noviz')

    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # Generate random seeds for each permutation
        random.seed(args.seed)
        seeds = [random.randint(0, 2**31 - 1)
                 for _ in range(args.num_permutations)]

        # Create the permuted matrices and the argument list for each run
        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar
            sys.stdout.write(
                "* Running CoMEt on permuted matrices... {}/{}\r".format(
                    i + 1, args.num_permutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it to a temporary file.
            # The permuted mapping gets its own name so that the real
            # geneToCases / patientToGenes stay intact for the
            # visualisation step below.
            permuted = C.permute_mutation_data(G, genes, patients, seed,
                                               args.Q)
            _, _, _, _, _, permPatientToGenes = permuted
            adj_list = [p + "\t" + "\t".join(sorted(permPatientToGenes[p]))
                        for p in patients]

            permutation_file = "{}/permuted-matrix-{}.m2".format(directory,
                                                                  i + 1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments
            permuteArgs = list(map(str, cometArgs))
            permuteArgs += ["-m", permutation_file]
            permuteArgs += [
                "-o",
                "{}/comet-results-on-permutation-{}".format(directory, i + 1)
            ]
            arguments.append(permuteArgs)

        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results-on-permutation"):
                continue
            for df in os.listdir("{}/{}/results".format(directory, rf)):
                if not df.endswith(".tsv"):
                    continue
                with open("{}/{}/results/{}".format(directory, rf,
                                                    df)) as infile:
                    # Only the second line (the first row after the header)
                    # is inspected.
                    for line in islice(infile, 1, 2):
                        score = float(line.split("\t")[1])
                        if score > maxStat:
                            maxStat = score

        print("*" * 80)
        print("Number of permutations:", args.num_permutations)
        print("Max statistic:", maxStat)

        # Prepare comet results on real data and the output directory for
        # viz. If several .tsv files exist, the last one listed wins.
        for rf in os.listdir("{}/results/".format(realOutputDir)):
            if rf.endswith(".tsv"):
                with open("{}/results/{}".format(realOutputDir,
                                                 rf)) as infile:
                    resultsTable = [l.rstrip() for l in infile]

        realMutations = (m, n, genes, patients, geneToCases, patientToGenes)
        outputDirViz = realOutputDir + "/viz/"
        C.ensure_dir(outputDirViz)

        # Perform visualization
        C.output_comet_viz(RC.get_parser().parse_args(realCometArgs),
                           realMutations, resultsTable, maxStat,
                           args.num_permutations)
    finally:
        # Destroy the temporary directory if necessary, including after a
        # failure.
        if not args.keep_temp_files:
            shutil.rmtree(directory)
def run(args):
    """Run the CoMEt sampler and hand the result to ``C.output_comet``.

    When ``args.num_initial > 1`` one chain is started per initial solution
    and the chains are lengthened (total iterations x1.5 per round) until
    ``C.discrete_convergence`` is below ``args.total_distance_cutoff`` or the
    next length would exceed ``args.n_stop``; the chains are then merged.
    Otherwise a single chain is run from an empty initial solution.

    Args:
        args: Parsed command-line namespace.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    # Sampler settings
    minFreq = args.min_freq
    subtypeFile = args.subtype
    ks = args.gene_set_sizes  # size of each pathway
    t = len(ks)  # number of pathways
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor for chains that have not converged

    # Load the mutation data; the subtypes come back as a seventh element
    # that is kept separate from the six-element mutation tuple.
    loaded = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                  args.gene_file, minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = loaded
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    if args.verbose:
        print(f'Mutation data: {m} genes x {n} patients')

    # Events restricted during sampling: the subtypes plus, if given, one
    # core event per line of the core-events file.
    if args.core_events:
        with open(args.core_events) as f:
            coreEvents = {l.rstrip() for l in f}
        subSet = list(subtypes.union(coreEvents))
    else:
        subSet = list(subtypes)

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    C.set_random_seed(args.seed)

    # Store the scores of pre-computed collections into C
    if args.precomputed_scores:
        C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

    def sample(iterations, init):
        # One comet() call with everything but the chain length and the
        # starting solution fixed.
        return comet(mutations, n, t, ks, iterations, s, init, acc, subSet,
                     nt, hybridCutoff, args.exact_cut, args.verbose)

    if args.num_initial > 1:
        # Collect initial solutions from users, multidendrix and random.
        initialSolns, totalOut = C.initial_solns_generator(
            args.num_initial, mutations, ks, args.initial_soln, subSet,
            importMultidendrix, multi_dendrix)
        runN = N
        while True:
            lastSolns = []
            for i, init in enumerate(initialSolns):
                outresults, lastSoln = sample(runN, init)
                C.merge_runs(totalOut[i], outresults)
                lastSolns.append(lastSoln)

            finalTv = C.discrete_convergence(totalOut, int(N / s))
            print(finalTv, N)

            newN = int(N * NInc)
            if newN > NStop or finalTv < args.total_distance_cutoff:
                break
            # Next round runs only the extra iterations, resuming each chain
            # from its last solution.
            runN, N, initialSolns = newN - N, newN, lastSolns

        runNum = len(totalOut)
        results = C.merge_results(totalOut)
    else:
        results, _ = sample(N, list())
        runNum = 1

    C.free_factorials()

    # Output comet results to TSV and website
    collections = sorted(results.keys(),
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N * (runNum),
                   0, 0)

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
def run(args):
    """Build the marginal-probability-graph web page from CoMEt results.

    Loads the mutation data and event names, constructs the marginal
    probability graph from ``args.input_file``, chooses the edge-weight
    cutoff delta, renders the delta curve to a temporary SVG, and writes
    ``<output_directory>/index.html`` (the template followed by a script
    block with the JSON data) together with the required JS/CSS files.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``event_names``, ``min_freq``,
            ``minimum_sampling_frequency``, ``input_file``,
            ``comet_stats_file``, ``standard_error_cutoff``,
            ``minimum_edge_weight``, ``verbose``, ``sample_types_file``,
            ``template_file`` and ``output_directory``.

    Returns:
        None.
    """
    import shutil

    ###########################################################################
    # Parse the arguments into shorter variable handles
    msf = args.minimum_sampling_frequency
    inputFile = args.input_file
    statsFile = args.comet_stats_file
    sec = args.standard_error_cutoff
    mew = args.minimum_edge_weight

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    genes = mutations[2]
    eventNames = load_event_names(args.event_names, genes)

    ###########################################################################
    # Compute max weight from random data if users provide random data.
    # Otherwise, maxPermutedWeight = 0
    if statsFile:
        with open(statsFile) as f:
            permStats = json.load(f)
        maxPermutedWeight = permStats['maxPermutedWeight']
        N = permStats['numPermutations']
    else:
        maxPermutedWeight, N = 0, 0

    ###########################################################################
    # Construct marginal probability graph from the input CoMEt results file
    if args.verbose:
        print("* Constructing marginal probability graph...")
    MPG, tables, passPoint = construct_mp_graph(inputFile, eventNames, msf,
                                                maxPermutedWeight)
    edges = MPG.edges(data=True)
    if args.verbose:
        print("\t- Edges: %s" % (len(edges),))

    # Choose delta (the minimum edge weight in the marginal probability
    # graph) using a heuristic approach that selects delta at first elbow
    # with slope change > 0 using linear regression
    if args.verbose:
        print("* Choosing delta...")
    deltas = sorted(set(d['weight'] for u, v, d in MPG.edges(data=True)))
    realEdgeDist = compute_edge_dist(MPG, deltas)
    deltaPoint, edgeno = choose_delta(deltas, realEdgeDist, passPoint, sec)
    if args.verbose:
        print("\t- Delta:  %s" % (deltaPoint,))

    ###########################################################################
    # Create the web output
    # Dictionary of web output
    obj = {
        "N": N,
        "mm": ['max-derivative'],
        "deltas": deltas,
        "edge_dist": realEdgeDist,
        "collections": {
            "max-derivative": {
                "more_extreme": 0,
                "pval": [0],
                "components": list(),
                "delta": deltaPoint,
            }
        },
    }

    # One summary record per delta-selection method listed in obj["mm"]:
    # the chosen delta, its pval, the method name, and "cdelta", the
    # observed edge weight closest to the chosen delta. The loop variable is
    # deliberately not called "m": on Python 2 a list-comprehension variable
    # leaks into the enclosing function scope.
    collections = obj["collections"]
    deltaSummaries = [
        dict(delta=collections[method]["delta"],
             pval=collections[method]["pval"],
             method=method,
             cdelta=min(obj["deltas"],
                        key=lambda x: abs(x - collections[method]["delta"])))
        for method in obj["mm"]
    ]

    # Write the delta plot to file as an SVG, then load
    # it so we can embed it in the web page
    if args.verbose:
        print("* Plotting delta curve...")
    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could claim before it is written.
    fd, tmp = tempfile.mkstemp(suffix=".svg", dir=".", prefix=".tmp")
    os.close(fd)
    try:
        delta_plot(obj, tmp, passPoint, deltaPoint, edgeno)

        # Read in the graph, skipping the first four lines that are
        # extraneous header information that will only confuse a webpage
        with open(tmp) as f:
            plot = "".join(f.readlines()[4:])
    finally:
        os.unlink(tmp)

    stats = dict(deltas=deltaSummaries, plot=plot, N=N)

    # Combine everything to create the D3 data
    if args.verbose:
        print("* Creating GD3 data...")
    graphData = gd3_graph(MPG, eventNames, mew)
    genesInResults = MPG.nodes()

    sampleToType = None
    if args.sample_types_file:
        with open(args.sample_types_file) as f:
            sampleToType = dict(l.rstrip().split("\t") for l in f)

    mutations = gd3_mutation_data(*mutations, genespace=genesInResults,
                                  eventNames=eventNames,
                                  sampleToType=sampleToType)

    # Output the results to an HTML file
    if args.verbose:
        print("* Outputting...")
    htmlOutput = "{}/index.html".format(args.output_directory)
    with open(args.template_file) as template, \
            open(htmlOutput, "w") as outfile:
        jsonData = json.dumps(dict(graph=graphData, mutations=mutations,
                                   tables=tables, stats=stats))
        html = template.read()
        html += "\n<script>\nvar data = {};\ndplusViz(data);\n</script>\n".format(jsonData)
        outfile.write(html)

    # Then copy the required JS and CSS files (paths are relative to the
    # current working directory)
    shutil.copyfile("comet/src/js/comet-viz.js",
                    "{}/comet-viz.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/mp-graph.js",
                    "{}/mp-graph.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/bower.json",
                    "{}/bower.json".format(args.output_directory))
    shutil.copyfile("comet/src/css/style.css",
                    "{}/style.css".format(args.output_directory))
def run(args):
    """Run CoMEt on permuted matrices and record the maximum statistic.

    Builds the bipartite mutation graph from the real data, writes
    ``args.num_permutations`` permuted matrices (each with its own seed
    derived from ``args.seed``), runs CoMEt on each of them, and stores the
    largest value found in the second column of the second line of every
    ``comet-results*`` file in ``<output_directory>/comet-stats.json``.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``patient_file``, ``gene_file``, ``min_freq``, ``verbose``,
            ``keep_temp_files``, ``output_directory``, ``seed``,
            ``num_permutations``, ``Q`` and ``parallel``. ``sys.argv`` is
            also read to rebuild the CoMEt command line.

    Returns:
        None.
    """
    import shutil
    import tempfile
    from itertools import islice

    # Load mutation data using Multi-Dendrix
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n, genes, patients, geneToCases, patientToGenes = mutations
    if args.verbose:
        print('* Mutation data: %s genes x %s patients' % (m, n))

    # Construct bipartite graph from mutation data
    if args.verbose:
        print("* Creating bipartite graph...")
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has %s edges among %s nodes.'
              % (len(G.edges()), len(G.nodes())))

    # Set up the arguments for a general CoMEt run. sys.argv[i] is the token
    # preceding ``arg`` (the enumeration starts at argv[1]), so each listed
    # flag is dropped together with the token that follows it.
    # NOTE(review): if --parallel / --keep_temp_files take no value, the
    # token after them is dropped as well -- confirm against the parser.
    permuteFlags = ["-np", "--parallel", "--keep_temp_files", "-m", "-o"]
    cometArgs = [arg for i, arg in enumerate(sys.argv[1:])
                 if arg not in permuteFlags
                 and sys.argv[i] not in permuteFlags]

    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # Generate random seeds for each permutation
        random.seed(args.seed)
        seeds = [random.randint(0, 2**31 - 1)
                 for _ in range(args.num_permutations)]

        # Create a permuted matrix per seed, and the arguments to run it
        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar
            sys.stdout.write(
                "* Running CoMEt on permuted matrices... {}/{}\r".format(
                    i + 1, args.num_permutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it to a temporary file
            permuted = C.permute_mutation_data(G, genes, patients, seed,
                                               args.Q)
            _, _, _, _, _, permPatientToGenes = permuted
            adj_list = [p + "\t" + "\t".join(sorted(permPatientToGenes[p]))
                        for p in patients]

            permutation_file = "{}/permuted-matrix-{}.m2".format(directory,
                                                                  i + 1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments. list(...) is required because map()
            # returns an iterator on Python 3, which does not support +=
            # with a list.
            permuteArgs = list(map(str, cometArgs))
            permuteArgs += ["-m", permutation_file]
            permuteArgs += [
                "-o",
                "{}/comet-results-on-permutation-{}".format(directory, i + 1)
            ]
            arguments.append(permuteArgs)

        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results"):
                continue
            with open("{}/{}".format(directory, rf)) as infile:
                # Only the second line (the first row after the header) is
                # inspected.
                for line in islice(infile, 1, 2):
                    score = float(line.split("\t")[1])
                    if score > maxStat:
                        maxStat = score
    finally:
        # Destroy the temporary directory if necessary, including after a
        # failure.
        if not args.keep_temp_files:
            shutil.rmtree(directory)

    print("*" * 80)
    print("Number of permutations: %s" % (args.num_permutations,))
    print("Max statistic: %s" % (maxStat,))

    # Output the results to files
    with open("{}/comet-stats.json".format(args.output_directory),
              "w") as outfile:
        # The "mutationNatrix" key is misspelled; it is kept unchanged
        # because it is part of the emitted file format.
        output = dict(maxPermutedWeight=maxStat,
                      numPermutations=args.num_permutations,
                      keepTempFiles=args.keep_temp_files,
                      mutationNatrix=args.mutation_matrix,
                      geneFile=args.gene_file,
                      patientFile=args.patient_file,
                      minFreq=args.min_freq,
                      Q=args.Q)
        json.dump(output, outfile, sort_keys=True, indent=4)
def run(args):
    """Build the marginal-probability-graph web page from CoMEt results.

    Loads the mutation data and event names, constructs the marginal
    probability graph from ``args.input_file``, chooses the edge-weight
    cutoff delta, renders the delta curve to a temporary SVG, and writes
    ``<output_directory>/index.html`` (the template followed by a script
    block with the JSON data) together with the required JS/CSS files.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``event_names``, ``min_freq``,
            ``minimum_sampling_frequency``, ``input_file``,
            ``comet_stats_file``, ``standard_error_cutoff``,
            ``minimum_edge_weight``, ``verbose``, ``sample_types_file``,
            ``template_file`` and ``output_directory``.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the marginal probability graph has
            no edges.
    """
    import shutil

    ###########################################################################
    # Parse the arguments into shorter variable handles
    msf = args.minimum_sampling_frequency
    inputFile = args.input_file
    statsFile = args.comet_stats_file
    sec = args.standard_error_cutoff
    mew = args.minimum_edge_weight

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    genes = mutations[2]
    eventNames = load_event_names(args.event_names, genes)

    ###########################################################################
    # Compute max weight from random data if users provide random data.
    # Otherwise, maxPermutedWeight = 0
    if statsFile:
        with open(statsFile) as f:
            permStats = json.load(f)
        maxPermutedWeight = permStats['maxPermutedWeight']
        N = permStats['numPermutations']
    else:
        maxPermutedWeight, N = 0, 0

    ###########################################################################
    # Construct marginal probability graph from the input CoMEt results file
    if args.verbose:
        print("* Constructing marginal probability graph...")
    MPG, tables, expectedPoint = construct_mp_graph(inputFile, eventNames,
                                                    msf, maxPermutedWeight)
    edges = MPG.edges(data=True)
    if len(edges) == 0:  # no significant results
        print("No significant collection. ")
        # The exception is raised directly: the exit() helper is installed
        # by the site module and is not guaranteed to exist.
        raise SystemExit(1)
    if args.verbose:
        print("\t- Edges: %s" % (len(edges),))

    # Choose delta (the minimum edge weight in the marginal probability
    # graph) using a heuristic approach that selects delta at first elbow
    # with slope change > 0 using linear regression
    if args.verbose:
        print("* Choosing delta...")
    deltas = sorted(set(d['weight'] for u, v, d in MPG.edges(data=True)))
    realEdgeDist = compute_edge_dist(MPG, deltas)
    deltaPoint, edgeno = choose_delta(deltas, realEdgeDist, expectedPoint,
                                      sec)
    if args.verbose:
        print("\t- Delta:  %s" % (deltaPoint,))

    ###########################################################################
    # Create the web output

    # Write the delta plot to file as an SVG, then load
    # it so we can embed it in the web page
    if args.verbose:
        print("* Plotting delta curve...")
    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could claim before it is written.
    fd, tmp = tempfile.mkstemp(suffix=".svg", dir=".", prefix=".tmp")
    os.close(fd)
    try:
        delta_plot(N, deltas, realEdgeDist, tmp, expectedPoint, deltaPoint,
                   edgeno)

        # Read in the graph, skipping the first four lines that are
        # extraneous header information that will only confuse a webpage
        with open(tmp) as f:
            plot = "".join(f.readlines()[4:])
    finally:
        os.unlink(tmp)

    stats = dict(deltas=[dict(delta=deltaPoint, pval=0.)], plot=plot, N=N)

    # Combine everything to create the D3 data
    if args.verbose:
        print("* Creating GD3 data...")
    graphData = gd3_graph(MPG, eventNames, mew)
    genesInResults = MPG.nodes()

    sampleToType = None
    if args.sample_types_file:
        with open(args.sample_types_file) as f:
            sampleToType = dict(l.rstrip().split("\t") for l in f)

    mutations = gd3_mutation_data(*mutations, genespace=genesInResults,
                                  eventNames=eventNames,
                                  sampleToType=sampleToType)

    # Output the results to an HTML file
    if args.verbose:
        print("* Outputting...")
    htmlOutput = "{}/index.html".format(args.output_directory)
    with open(args.template_file) as template, \
            open(htmlOutput, "w") as outfile:
        jsonData = json.dumps(dict(graph=graphData, mutations=mutations,
                                   tables=tables, stats=stats))
        html = template.read()
        html += "\n<script>\nvar data = {};\ndplusViz(data);\n</script>\n".format(jsonData)
        outfile.write(html)

    # Then copy the required JS and CSS files (paths are relative to the
    # current working directory)
    shutil.copyfile("comet/src/js/comet-viz.js",
                    "{}/comet-viz.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/mp-graph.js",
                    "{}/mp-graph.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/bower.json",
                    "{}/bower.json".format(args.output_directory))
    shutil.copyfile("comet/src/css/style.css",
                    "{}/style.css".format(args.output_directory))
def run(args):
    """Run CoMEt on permuted matrices and record the maximum statistic.

    Builds the bipartite mutation graph from the real data, writes
    ``args.num_permutations`` permuted matrices, runs CoMEt on each of them,
    and stores the largest value found in the second column of the second
    line of every ``comet-results*`` file in
    ``<output_directory>/comet-stats.json``.

    Each permutation gets its own seed drawn from a generator seeded with
    ``args.seed``; passing ``args.seed`` itself to every call would produce
    the same permuted matrix every time.

    Args:
        args: Parsed command-line namespace. Reads ``mutation_matrix``,
            ``patient_file``, ``gene_file``, ``min_freq``, ``verbose``,
            ``keep_temp_files``, ``output_directory``, ``seed``,
            ``num_permutations``, ``Q`` and ``parallel``. ``sys.argv`` is
            also read to rebuild the CoMEt command line.

    Returns:
        None.
    """
    import random
    import shutil
    import tempfile
    from itertools import islice

    # Load mutation data using Multi-Dendrix
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n, genes, patients, geneToCases, patientToGenes = mutations
    if args.verbose:
        print('* Mutation data: %s genes x %s patients' % (m, n))

    # Construct bipartite graph from mutation data
    if args.verbose:
        print("* Creating bipartite graph...")
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has %s edges among %s nodes.'
              % (len(G.edges()), len(G.nodes())))

    # Set up the arguments for a general CoMEt run. sys.argv[i] is the token
    # preceding ``arg`` (the enumeration starts at argv[1]), so each listed
    # flag is dropped together with the token that follows it.
    # NOTE(review): if --parallel / --keep_temp_files take no value, the
    # token after them is dropped as well -- confirm against the parser.
    permuteFlags = ["-np", "--parallel", "--keep_temp_files", "-m", "-o"]
    cometArgs = [arg for i, arg in enumerate(sys.argv[1:])
                 if arg not in permuteFlags
                 and sys.argv[i] not in permuteFlags]

    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # One distinct, reproducible seed per permutation. A private
        # generator is used so the module-level random state is untouched.
        rng = random.Random(args.seed)
        seeds = [rng.randint(0, 2**31 - 1)
                 for _ in range(args.num_permutations)]

        # Create a permuted matrix per seed, and the arguments to run it
        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar; the denominator is the number of
            # permutations, not the number of patients.
            sys.stdout.write(
                "* Running CoMEt on permuted matrices... {}/{}\r".format(
                    i + 1, args.num_permutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it to a temporary file
            permuted = C.permute_mutation_data(G, genes, patients, seed,
                                               args.Q)
            _, _, _, _, _, permPatientToGenes = permuted
            adj_list = [p + "\t" + "\t".join(sorted(permPatientToGenes[p]))
                        for p in patients]

            permutation_file = "{}/permuted-matrix-{}.m2".format(directory,
                                                                  i + 1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments. list(...) is required because map()
            # returns an iterator on Python 3, which does not support +=
            # with a list.
            permuteArgs = list(map(str, cometArgs))
            permuteArgs += ["-m", permutation_file]
            permuteArgs += [
                "-o",
                "{}/comet-results-on-permutation-{}".format(directory, i + 1)
            ]
            arguments.append(permuteArgs)

        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results"):
                continue
            with open("{}/{}".format(directory, rf)) as infile:
                # Only the second line (the first row after the header) is
                # inspected.
                for line in islice(infile, 1, 2):
                    score = float(line.split("\t")[1])
                    if score > maxStat:
                        maxStat = score
    finally:
        # Destroy the temporary directory if necessary, including after a
        # failure.
        if not args.keep_temp_files:
            shutil.rmtree(directory)

    print("*" * 80)
    print("Number of permutations: %s" % (args.num_permutations,))
    print("Max statistic: %s" % (maxStat,))

    # Output the results to files
    with open("{}/comet-stats.json".format(args.output_directory),
              "w") as outfile:
        # The "mutationNatrix" key is misspelled; it is kept unchanged
        # because it is part of the emitted file format.
        output = dict(maxPermutedWeight=maxStat,
                      numPermutations=args.num_permutations,
                      keepTempFiles=args.keep_temp_files,
                      mutationNatrix=args.mutation_matrix,
                      geneFile=args.gene_file,
                      patientFile=args.patient_file,
                      minFreq=args.min_freq,
                      Q=args.Q)
        json.dump(output, outfile, sort_keys=True, indent=4)
def run(args):
    """Run CoMEt on the real data, then on permuted data, then visualize.

    Steps, in order:

    1. Run ``runComet`` on the real mutation data (with ``--noviz``), writing
       to ``<args.output_directory>/comet-results``.
    2. Build ``args.num_permutations`` permuted mutation matrices, run
       ``runComet`` on each, and take the largest score found on the first
       data row of the permuted result tables.
    3. Call ``C.output_comet_viz`` with the real mutation data, the real
       results table, the maximum permuted score and the permutation count.

    Args:
        args: Parsed command-line namespace.  This function reads
            ``output_directory``, ``mutation_matrix``, ``patient_file``,
            ``gene_file``, ``min_freq``, ``subtype``, ``verbose``,
            ``keep_temp_files``, ``seed``, ``num_permutations``, ``Q`` and
            ``parallel``.

    Returns:
        None.

    Raises:
        RuntimeError: If the run on the real data produced no ``.tsv``
            results table to visualize.
    """
    # Function-scope imports, as in the original block.
    import shutil
    import tempfile
    from itertools import islice

    # Set up the arguments for a general CoMEt run on real data: copy the
    # command line but drop the listed flags and the token following each
    # (sys.argv[i] is the token immediately before `arg`).
    realOutputDir = "{}/comet-results".format(args.output_directory)
    realSkipFlags = ["-np", "--parallel", "--keep_temp_files", "-o"]
    realCometArgs = [
        arg for i, arg in enumerate(sys.argv[1:])
        if arg not in realSkipFlags and sys.argv[i] not in realSkipFlags
    ]
    realCometArgs += ["-o", realOutputDir, "--noviz"]

    # perform simple run without viz first.
    runComet(realCometArgs)

    # Load mutation data using Multi-Dendrix and output as a temporary file
    realMutations = C.load_mutation_data(args.mutation_matrix,
                                         args.patient_file, args.gene_file,
                                         args.min_freq, args.subtype)
    m, n, genes, patients, geneToCases, patientToGenes, _subtypes = realMutations

    if args.verbose:
        print(f'* Mutation data: {m} genes x {n} patients')

    # Construct bipartite graph from mutation data
    if args.verbose:
        print('* Creating bipartite graph...')
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has', len(G.edges()), 'edges among', len(G.nodes()),
              'nodes.')

    # reset the arguments for a general CoMEt run on permuted matrices
    permuteFlags = ["-np", "--parallel", "--keep_temp_files", "-m", "-o"]
    cometArgs = [
        arg for i, arg in enumerate(sys.argv[1:])
        if arg not in permuteFlags and sys.argv[i] not in permuteFlags
    ]
    cometArgs.append('--noviz')

    # Permuted matrices and their results live either in the output directory
    # (when they should be kept) or in a throw-away directory.
    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    # Generate random seeds for each permutation
    random.seed(args.seed)
    seeds = [
        random.randint(0, 2**31 - 1) for _ in range(args.num_permutations)
    ]

    # Create a permuted matrix, and then run it through CoMEt
    arguments = []
    for i, seed in enumerate(seeds):
        # Print simple progress bar
        sys.stdout.write(
            "* Running CoMEt on permuted matrices... {}/{}\r".format(
                i + 1, args.num_permutations))
        sys.stdout.flush()

        # Create a permuted dataset and save it a temporary file.  The
        # permuted mapping gets its own name so that the real
        # geneToCases / patientToGenes survive for the visualization below.
        permuted = C.permute_mutation_data(G, genes, patients, seed, args.Q)
        _, _, _, _, _, permutedPatientToGenes = permuted
        adj_list = [
            p + "\t" + "\t".join(sorted(permutedPatientToGenes[p]))
            for p in patients
        ]

        permutation_file = "{}/permuted-matrix-{}.m2".format(directory, i + 1)
        with open(permutation_file, 'w') as outfile:
            outfile.write('\n'.join(adj_list))

        # Add the new arguments
        permuteArgs = [str(a) for a in cometArgs]
        permuteArgs += ["-m", permutation_file]
        permuteArgs += [
            "-o", "{}/comet-results-on-permutation-{}".format(directory, i + 1)
        ]
        arguments.append(permuteArgs)

    # Run CoMEt on every permuted matrix; only the files it writes are used.
    if args.parallel:
        pool = mp.Pool(25)
        pool.map(runComet, arguments)
        pool.close()
        pool.join()
    else:
        for permuteArgs in arguments:
            runComet(permuteArgs)

    # Find the maximum test statistic on the permuted datasets: the second
    # tab-separated field of the second line of each results table.
    maxStat = 0
    for rf in os.listdir(directory):
        if not rf.startswith("comet-results-on-permutation"):
            continue
        permutedResultsDir = "{}/{}/results".format(directory, rf)
        for df in os.listdir(permutedResultsDir):
            if not df.endswith(".tsv"):
                continue
            with open("{}/{}".format(permutedResultsDir, df)) as infile:
                for line in islice(infile, 1, 2):
                    score = float(line.split("\t")[1])
                    if score > maxStat:
                        maxStat = score

    print("*" * 80)
    print("Number of permutations:", args.num_permutations)
    print("Max statistic:", maxStat)

    # Prepare comet results on real, mutation data, and output directory for
    # viz.  When several tables exist, the last one listed is used.
    resultsTable = None
    for rf in os.listdir("{}/results/".format(realOutputDir)):
        if rf.startswith('.') or not rf.endswith(".tsv"):
            continue
        with open("{}/results/{}".format(realOutputDir, rf)) as infile:
            resultsTable = [l.rstrip() for l in infile]
    if resultsTable is None:
        raise RuntimeError(
            "No .tsv results table found in {}/results/".format(realOutputDir))

    realMutations = (m, n, genes, patients, geneToCases, patientToGenes)
    outputDirViz = realOutputDir + "/viz/"
    C.ensure_dir(outputDirViz)

    # Perform visualization
    C.output_comet_viz(RC.get_parser().parse_args(realCometArgs),
                       realMutations, resultsTable, maxStat,
                       args.num_permutations)

    # Destroy the temporary directory if necessary
    if not args.keep_temp_files:
        shutil.rmtree(directory)
def run(args):
    """Run the CoMEt MCMC, either once or as a multi-chain convergence loop.

    With ``args.num_initial > 1`` several chains are started from the
    solutions returned by ``C.initial_solns_generator``.  After each round the
    chains are compared with ``C.discrete_convergence``; the iteration budget
    is then multiplied by 1.5 and every chain resumes from its last solution,
    until the budget would exceed ``args.n_stop`` or the convergence value
    drops below ``args.total_distance_cutoff``.  Otherwise a single chain is
    run from an empty initial solution.

    Args:
        args: Parsed command-line namespace holding the input files and the
            MCMC settings read below.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples, ordered by
        decreasing ``total_weight``.
    """
    # ---- Short handles for the command-line options ----------------------
    matrix_path = args.mutation_matrix
    gene_path = args.gene_file
    patient_path = args.patient_file
    min_freq = args.min_freq
    subtype_path = args.subtype
    num_chains = args.num_initial
    ks = args.gene_set_sizes             # size of each gene set
    t = len(ks)                          # number of gene sets
    N = args.num_iterations              # total iteration budget per chain
    step = args.step_length
    max_iterations = args.n_stop
    accelerator = args.accelerator
    nt = args.nt
    hybrid_cutoff = args.binom_cut
    growth = 1.5                         # budget multiplier between rounds

    # ---- Load the mutation data ------------------------------------------
    loaded = C.load_mutation_data(matrix_path, patient_path, gene_path,
                                  min_freq, subtype_path)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = loaded
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Subtypes, optionally extended by the lines of the core-events file
    if args.core_events:
        with open(args.core_events) as handle:
            core_events = {line.rstrip() for line in handle}
        subSet = list(subtypes.union(core_events))
    else:
        subSet = list(subtypes)

    # ---- One-time setup in the C module ----------------------------------
    C.precompute_factorials(max(m, n))
    C.set_random_seed(args.seed)

    # Hand any precomputed collection scores over to C
    if args.precomputed_scores:
        C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

    # ---- Run the chain(s) -------------------------------------------------
    if num_chains > 1:
        # Starting solutions and one output accumulator per chain
        starts, totalOut = C.initial_solns_generator(
            num_chains, mutations, ks, args.initial_soln, subSet,
            importMultidendrix, multi_dendrix)

        round_iterations = N
        while True:
            chain_ends = []
            for idx, start in enumerate(starts):
                chain_out, chain_end = comet(
                    mutations, n, t, ks, round_iterations, step, start,
                    accelerator, subSet, nt, hybrid_cutoff, args.exact_cut,
                    args.verbose)
                C.merge_runs(totalOut[idx], chain_out)
                chain_ends.append(chain_end)

            finalTv = C.discrete_convergence(totalOut, int(N / step))
            print(finalTv, N)

            next_budget = int(N * growth)
            if next_budget > max_iterations or finalTv < args.total_distance_cutoff:
                break

            # Only the additional iterations are run in the next round, and
            # each chain continues from where it stopped.
            round_iterations = next_budget - N
            N = next_budget
            starts = chain_ends

        runNum = len(totalOut)
        results = C.merge_results(totalOut)
    else:
        results, _last = comet(
            mutations, n, t, ks, N, step, [], accelerator, subSet, nt,
            hybrid_cutoff, args.exact_cut, args.verbose)
        runNum = 1

    C.free_factorials()

    # ---- Output comet results to TSV and website --------------------------
    def weight_of(S):
        return results[S]["total_weight"]

    ranked = sorted(results, key=weight_of, reverse=True)
    C.output_comet(args, mutations, results, ranked, ks, N * runNum, 0, 0)

    summary = []
    for S in ranked:
        summary.append((S, results[S]["freq"], results[S]["total_weight"]))
    return summary