def comet(mutations, n, t, ks, numIters, stepLen, initialSoln, amp, subt, nt,
          hybridPvalThreshold, pvalThresh, verbose):
    """Run the CoMEt sampler via the C extension and collate the samples.

    Args:
        mutations: Tuple of mutation data. It is unpacked positionally into
            ``C.convert_mutations_to_C_format`` and its first two items are
            forwarded to ``C.comet``.
        n: Not used by this function; kept for interface compatibility.
        t, ks, numIters, stepLen, nt, hybridPvalThreshold, pvalThresh,
        verbose: Forwarded unchanged to ``C.comet``.
        initialSoln: Iterable of gene names; translated to gene indices
            before being handed to ``C.comet``.
        amp: Forwarded to ``C.comet`` and also used to recover a per-set
            probability from each sampled weight (``exp(-W / amp)``).
        subt: Optional collection appended to ``mutations`` when truthy;
            its length (0 when empty or ``None``) is passed to ``C.comet``.

    Returns:
        A ``(results, lastSoln)`` pair. ``results`` maps a canonical string
        key of each distinct sampled collection to a dict with keys
        ``freq`` (number of times it was sampled), ``sets`` (one dict per
        gene set with ``genes``, ``W``, ``num_tbls`` and ``prob``),
        ``total_weight`` and ``target_weight``. ``lastSoln`` is the flat
        list of genes in the last sampled collection (empty when the
        sampler returned nothing).
    """
    # Convert mutation data to C-ready format
    if subt:
        mutations = mutations + (subt, )
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, indexToGene = cMutations
    initialSolnIndex = [geneToIndex[g] for g in initialSoln]

    # ``subt`` is optional (see the truthiness check above), so do not call
    # len() on None.
    numSubtypes = len(subt) if subt else 0
    solns = C.comet(t, mutations[0], mutations[1], iPatientToGenes,
                    geneToNumCases, ks, numIters, stepLen, amp, nt,
                    hybridPvalThreshold, initialSolnIndex, numSubtypes,
                    pvalThresh, verbose)

    # Collate the results, counting how often each collection was sampled
    solnsWithWeights = C.convert_solns(indexToGene, solns)

    def collection_key(collection):
        # Order-independent key: sort genes within a set, then the sets.
        return " ".join(sorted([",".join(sorted(M)) for M in collection]))

    # Store the last solution of the sampling so a later run can resume
    # from it.
    lastSoln = []
    if solnsWithWeights:
        lastSoln = [g for gset in solnsWithWeights[-1][0] for g in gset]

    results = dict()
    for collection, Ws, Cs in solnsWithWeights:
        key = collection_key(collection)
        if key in results:
            results[key]["freq"] += 1
            continue

        # Extract the probability from the weight, which can also include
        # the accelerator. exp(-W / amp) equals pow(exp(-W), 1 / amp) but
        # does not underflow/overflow in the intermediate exp(-W).
        sets = [dict(genes=M, W=W, num_tbls=F, prob=exp(-W / float(amp)))
                for M, W, F in zip(collection, Ws, Cs)]
        totalWeight = sum(S["W"] for S in sets)
        # exp() overflows a double a little above 709, so saturate early.
        targetWeight = exp(totalWeight) if totalWeight < 700 else float("inf")
        results[key] = dict(freq=1, sets=sets, total_weight=totalWeight,
                            target_weight=targetWeight)

    return results, lastSoln
def run(args):
    """Run the exhaustive CoMEt search and write the ranked sets to a TSV.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``mutation_matrix``, ``gene_file``, ``patient_file``,
            ``min_freq``, ``gene_set_size``, ``weight_func``, ``verbose``
            and ``output_prefix``.

    Returns:
        A list of ``(genes, p_value, weight)`` tuples sorted by decreasing
        weight, where ``genes`` is the alphabetically sorted list of gene
        names in the set. The same rows are written to
        ``<output_prefix>-k<k>-<weight_func>-exhaustive.tsv``.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    # NOTE(review): 1.1 is above any valid p-value, presumably so that
    # nothing is filtered out -- confirm against C.exhaustive.
    pvalThresh = 1.1
    wf = args.weight_func

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the CoMEt run and then run exhaustively
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # Gene names ordered by their C-side index, so genes[i] names index i.
    genes = sorted(geneToIndex, key=geneToIndex.get)

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Release the factorial table even if the search fails.
        C.free_factorials()

    # Parse the output: rank by weight (decreasing) and map indices to names
    solns, weights, tables, probs = results
    ranked = sorted(zip(solns, weights, tables, probs),
                    key=lambda row: row[1], reverse=True)
    rows = [(sorted(genes[g] for g in geneset), p, w)
            for geneset, w, _, p in ranked]

    # Output only sets, probs, and weights as TSV. The header lists exactly
    # the three columns that each row contains.
    outPath = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outPath, "w") as outfile:
        output = ["#Gene set\tP-value\tWeight"]
        output.extend("\t".join([", ".join(s), str(p), str(w)])
                      for s, p, w in rows)
        outfile.write("\n".join(output))

    return rows
def comet(mutations, n, t, ks, numIters, stepLen, initialSoln, amp, subt, nt,
          hybridPvalThreshold, pvalThresh, verbose):
    """Sample gene-set collections with the C sampler and tally them.

    Args:
        mutations: Tuple of mutation data, unpacked positionally into
            ``C.convert_mutations_to_C_format``; items 0 and 1 are also
            passed straight to ``C.comet``.
        n: Unused here; retained so existing callers keep working.
        t, ks, numIters, stepLen, nt, hybridPvalThreshold, pvalThresh,
        verbose: Passed through to ``C.comet`` untouched.
        initialSoln: Gene names of the starting solution; converted to
            gene indices for the C call.
        amp: Passed to ``C.comet``; also divides each weight when the
            per-set probability ``exp(-W / amp)`` is recovered.
        subt: Optional collection. When truthy it is appended to
            ``mutations``; its length (0 if empty/``None``) goes to
            ``C.comet``.

    Returns:
        ``(results, lastSoln)``. ``results`` is a dict keyed by a canonical
        string for each distinct collection, with values holding ``freq``,
        ``sets`` (dicts of ``genes``, ``W``, ``num_tbls``, ``prob``),
        ``total_weight`` and ``target_weight``. ``lastSoln`` is the flat
        gene list of the final sample, or ``[]`` if there were no samples.
    """
    # Convert mutation data to C-ready format
    if subt:
        mutations = mutations + (subt, )
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, indexToGene = cMutations
    initialSolnIndex = [geneToIndex[g] for g in initialSoln]

    # A falsy ``subt`` (including None) means "no extra entries"; calling
    # len() on it directly would fail for None.
    numSubtypes = len(subt) if subt else 0
    solns = C.comet(t, mutations[0], mutations[1], iPatientToGenes,
                    geneToNumCases, ks, numIters, stepLen, amp, nt,
                    hybridPvalThreshold, initialSolnIndex, numSubtypes,
                    pvalThresh, verbose)

    # Collate the results by collection and count the sampling frequency
    solnsWithWeights = convert_solns(indexToGene, solns)

    def collection_key(collection):
        # Canonical form: genes sorted within each set, sets sorted overall.
        return " ".join(sorted([",".join(sorted(M)) for M in collection]))

    # Keep the last sampled solution so sampling can be continued later.
    lastSoln = []
    if solnsWithWeights:
        for gset in solnsWithWeights[-1][0]:
            lastSoln.extend(gset)

    results = dict()
    for collection, Ws, Cs in solnsWithWeights:
        key = collection_key(collection)
        entry = results.get(key)
        if entry is not None:
            entry["freq"] += 1
            continue

        sets = []
        for M, W, F in zip(collection, Ws, Cs):
            # Extract the probability from the weight, which can also
            # include the accelerator. Dividing inside the exponent avoids
            # the underflow/overflow of computing exp(-W) first.
            P = exp(-W / float(amp))
            sets.append(dict(genes=M, W=W, num_tbls=F, prob=P))

        totalWeight = sum(S["W"] for S in sets)
        # math.exp overflows just above 709; report infinity beyond 700.
        if totalWeight < 700:
            targetWeight = exp(totalWeight)
        else:
            targetWeight = float("inf")
        results[key] = dict(freq=1, sets=sets, total_weight=totalWeight,
                            target_weight=targetWeight)

    return results, lastSoln
def run(args):
    """Exhaustively score all gene sets of the requested size and save them.

    Written to behave the same under Python 2 and Python 3.

    Args:
        args: Parsed command-line namespace providing ``mutation_matrix``,
            ``gene_file``, ``patient_file``, ``min_freq``,
            ``gene_set_size``, ``weight_func``, ``verbose`` and
            ``output_prefix``.

    Returns:
        List of ``(genes, p_value, weight)`` tuples in order of decreasing
        weight; ``genes`` is a sorted list of gene names. The rows are also
        written to ``<output_prefix>-k<k>-<weight_func>-exhaustive.tsv``.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    # NOTE(review): a threshold above 1 presumably disables p-value
    # filtering -- confirm against C.exhaustive.
    pvalThresh = 1.1
    wf = args.weight_func

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        # Single parenthesised argument: valid as both the Python 2 print
        # statement and the Python 3 print function.
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the CoMEt run and then run exhaustively
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # Gene names in index order, so a C-side index can be mapped to a name.
    genes = sorted(geneToIndex, key=lambda g: geneToIndex[g])

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Always free the precomputed factorials, even on failure.
        C.free_factorials()

    # Parse the output. sorted() is used because zip() returns an iterator
    # without a .sort() method on Python 3.
    solns, weights, tables, probs = results
    res = sorted(zip(solns, weights, tables, probs),
                 key=lambda arr: arr[1], reverse=True)  # weight decreasing
    ranked = [(sorted([genes[g] for g in geneset]), p, w)
              for geneset, w, _, p in res]

    # Output only sets, probs, and weights as TSV; the header names the
    # three columns actually written.
    with open("%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf),
              "w") as outfile:
        lines = ["#Gene set\tP-value\tWeight"]
        for s, p, w in ranked:
            lines.append("\t".join([", ".join(s), str(p), str(w)]))
        outfile.write("\n".join(lines))

    return ranked
def load_precomputed_scores(infile, mutations, subt):
    """Feed previously computed gene-set scores from a TSV into the C layer.

    The number of gene sets per row is taken from the file name: the digit
    run following ``.k`` is matched and its *length* (one digit per gene
    set) is used as the count.

    Each non-comment row is expected to hold three leading columns
    (sampling freq., total weight, target weight) followed by three columns
    per gene set (gene set, score, weight function). For every gene set,
    ``C.load_precomputed_scores`` is called with the score as a float, the
    number of genes, the weight-function column as an int, and the list of
    gene indices.

    Args:
        infile: Path of the score file; must contain a ``.k<digits>.``
            component.
        mutations: Tuple of mutation data, unpacked positionally into
            ``C.convert_mutations_to_C_format``.
        subt: Optional collection appended to ``mutations`` when truthy.

    Raises:
        ValueError: If the number of gene sets cannot be derived from
            ``infile``.
        KeyError: If a gene named in the file is not in the mutation data.
    """
    if subt:
        mutations = mutations + (subt,)
    cMutations = C.convert_mutations_to_C_format(*mutations)
    _, _, _, geneToIndex, _ = cMutations

    baseI = 3  # sampling freq., total weight, target weight
    setI = 3   # gene set, score, weight function

    matchObj = re.match(r'.+\.k(\d+)\..+?', infile)
    if matchObj is None:
        raise ValueError(
            "cannot determine the number of gene sets from file name %r "
            "(expected a '.k<digits>.' component)" % (infile,))
    loadingT = len(matchObj.group(1))  # determine t: the number of gene sets

    # Use a context manager so the file handle is closed deterministically.
    with open(infile) as scoreFile:
        for l in scoreFile:
            # Skip comment/header lines and blank lines.
            if l.startswith("#") or not l.strip():
                continue
            v = l.rstrip().split("\t")
            for i in range(loadingT):
                start = baseI + i * setI
                geneNames = v[start].split(", ")
                gSet = [geneToIndex[g] for g in geneNames]
                C.load_precomputed_scores(float(v[start + 1]),
                                          len(geneNames),
                                          int(v[start + 2]), gSet)