def load_precomputed_scores(infile, mutations, subt):
    """Feed precomputed gene-set scores from a tab-separated file to the C module.

    Each non-comment row is expected to start with three leading columns
    (sampling frequency, total weight, target weight), followed by one
    triple of columns per gene set: the gene set as a ", "-separated list
    of gene names, its score, and its weight function.  Every triple is
    passed to ``C.load_precomputed_scores`` as
    ``(score, set size, weight function, gene indices)``.

    Args:
        infile: Path to the score file.  Its name must contain a
            ``.k<digits>.`` tag; the number of digits in that tag is used
            as the number of gene sets per row.
            NOTE(review): this presumably encodes one single-digit set
            size per gene set, so sizes >= 10 would be miscounted --
            confirm against the code that writes these files.
        mutations: Tuple of mutation data that is unpacked into
            ``C.convert_mutations_to_C_format``.
        subt: Optional extra element appended to ``mutations`` before the
            conversion when truthy.

    Raises:
        ValueError: If ``infile`` does not contain a ``.k<digits>.`` tag.
        KeyError: If a gene in the file is missing from the gene index.
    """
    # Validate the filename before doing any conversion work so that a bad
    # path fails fast with a message that names the problem.
    matchObj = re.match(r'.+\.k(\d+)\..+?', infile)
    if matchObj is None:
        raise ValueError(
            "cannot determine the number of gene sets from %r: "
            "expected a '.k<digits>.' tag in the file name" % (infile,))
    loadingT = len(matchObj.group(1))  # t: the number of gene sets per row

    if subt:
        mutations = mutations + (subt,)
    cMutations = C.convert_mutations_to_C_format(*mutations)
    # Only the gene-name -> index mapping is needed here.
    _, _, _, geneToIndex, _ = cMutations

    baseI = 3  # leading columns: sampling freq., total weight, target weight
    setI = 3   # columns per gene set: gene set, score, weight function

    with open(infile) as scoreFile:
        for line in scoreFile:
            # Skip header/comment rows and blank lines.
            if line.startswith("#") or not line.strip():
                continue
            v = line.rstrip().split("\t")
            for i in range(loadingT):
                start = baseI + i * setI
                geneNames = v[start].split(", ")
                gSet = [geneToIndex[g] for g in geneNames]
                C.load_precomputed_scores(float(v[start + 1]), len(geneNames),
                                          int(v[start + 2]), gSet)
def run(args):
    """Run the CoMEt MCMC pipeline described by ``args`` and return ranked results.

    Loads the mutation data, optionally loads precomputed scores, runs one
    chain (``args.num_initial <= 1``) or several chains with a convergence
    loop (``args.num_initial > 1``), writes the output through
    ``C.output_comet`` and returns the collections ordered by total weight.

    NOTE(review): this file contains a second ``def run`` further down; the
    later definition rebinds the name, so this one is not what ``run``
    refers to once the module has been fully executed.

    Args:
        args: Namespace-like object.  Attributes read here: mutation_matrix,
            gene_file, patient_file, min_freq, subtype, num_initial,
            gene_set_sizes, num_iterations, step_length, n_stop,
            accelerator, nt, binom_cut, exact_cut, verbose, core_events,
            seed, precomputed_scores, initial_soln, total_distance_cutoff.
            It is also forwarded as-is to ``C.output_comet``.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples sorted by
        ``total_weight`` in descending order.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    subtypeFile = args.subtype
    numInitial = args.num_initial
    t = len(args.gene_set_sizes)  # number of pathways
    ks = args.gene_set_sizes      # size of each pathway
    N = args.num_iterations       # number of iterations
    s = args.step_length          # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of the chain length while not converged

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = mutations
    # Downstream calls take the mutation tuple without the subtypes entry.
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    ###########################################################################
    if args.verbose:
        print(f'Mutation data: {m} genes x {n} patients')

    if args.core_events:
        with open(args.core_events) as f:
            subSet = list(subtypes.union({l.rstrip() for l in f}))
    else:
        subSet = list(subtypes)

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C.
        # NOTE(review): here C.load_precomputed_scores receives
        # (path, mutations, subSet), while the module-level
        # load_precomputed_scores in this file calls C.load_precomputed_scores
        # with (score, size, weight function, gene indices).  Confirm which
        # signature C exposes; this call may be meant for the local helper.
        if args.precomputed_scores:
            C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

        # num_initial > 1: perform the convergence pipeline; otherwise
        # perform one run only.
        if numInitial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = C.initial_solns_generator(
                numInitial, mutations, ks, args.initial_soln, subSet,
                importMultidendrix, multi_dendrix)
            runN = N  # iterations to run in this round (N is the running total)

            while True:
                lastSolns = []
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, args.verbose)
                    C.merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print(finalTv, N)

                # Stop once the next chain length would exceed n_stop or the
                # convergence value drops below the requested cutoff.
                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Otherwise extend every chain from where it stopped.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = C.merge_results(totalOut)
        else:
            init = []
            outresults, lastSoln = comet(
                mutations, n, t, ks, N, s, init, acc, subSet, nt,
                hybridCutoff, args.exact_cut, args.verbose)
            results = outresults
            runNum = 1
    finally:
        # Release the factorial table even when sampling raises.
        C.free_factorials()

    # Output comet results to TSV and website
    collections = sorted(results, key=lambda S: results[S]["total_weight"],
                         reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N * runNum, 0, 0)

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
def run(args):
    """Run the CoMEt MCMC pipeline described by ``args`` and return ranked results.

    Loads the mutation data, optionally loads precomputed scores, runs one
    chain (``args.num_initial <= 1``) or several chains with a convergence
    loop (``args.num_initial > 1``), writes the output through
    ``C.output_comet`` and returns the collections ordered by total weight.

    NOTE(review): an earlier ``def run`` with the same signature appears
    above in this file; this definition rebinds the name and is the one in
    effect once the module has been fully executed.

    Args:
        args: Namespace-like object.  Attributes read here: mutation_matrix,
            gene_file, patient_file, min_freq, subtype, num_initial,
            gene_set_sizes, num_iterations, step_length, n_stop,
            accelerator, nt, binom_cut, exact_cut, verbose, core_events,
            seed, precomputed_scores, initial_soln, total_distance_cutoff.
            It is also forwarded as-is to ``C.output_comet``.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples sorted by
        ``total_weight`` in descending order.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    subtypeFile = args.subtype
    numInitial = args.num_initial
    t = len(args.gene_set_sizes)  # number of pathways
    ks = args.gene_set_sizes      # size of each pathway
    N = args.num_iterations       # number of iterations
    s = args.step_length          # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of the chain length while not converged

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = mutations
    # Downstream calls take the mutation tuple without the subtypes entry.
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    ###########################################################################
    if args.verbose:
        print(f'Mutation data: {m} genes x {n} patients')

    if args.core_events:
        with open(args.core_events) as f:
            subSet = list(subtypes.union({l.rstrip() for l in f}))
    else:
        subSet = list(subtypes)

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C.
        # NOTE(review): here C.load_precomputed_scores receives
        # (path, mutations, subSet), while the module-level
        # load_precomputed_scores in this file calls C.load_precomputed_scores
        # with (score, size, weight function, gene indices).  Confirm which
        # signature C exposes; this call may be meant for the local helper.
        if args.precomputed_scores:
            C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

        # num_initial > 1: perform the convergence pipeline; otherwise
        # perform one run only.
        if numInitial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = C.initial_solns_generator(
                numInitial, mutations, ks, args.initial_soln, subSet,
                importMultidendrix, multi_dendrix)
            runN = N  # iterations to run in this round (N is the running total)

            while True:
                lastSolns = []
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, args.verbose)
                    C.merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print(finalTv, N)

                # Stop once the next chain length would exceed n_stop or the
                # convergence value drops below the requested cutoff.
                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Otherwise extend every chain from where it stopped.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = C.merge_results(totalOut)
        else:
            init = []
            outresults, lastSoln = comet(
                mutations, n, t, ks, N, s, init, acc, subSet, nt,
                hybridCutoff, args.exact_cut, args.verbose)
            results = outresults
            runNum = 1
    finally:
        # Release the factorial table even when sampling raises.
        C.free_factorials()

    # Output comet results to TSV and website
    collections = sorted(results, key=lambda S: results[S]["total_weight"],
                         reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N * runNum, 0, 0)

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]