Beispiel #1
0
def run(args):
    """Run CoMEt's exhaustive enumeration and write the ranked sets to a TSV.

    Reads ``mutation_matrix``, ``gene_file``, ``patient_file``, ``min_freq``,
    ``gene_set_size``, ``weight_func``, ``verbose`` and ``output_prefix``
    from *args*.

    The output file is ``<output_prefix>-k<k>-<weight_func>-exhaustive.tsv``
    with one row per gene set: gene names, probability and weight.

    Returns:
        A list of ``(gene_names, prob, weight)`` tuples ordered by decreasing
        weight; ``gene_names`` is the alphabetically sorted list of names.
    """
    k = args.gene_set_size
    wf = args.weight_func
    # NOTE(review): a threshold of 1.1 presumably disables p-value filtering
    # inside C.exhaustive -- confirm against the C extension.
    pvalThresh = 1.1

    # Load the mutation data; the first two entries are the gene and patient
    # counts.
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Convert to the index-based representation used by the C extension.
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # genes[i] is the name of the gene with index i.
    genes = sorted(geneToIndex, key=lambda g: geneToIndex[g])

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Release the factorial table even when the enumeration fails.
        C.free_factorials()

    # Rank the solutions by decreasing weight and map indices back to names.
    solns, weights, tables, probs = results
    ranked = sorted(zip(solns, weights, tables, probs),
                    key=lambda entry: entry[1], reverse=True)
    rows = [(sorted(genes[g] for g in geneset), p, w)
            for geneset, w, _, p in ranked]

    # Each row carries three fields, so the header names exactly three
    # columns (the former header listed an extra "Freq" column).
    lines = ["#Gene set\tP-value\tWeight"]
    lines.extend("\t".join([", ".join(s), str(p), str(w)])
                 for s, p, w in rows)
    outputFile = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(lines))

    return rows
Beispiel #2
0
def run( args ):
    """Exhaustively score all gene sets of size k and write them to a TSV.

    Reads ``mutation_matrix``, ``gene_file``, ``patient_file``, ``min_freq``,
    ``gene_set_size``, ``weight_func``, ``verbose`` and ``output_prefix``
    from *args*. The code runs unchanged on Python 2 and Python 3.

    Returns:
        A list of ``(gene_names, prob, weight)`` tuples sorted by decreasing
        weight, which is also what is written (tab-separated) to
        ``<output_prefix>-k<k>-<weight_func>-exhaustive.tsv``.
    """
    # Parse the arguments into shorter variable handles
    k = args.gene_set_size
    wf = args.weight_func
    # NOTE(review): 1.1 presumably means "keep every set regardless of
    # p-value" -- confirm against C.exhaustive.
    pvalThresh = 1.1

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        # A single parenthesised argument prints identically on Python 2 and 3.
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the CoMEt run and then run exhaustively
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    genes = sorted(geneToIndex, key=lambda g: geneToIndex[g])

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Free the factorial table even if the enumeration raises.
        C.free_factorials()

    # Parse the output: sort by weight decreasing. sorted() is used instead
    # of zip(...).sort() because zip returns an iterator on Python 3.
    solns, weights, tables, probs = results
    ranked = sorted(zip(solns, weights, tables, probs),
                    key=lambda arr: arr[1], reverse=True)
    rows = [(sorted(genes[g] for g in geneset), p, w)
            for geneset, w, _, p in ranked]

    # Output sets, probs and weights as TSV. The header names exactly the
    # three fields each row contains (it used to list an extra "Freq").
    output = ["#Gene set\tP-value\tWeight"]
    output.extend("\t".join([", ".join(s), str(p), str(w)])
                  for s, p, w in rows)
    outputFile = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(output))

    return rows
Beispiel #3
0
def run(args):
    """Run the CoMEt MCMC sampler and write the sampled collections to a TSV.

    With ``args.num_initial > 1`` one chain is run per initial solution and
    all chains are repeatedly extended (total length grows by a factor of
    1.5 per round) until ``C.discrete_convergence`` falls below
    ``args.total_distance_cutoff`` or the next length would exceed
    ``args.n_stop``. Otherwise a single chain of ``args.num_iterations``
    iterations is run. The code runs unchanged on Python 2 and Python 3.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    # Parse the arguments into shorter variable handles
    ks = args.gene_set_sizes  # size of each pathway
    t = len(ks)  # number of pathways
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor applied to non-converged chains

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n, _genes, _patients, _geneToCases, _patientToGenes = mutations

    if args.subtype:
        with open(args.subtype) as f:
            subSet = [l.rstrip() for l in f]
    else:
        subSet = []

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    C.set_random_seed(args.seed)

    # Store the scores of pre-computed collections into C
    if args.precomputed_scores:
        load_precomputed_scores(args.precomputed_scores, mutations, subSet)

    # num_initial > 1: perform the convergence pipeline; otherwise one run only
    if args.num_initial > 1:
        # Collect initial solutions from users, multidendrix and random.
        initialSolns, totalOut = initial_solns_generator(
            args.num_initial, mutations, ks, args.initial_soln, subSet)
        runN = N
        while True:
            lastSolns = []
            for i, init in enumerate(initialSolns):
                outresults, lastSoln = comet(mutations, n, t, ks, runN, s,
                                             init, acc, subSet, nt,
                                             hybridCutoff, args.exact_cut,
                                             True)
                # Floor division keeps the integer value the Python 2
                # print statement reported.
                maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                print("Mem usage:  %s" % (maxrss // 1000,))
                merge_runs(totalOut[i], outresults)
                lastSolns.append(lastSoln)

            finalTv = C.discrete_convergence(totalOut, int(N / s))
            print("%s %s" % (finalTv, N))

            newN = int(N * NInc)
            if newN > NStop or finalTv < args.total_distance_cutoff:
                break
            # Only the additional iterations are run in the next round, each
            # chain resuming from where it stopped.
            runN = newN - N
            N = newN
            initialSolns = lastSolns

        runNum = len(totalOut)
        results = merge_results(totalOut)
        printParameters(args, ks,
                        finalTv)  # store and output parameters into .json

    else:
        results, _ = comet(mutations, n, t, ks, N, s, [], acc, subSet, nt,
                           hybridCutoff, args.exact_cut, True)
        runNum = 1
        printParameters(args, ks, 1)

    C.free_factorials()

    # Output CoMEt results to TSV, best total weight first
    collections = sorted(results,
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    header = "#Freq\tTotal Weight\tTarget Weight\t"
    header += "\t".join(
        "Gene set %s (k=%s)\tProb %s\tWeight function %s" % (i, k, i, i)
        for i, k in enumerate(ks, 1))
    tbl = [header]
    for S in collections:
        data = results[S]
        row = [
            data["freq"], data["total_weight"],
            format(data["target_weight"], 'g')
        ]
        for d in sorted(data["sets"], key=lambda d: d["W"]):
            row += [", ".join(sorted(d["genes"])), d["prob"], d["num_tbls"]]
        tbl.append("\t".join(map(str, row)))

    outputFile = "%s.tsv" % iter_num(args.output_prefix + '.sum', N * runNum,
                                     ks, args.accelerator)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(tbl))

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
Beispiel #4
0
def run( args ):
    """Run the CoMEt MCMC sampler and write the sampled collections to a TSV.

    For ``args.num_initial > 1`` one chain per initial solution is run and
    all chains are extended round by round (total length x1.5 each round)
    until ``C.discrete_convergence`` is below ``args.total_distance_cutoff``
    or the next length would exceed ``args.n_stop``; otherwise one chain of
    ``args.num_iterations`` iterations is run. The code runs unchanged on
    Python 2 and Python 3.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    # Parse the arguments into shorter variable handles
    ks    = args.gene_set_sizes      # size of each pathway
    t     = len(ks)                  # number of pathways
    N     = args.num_iterations      # number of iterations
    s     = args.step_length         # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5                       # growth factor for non-converged chains

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n, _genes, _patients, _geneToCases, _patientToGenes = mutations

    if args.subtype:
        with open(args.subtype) as f:
            subSet = [ l.rstrip() for l in f ]
    else:
        subSet = []

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    C.set_random_seed(args.seed)

    # Store the scores of pre-computed collections into C
    if args.precomputed_scores:
        load_precomputed_scores(args.precomputed_scores, mutations, subSet)

    # num_initial > 1: perform the convergence pipeline; otherwise one run only
    if args.num_initial > 1:
        # Collect initial solutions from users, multidendrix and random.
        initialSolns, totalOut = initial_solns_generator(
            args.num_initial, mutations, ks, args.initial_soln, subSet)
        runN = N
        while True:
            lastSolns = []
            for i, init in enumerate(initialSolns):
                outresults, lastSoln = comet(mutations, n, t, ks, runN, s,
                                             init, acc, subSet, nt,
                                             hybridCutoff, args.exact_cut,
                                             True)
                # Floor division keeps the integer value the Python 2
                # print statement reported.
                maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                print("Mem usage:  %s" % (maxrss // 1000,))
                merge_runs(totalOut[i], outresults)
                lastSolns.append(lastSoln)

            finalTv = C.discrete_convergence(totalOut, int(N/s))
            print("%s %s" % (finalTv, N))

            newN = int(N*NInc)
            if newN > NStop or finalTv < args.total_distance_cutoff:
                break
            # Next round runs only the extra iterations, resuming each chain
            # from its last solution.
            runN = newN - N
            N = newN
            initialSolns = lastSolns

        runNum = len(totalOut)
        results = merge_results(totalOut)
        printParameters(args, ks, finalTv) # store and output parameters into .json

    else:
        results, _ = comet(mutations, n, t, ks, N, s, [], acc, subSet, nt,
                           hybridCutoff, args.exact_cut, True)
        runNum = 1
        printParameters(args, ks, 1)

    C.free_factorials()

    # Output CoMEt results to TSV, best total weight first
    collections = sorted(results, key=lambda S: results[S]["total_weight"],
                         reverse=True)
    # Translates the per-set "num_tbls" code into the letter written to the
    # "Weight function" column; codes outside 0-3 raise KeyError.
    weight_func_mapping = {0: 'E', 1: 'E', 2: 'B', 3: 'P'}
    header = "#Freq\tTotal Weight\tTarget Weight\t"
    header += "\t".join(
        "Gene set %s (k=%s)\tPhi %s\tWeight function %s" % (i, k, i, i)
        for i, k in enumerate(ks, 1))
    tbl = [header]
    for S in collections:
        data = results[S]
        row = [ data["freq"], data["total_weight"],
                format(data["target_weight"], 'g') ]
        for d in sorted(data["sets"], key=lambda d: d["W"]):
            row += [ ", ".join(sorted(d["genes"])), d["prob"],
                     weight_func_mapping[d["num_tbls"]] ]
        tbl.append("\t".join(map(str, row)))

    outputFile = "%s.tsv" % iter_num(args.output_prefix + '.sum', N*runNum,
                                     ks, args.accelerator)
    with open(outputFile, "w") as outfile:
        outfile.write( "\n".join(tbl) )

    return [ (S, results[S]["freq"], results[S]["total_weight"])
             for S in collections ]
Beispiel #5
0
def run( args ):
    """Run CoMEt on the real data and on permuted matrices, then visualise.

    Steps:
      1. Run CoMEt (without visualisation) on the real mutation matrix,
         writing to ``<output_directory>/comet-results``.
      2. Generate ``args.num_permutations`` permuted matrices and run CoMEt
         on each, optionally in parallel.
      3. Take the maximum statistic (second column of the first data row of
         each permutation result ``.tsv``) over all permuted runs.
      4. Feed the real results, the real mutation data and that maximum to
         ``C.output_comet_viz``.

    Permuted matrices and their results live in ``args.output_directory``
    when ``args.keep_temp_files`` is set, otherwise in a temporary directory
    that is removed on exit, including when a step fails.

    Raises:
        IOError: if the run on real data produced no ``.tsv`` result file.
    """
    def withoutFlags(flags):
        # sys.argv[i] is the token that precedes `arg` on the command line,
        # so this drops every listed flag together with the token after it.
        # NOTE(review): for value-less switches (e.g. --parallel) this also
        # drops the next, unrelated token -- confirm against the parser.
        return [ arg for i, arg in enumerate(sys.argv[1:])
                 if arg not in flags and sys.argv[i] not in flags ]

    # Set up the arguments for a general CoMEt run on real data
    realOutputDir = "{}/comet-results".format(args.output_directory)
    realCometArgs = withoutFlags(["-np", "--parallel", "--keep_temp_files", "-o"])
    realCometArgs += [ "-o", realOutputDir, "--noviz" ]
    # Perform a simple run without viz first.
    runComet(realCometArgs)

    # Load the real mutation data
    realMutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                         args.gene_file, args.min_freq, args.subtype)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = realMutations

    if args.verbose:
        print('* Mutation data: %s genes x %s patients' % (m, n))

    # Construct bipartite graph from mutation data
    if args.verbose: print("* Creating bipartite graph...")
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has', len( G.edges() ), 'edges among', len( G.nodes() ), 'nodes.')

    # Reset the arguments for a general CoMEt run on permuted matrices
    cometArgs = withoutFlags(["-np", "--parallel", "--keep_temp_files", "-m", "-o"])
    cometArgs.append('--noviz')

    # Create the permuted matrices, and then run them through CoMEt
    import tempfile
    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # Generate random seeds for each permutation
        random.seed(args.seed)
        seeds = [ random.randint(0, 2**31-1) for _ in range(args.num_permutations) ]

        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar
            sys.stdout.write("* Running CoMEt on permuted matrices... {}/{}\r".format(i+1, args.num_permutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it as a temporary file.
            # The permuted mapping gets its own name so the real
            # geneToCases/patientToGenes stay intact for the visualisation.
            permuted = C.permute_mutation_data(G, genes, patients, seed, args.Q)
            _, _, _, _, _, permutedPatientToGenes = permuted
            adj_list = [ p + "\t" + "\t".join( sorted(permutedPatientToGenes[p]) )
                         for p in patients ]

            permutation_file = "{}/permuted-matrix-{}.m2".format(directory, i+1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments
            permuteArgs = [ str(a) for a in cometArgs ]
            permuteArgs += [ "-m", permutation_file ]
            permuteArgs += [ "-o", "{}/comet-results-on-permutation-{}".format(directory, i+1) ]
            arguments.append( permuteArgs )

        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets
        from itertools import islice
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results-on-permutation"):
                continue
            permutedResultsDir = "{}/{}/results".format(directory, rf)
            for df in os.listdir(permutedResultsDir):
                if not df.endswith(".tsv"):
                    continue
                with open("{}/{}".format(permutedResultsDir, df)) as infile:
                    # islice(…, 1, 2) yields only the second line, i.e. the
                    # first row after the header.
                    for line in islice(infile, 1, 2):
                        score = float(line.split("\t")[1])
                        if score > maxStat:
                            maxStat = score

        print("*" * 80)
        print("Number of permutations:", args.num_permutations)
        print("Max statistic:", maxStat)

        # Load the CoMEt results on the real data. If several .tsv files
        # exist, the last one listed wins, as before.
        realResultsDir = "{}/results/".format(realOutputDir)
        resultsTable = None
        for rf in os.listdir(realResultsDir):
            if rf.endswith(".tsv"):
                with open("{}/results/{}".format(realOutputDir, rf)) as infile:
                    resultsTable = [ l.rstrip() for l in infile ]
        if resultsTable is None:
            raise IOError("No CoMEt result (.tsv) file found in {}".format(realResultsDir))

        realMutations = (m, n, genes, patients, geneToCases, patientToGenes )
        outputDirViz = realOutputDir + "/viz/"
        C.ensure_dir(outputDirViz)

        # Perform visualization
        C.output_comet_viz(RC.get_parser().parse_args(realCometArgs), realMutations,
            resultsTable, maxStat, args.num_permutations)
    finally:
        # Destroy the temporary directory if necessary, even on failure
        if not args.keep_temp_files:
            import shutil
            shutil.rmtree(directory)
Beispiel #6
0
def run(args):
    """Run the CoMEt MCMC sampler and hand the results to ``C.output_comet``.

    When ``args.num_initial > 1`` one chain is run per initial solution and
    the chains are extended round by round (total length x1.5 each round)
    until ``C.discrete_convergence`` is below ``args.total_distance_cutoff``
    or the next length would exceed ``args.n_stop``. Otherwise a single chain
    of ``args.num_iterations`` iterations is run.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    ###########################################################################
    # Short handles for the sampler settings
    pathwaySizes = args.gene_set_sizes
    numPathways = len(pathwaySizes)
    totalIters = args.num_iterations
    stepLength = args.step_length
    iterLimit = args.n_stop
    accelerator = args.accelerator
    nt = args.nt
    binomCutoff = args.binom_cut
    growth = 1.5  # factor by which non-converged chains are lengthened

    # Load the mutation data and drop the trailing subtype entry so that
    # `mutations` is the six-field tuple passed to the sampler.
    loaded = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                  args.gene_file, args.min_freq, args.subtype)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = loaded
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    ###########################################################################
    if args.verbose:
        print(f'Mutation data: {m} genes x {n} patients')

    # Subtypes, optionally extended by the events listed in the core file
    if args.core_events:
        with open(args.core_events) as handle:
            coreEvents = set([line.rstrip() for line in handle])
            subSet = list(subtypes.union(coreEvents))
    else:
        subSet = list(subtypes)

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    C.set_random_seed(args.seed)

    # Store the scores of pre-computed collections into C
    if args.precomputed_scores:
        C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

    def sample(numIters, startSoln):
        # One MCMC run of `numIters` iterations from `startSoln`.
        return comet(mutations, n, numPathways, pathwaySizes, numIters,
                     stepLength, startSoln, accelerator, subSet, nt,
                     binomCutoff, args.exact_cut, args.verbose)

    if args.num_initial > 1:
        # Convergence pipeline: initial solutions come from users,
        # multidendrix and random.
        startSolns, chains = C.initial_solns_generator(
            args.num_initial, mutations, pathwaySizes, args.initial_soln,
            subSet, importMultidendrix, multi_dendrix)
        itersThisRound = totalIters
        while True:
            endSolns = []
            for idx, startSoln in enumerate(startSolns):
                chainOut, endSoln = sample(itersThisRound, startSoln)
                C.merge_runs(chains[idx], chainOut)
                endSolns.append(endSoln)

            finalTv = C.discrete_convergence(chains,
                                             int(totalIters / stepLength))
            print(finalTv, totalIters)

            nextTotal = int(totalIters * growth)
            if nextTotal > iterLimit or finalTv < args.total_distance_cutoff:
                break
            # Only the additional iterations are run next round, each chain
            # resuming from where it stopped.
            itersThisRound, totalIters = nextTotal - totalIters, nextTotal
            startSolns = endSolns

        runNum = len(chains)
        results = C.merge_results(chains)

    else:
        # Single run from an empty initial solution
        results, _ = sample(totalIters, list())
        runNum = 1

    C.free_factorials()

    # Output comet results to TSV and website, best total weight first
    ranked = sorted(results.keys(),
                    key=lambda S: results[S]["total_weight"],
                    reverse=True)
    C.output_comet(args, mutations, results, ranked, pathwaySizes,
                   totalIters * runNum, 0, 0)

    summary = []
    for S in ranked:
        entry = results[S]
        summary.append((S, entry["freq"], entry["total_weight"]))
    return summary
Beispiel #7
0
def run( args ):
    """Build the CoMEt marginal-probability-graph web page.

    Loads the mutation data and a CoMEt results file, constructs the
    marginal probability graph, chooses the edge-weight cutoff (delta),
    renders the delta curve as SVG and writes ``index.html`` (template plus
    an embedded JSON payload) and the required JS/CSS files into
    ``args.output_directory``. The code runs unchanged on Python 2 and
    Python 3.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    eventNamesFile = args.event_names
    msf            = args.minimum_sampling_frequency
    inputFile      = args.input_file
    statsFile      = args.comet_stats_file
    sec            = args.standard_error_cutoff
    mew            = args.minimum_edge_weight

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    _, _, genes, _, _, _ = mutations
    eventNames = load_event_names(eventNamesFile, genes)

    ###########################################################################
    # Read the max weight from random data if users provide a stats file.
    # Otherwise, maxPermutedWeight = 0
    if statsFile:
        with open(statsFile) as f:
            permutationStats = json.load(f)
        maxPermutedWeight = permutationStats['maxPermutedWeight']
        N = permutationStats['numPermutations']
    else:
        maxPermutedWeight, N = 0, 0

    ###########################################################################
    # Construct marginal probability graph from the input CoMEt results file
    if args.verbose: print("* Constructing marginal probability graph...")

    res = construct_mp_graph( inputFile, eventNames, msf, maxPermutedWeight )
    MPG, tables, passPoint = res
    edges = MPG.edges(data=True)

    if args.verbose: print("\t- Edges: %s" % (len(edges),))

    # Choose delta (the minimum edge weight in the marginal probability
    # graph) using a heuristic approach that selects delta at first elbow
    # with slope change > 0 using linear regression
    if args.verbose: print("* Choosing delta...")

    deltas = sorted(set( d['weight'] for u, v, d in MPG.edges(data=True) ))
    realEdgeDist = compute_edge_dist(MPG, deltas)
    deltaPoint, edgeno = choose_delta(deltas, realEdgeDist, passPoint, sec)

    if args.verbose: print("\t- Delta:  %s" % (deltaPoint,))

    ###########################################################################
    # Create the web output

    # Dictionary of web output
    obj = {
        "N": N,
        "mm": ['max-derivative'],
        "deltas": deltas,
        "edge_dist": realEdgeDist,
        "collections": {
            "max-derivative": {
                "more_extreme": 0,
                "pval": [0],
                "components": list(),
                "delta": deltaPoint
            }
        }
    }

    # For every selection method in obj["mm"], record its delta, its p-value
    # list and `cdelta`, the candidate delta closest to the chosen one. The
    # loop variable is not called `m`, which Python 2 would leak into the
    # function scope.
    collections = obj["collections"]
    deltas = [ dict(delta=collections[method]["delta"],
                    pval=collections[method]["pval"], method=method,
                    cdelta=min(obj["deltas"],
                               key=lambda x: abs(x - collections[method]["delta"])))
               for method in obj["mm"] ]

    # Write the delta plot to file as an SVG, then load
    # it so we can embed it in the web page
    if args.verbose: print("* Plotting delta curve...")
    # mkstemp creates the file atomically (mktemp only returns a name that
    # another process could claim first); only the path is needed here.
    fd, tmp = tempfile.mkstemp(suffix=".svg", prefix=".tmp", dir=".")
    os.close(fd)
    try:
        delta_plot(obj, tmp, passPoint, deltaPoint, edgeno)

        # Read in the graph, skipping the first four lines that are
        # extraneous header information that will only confuse a webpage
        with open(tmp) as f:
            plot = "".join(f.readlines()[4:])
    finally:
        # Remove the temporary SVG even if plotting or reading failed.
        os.unlink(tmp)
    stats = dict(deltas=deltas, plot=plot, N=N)

    # Combine everything to create the D3 data
    if args.verbose: print("* Creating GD3 data...")
    graphData = gd3_graph(MPG, eventNames, mew)
    genesInResults = MPG.nodes()
    sampleToType = None
    if args.sample_types_file:
        # Each line is "<sample>\t<type>".
        with open(args.sample_types_file) as f:
            sampleToType = dict( l.rstrip().split("\t") for l in f )
    mutations = gd3_mutation_data(*mutations, genespace=genesInResults,
                                  eventNames=eventNames, sampleToType=sampleToType)

    # Output the results to an HTML file
    if args.verbose: print("* Outputting...")
    htmlOutput = "{}/index.html".format(args.output_directory)
    jsonData = json.dumps( dict(graph=graphData, mutations=mutations,
                                tables=tables, stats=stats) )
    with open(args.template_file) as template, open(htmlOutput, "w") as outfile:
        html = template.read()
        html += "\n<script>\nvar data = {};\ndplusViz(data);\n</script>\n".format(jsonData)
        outfile.write( html )

    # Then copy the required JS and CSS files (source paths are relative to
    # the current working directory)
    import shutil
    shutil.copyfile("comet/src/js/comet-viz.js", "{}/comet-viz.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/mp-graph.js", "{}/mp-graph.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/bower.json", "{}/bower.json".format(args.output_directory))
    shutil.copyfile("comet/src/css/style.css", "{}/style.css".format(args.output_directory))
Beispiel #8
0
def run(args):
    """Run CoMEt on permuted matrices and record the maximum statistic.

    Generates ``args.num_permutations`` permuted mutation matrices, runs
    CoMEt on each (optionally in parallel), takes the maximum of the second
    column of the first data row over all ``comet-results*`` files and
    writes it, together with the run parameters, to
    ``<output_directory>/comet-stats.json``.

    Permuted matrices and results live in ``args.output_directory`` when
    ``args.keep_temp_files`` is set, otherwise in a temporary directory that
    is removed on exit, including when a step fails. The code runs unchanged
    on Python 2 and Python 3.
    """
    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n, genes, patients, geneToCases, patientToGenes = mutations

    if args.verbose:
        print('* Mutation data: %s genes x %s patients' % (m, n))

    # Construct bipartite graph from mutation data
    if args.verbose:
        print("* Creating bipartite graph...")
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has %s edges among %s nodes.' %
              (len(G.edges()), len(G.nodes())))

    # Set up the arguments for a general CoMEt run. sys.argv[i] is the token
    # preceding `arg`, so each listed flag is dropped together with the token
    # that follows it.
    # NOTE(review): for value-less switches (e.g. --parallel) this also drops
    # the next, unrelated token -- confirm against the argument parser.
    permuteFlags = ["-np", "--parallel", "--keep_temp_files", "-m", "-o"]
    cometArgs = [
        arg for i, arg in enumerate(sys.argv[1:])
        if arg not in permuteFlags and sys.argv[i] not in permuteFlags
    ]

    # Create the permuted matrices, and then run them through CoMEt
    import tempfile
    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # Generate random seeds for each permutation
        random.seed(args.seed)
        seeds = [
            random.randint(0, 2**31 - 1) for _ in range(args.num_permutations)
        ]

        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar
            sys.stdout.write(
                "* Running CoMEt on permuted matrices... {}/{}\r".format(
                    i + 1, args.num_permutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it as a temporary file
            permuted = C.permute_mutation_data(G, genes, patients, seed,
                                               args.Q)
            _, _, _, _, _, permutedPatientToGenes = permuted
            adj_list = [
                p + "\t" + "\t".join(sorted(permutedPatientToGenes[p]))
                for p in patients
            ]

            permutation_file = "{}/permuted-matrix-{}.m2".format(
                directory, i + 1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments. A list is built explicitly because
            # map() returns an iterator on Python 3 and cannot be extended.
            permuteArgs = [str(a) for a in cometArgs]
            permuteArgs += ["-m", permutation_file]
            permuteArgs += [
                "-o",
                "{}/comet-results-on-permutation-{}".format(directory, i + 1)
            ]
            arguments.append(permuteArgs)

        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets
        from itertools import islice
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results"):
                continue
            with open("{}/{}".format(directory, rf)) as infile:
                # islice(…, 1, 2) yields only the second line, i.e. the
                # first row after the header.
                for line in islice(infile, 1, 2):
                    score = float(line.split("\t")[1])
                    if score > maxStat:
                        maxStat = score

        print("*" * 80)
        print("Number of permutations: %s" % (args.num_permutations,))
        print("Max statistic: %s" % (maxStat,))

        # Output the results to files
        with open("{}/comet-stats.json".format(args.output_directory),
                  "w") as outfile:
            # NOTE(review): "mutationNatrix" looks like a typo for
            # "mutationMatrix"; the key is kept as-is so existing readers of
            # this file are not broken.
            output = dict(maxPermutedWeight=maxStat,
                          numPermutations=args.num_permutations,
                          keepTempFiles=args.keep_temp_files,
                          mutationNatrix=args.mutation_matrix,
                          geneFile=args.gene_file,
                          patientFile=args.patient_file,
                          minFreq=args.min_freq,
                          Q=args.Q)
            json.dump(output, outfile, sort_keys=True, indent=4)
    finally:
        # Destroy the temporary directory if necessary, even on failure
        if not args.keep_temp_files:
            import shutil
            shutil.rmtree(directory)
Beispiel #9
0
def run( args ):
    """Build the CoMEt marginal-probability-graph web page.

    Loads the mutation data and a CoMEt results file, constructs the
    marginal probability graph, chooses the edge-weight cutoff (delta),
    renders the delta curve as SVG and writes ``index.html`` (template plus
    an embedded JSON payload) and the required JS/CSS files into
    ``args.output_directory``. The code runs unchanged on Python 2 and
    Python 3.

    Raises:
        SystemExit: with status 1 when the graph has no edges.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    eventNamesFile = args.event_names
    msf            = args.minimum_sampling_frequency
    inputFile      = args.input_file
    statsFile      = args.comet_stats_file
    sec            = args.standard_error_cutoff
    mew            = args.minimum_edge_weight

    # Load the mutation data
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    _, _, genes, _, _, _ = mutations
    eventNames = load_event_names(eventNamesFile, genes)

    ###########################################################################
    # Read the max weight from random data if users provide a stats file.
    # Otherwise, maxPermutedWeight = 0
    if statsFile:
        with open(statsFile) as f:
            permutationStats = json.load(f)
        maxPermutedWeight = permutationStats['maxPermutedWeight']
        N = permutationStats['numPermutations']
    else:
        maxPermutedWeight, N = 0, 0

    ###########################################################################
    # Construct marginal probability graph from the input CoMEt results file
    if args.verbose: print("* Constructing marginal probability graph...")

    res = construct_mp_graph( inputFile, eventNames, msf, maxPermutedWeight )
    MPG, tables, expectedPoint = res
    edges = MPG.edges(data=True)

    if len(edges) == 0: # no significant results
        print("No significant collection. ")
        # Same effect as exit(1), without relying on the `site` builtin.
        raise SystemExit(1)

    if args.verbose: print("\t- Edges: %s" % (len(edges),))

    # Choose delta (the minimum edge weight in the marginal probability
    # graph) using a heuristic approach that selects delta at first elbow
    # with slope change > 0 using linear regression
    if args.verbose: print("* Choosing delta...")

    deltas = sorted(set( d['weight'] for u, v, d in MPG.edges(data=True) ))
    realEdgeDist = compute_edge_dist(MPG, deltas)
    deltaPoint, edgeno = choose_delta(deltas, realEdgeDist, expectedPoint, sec)

    if args.verbose: print("\t- Delta:  %s" % (deltaPoint,))

    ###########################################################################
    # Create the web output

    # Write the delta plot to file as an SVG, then load
    # it so we can embed it in the web page
    if args.verbose: print("* Plotting delta curve...")
    # mkstemp creates the file atomically (mktemp only returns a name that
    # another process could claim first); only the path is needed here.
    fd, tmp = tempfile.mkstemp(suffix=".svg", prefix=".tmp", dir=".")
    os.close(fd)
    try:
        delta_plot(N, deltas, realEdgeDist, tmp, expectedPoint, deltaPoint, edgeno)

        # Read in the graph, skipping the first four lines that are
        # extraneous header information that will only confuse a webpage
        with open(tmp) as f:
            plot = "".join(f.readlines()[4:])
    finally:
        # Remove the temporary SVG even if plotting or reading failed.
        os.unlink(tmp)
    stats = dict(deltas=[dict(delta=deltaPoint, pval=0.)], plot=plot, N=N)

    # Combine everything to create the D3 data
    if args.verbose: print("* Creating GD3 data...")
    graphData = gd3_graph(MPG, eventNames, mew)
    genesInResults = MPG.nodes()
    sampleToType = None
    if args.sample_types_file:
        # Each line is "<sample>\t<type>".
        with open(args.sample_types_file) as f:
            sampleToType = dict( l.rstrip().split("\t") for l in f )
    mutations = gd3_mutation_data(*mutations, genespace=genesInResults,
                                  eventNames=eventNames, sampleToType=sampleToType)

    # Output the results to an HTML file
    if args.verbose: print("* Outputting...")
    htmlOutput = "{}/index.html".format(args.output_directory)
    jsonData = json.dumps( dict(graph=graphData, mutations=mutations,
                                tables=tables, stats=stats) )
    with open(args.template_file) as template, open(htmlOutput, "w") as outfile:
        html = template.read()
        html += "\n<script>\nvar data = {};\ndplusViz(data);\n</script>\n".format(jsonData)
        outfile.write( html )

    # Then copy the required JS and CSS files (source paths are relative to
    # the current working directory)
    import shutil
    shutil.copyfile("comet/src/js/comet-viz.js", "{}/comet-viz.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/mp-graph.js", "{}/mp-graph.js".format(args.output_directory))
    shutil.copyfile("comet/src/js/bower.json", "{}/bower.json".format(args.output_directory))
    shutil.copyfile("comet/src/css/style.css", "{}/style.css".format(args.output_directory))
Beispiel #10
0
def run( args ):
    """Estimate the maximum CoMEt statistic over permuted mutation matrices.

    Loads the mutation data, builds the gene/patient bipartite graph, writes
    ``args.num_permutations`` permuted matrices to disk, runs CoMEt on each of
    them through ``runComet`` and stores the largest score found in the
    result files in ``<args.output_directory>/comet-stats.json``.

    The body is written so that it parses and behaves the same on Python 2
    and Python 3 (print calls with a single pre-formatted string, no
    ``map(...) +=``, spaces-only indentation).

    Args:
        args: parsed command-line namespace. Attributes read here:
            mutation_matrix, patient_file, gene_file, min_freq, verbose,
            keep_temp_files, output_directory, num_permutations, seed, Q
            and parallel.

    Returns:
        None. Results are printed and written to ``comet-stats.json``.
    """
    import random
    import shutil
    import tempfile
    from itertools import islice

    # Load mutation data using Multi-Dendrix
    mutations = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                     args.gene_file, args.min_freq)
    m, n, genes, patients, geneToCases, patientToGenes = mutations

    if args.verbose:
        print('* Mutation data: %s genes x %s patients' % (m, n))
        print("* Creating bipartite graph...")

    # Construct bipartite graph from mutation data
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has {} edges among {} nodes.'.format(
            len( G.edges() ), len( G.nodes() )))

    # Set up the arguments for a general CoMEt run: drop every
    # permutation-specific flag together with the token that follows it.
    # sys.argv[i] is the token *preceding* arg because enumerate() starts at 0
    # over sys.argv[1:].
    # NOTE(review): for flags that take no value (presumably --parallel and
    # --keep_temp_files) this also drops the next, unrelated token -- confirm
    # against the argument parser.
    cometArgs = []
    permuteFlags = ["-np", "--parallel", "--keep_temp_files", "-m", "-o"]
    for i, arg in enumerate(sys.argv[1:]):
        if arg not in permuteFlags and sys.argv[i] not in permuteFlags:
            cometArgs.append( arg )

    numPermutations = args.num_permutations
    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # Derive one seed per permutation from args.seed so the run stays
        # reproducible while each permutation gets a different seed.
        # Passing args.seed unchanged to every call presumably regenerates
        # the same permutation each time (depends on
        # C.permute_mutation_data -- confirm).
        random.seed(args.seed)
        seeds = [ random.randint(0, 2**31 - 1) for _ in range(numPermutations) ]

        # Create the permuted matrices and the CoMEt arguments for each
        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar (out of the number of permutations)
            sys.stdout.write("* Running CoMEt on permuted matrices... {}/{}\r".format(i+1, numPermutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it to a temporary file
            permuted = C.permute_mutation_data(G, genes, patients, seed, args.Q)
            _, _, _, _, _, permutedPatientToGenes = permuted
            adj_list = [ p + "\t" + "\t".join( sorted(permutedPatientToGenes[p]) )
                         for p in patients ]

            permutation_file = "{}/permuted-matrix-{}.m2".format(directory, i+1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments
            permuteArgs = [ str(a) for a in cometArgs ]
            permuteArgs += [ "-m", permutation_file ]
            permuteArgs += [ "-o", "{}/comet-results-on-permutation-{}".format(directory, i+1)]
            arguments.append( permuteArgs )

        # Run CoMEt on every permuted matrix; only the files it writes are used
        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets: the score
        # is the second tab-separated field of the second line of each file.
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results"):
                continue
            with open("{}/{}".format(directory, rf)) as infile:
                for line in islice(infile, 1, 2):
                    score = float(line.split("\t")[1])
                    if score > maxStat:
                        maxStat = score

        print("*" * 80)
        print("Number of permutations: {}".format(args.num_permutations))
        print("Max statistic: {}".format(maxStat))

        # Output the results to files. The key spellings are part of the
        # output format and are therefore left exactly as they were.
        with open("{}/comet-stats.json".format(args.output_directory), "w") as outfile:
            output = dict(maxPermutedWeight=maxStat,
                          numPermutations=args.num_permutations,
                          keepTempFiles=args.keep_temp_files,
                          mutationNatrix=args.mutation_matrix,
                          geneFile=args.gene_file, patientFile=args.patient_file,
                          minFreq=args.min_freq, Q=args.Q)
            json.dump( output, outfile, sort_keys=True, indent=4)
    finally:
        # Destroy the temporary directory if necessary, even after a failure
        if not args.keep_temp_files:
            shutil.rmtree(directory)
Beispiel #11
0
def run(args):
    """Run CoMEt on the real data, calibrate it on permutations, then visualize.

    Steps, in order:
      1. Run CoMEt (``runComet``) on the real data with ``--noviz`` into
         ``<args.output_directory>/comet-results``.
      2. Load the real mutation data and build the gene/patient bipartite
         graph.
      3. Write ``args.num_permutations`` permuted matrices (one derived seed
         each), run CoMEt on all of them and take the largest score found in
         the ``.tsv`` files of their ``results`` directories.
      4. Call ``C.output_comet_viz`` with the real mutation data, the real
         results table, the maximum permuted score and the permutation count.

    Args:
        args: parsed command-line namespace. Attributes read here:
            output_directory, mutation_matrix, patient_file, gene_file,
            min_freq, subtype, verbose, keep_temp_files, seed,
            num_permutations, Q and parallel.

    Returns:
        None.

    Raises:
        FileNotFoundError: if the real CoMEt run left no ``.tsv`` file in its
            ``results`` directory.
    """
    import shutil
    import tempfile
    from itertools import islice

    def forwarded_args(dropFlags):
        # Tokens of sys.argv[1:] that are neither one of dropFlags nor the
        # token directly after one (sys.argv[i] precedes arg because
        # enumerate() starts at 0 over sys.argv[1:]).
        return [
            arg for i, arg in enumerate(sys.argv[1:])
            if arg not in dropFlags and sys.argv[i] not in dropFlags
        ]

    # Set up the arguments for a general CoMEt run on real data
    realOutputDir = "{}/comet-results".format(args.output_directory)
    realCometArgs = forwarded_args(
        ["-np", "--parallel", "--keep_temp_files", "-o"])
    realCometArgs += ["-o", realOutputDir, "--noviz"]
    # perform simple run without viz first; only its output files are used.
    runComet(realCometArgs)

    # Load mutation data using Multi-Dendrix
    loaded = C.load_mutation_data(args.mutation_matrix, args.patient_file,
                                  args.gene_file, args.min_freq, args.subtype)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = loaded

    if args.verbose:
        print(f'* Mutation data: {m} genes x {n} patients')
        print('* Creating bipartite graph...')

    # Construct bipartite graph from mutation data
    G = C.construct_mutation_graph(geneToCases, patientToGenes)
    if args.verbose:
        print('\t- Graph has', len(G.edges()), 'edges among', len(G.nodes()),
              'nodes.')

    # reset the arguments for a general CoMEt run on permuted matrices
    cometArgs = forwarded_args(
        ["-np", "--parallel", "--keep_temp_files", "-m", "-o"])
    cometArgs.append('--noviz')

    if args.keep_temp_files:
        directory = args.output_directory
    else:
        directory = tempfile.mkdtemp(dir=".", prefix=".tmp")

    try:
        # Generate random seeds for each permutation
        random.seed(args.seed)
        seeds = [
            random.randint(0, 2**31 - 1) for _ in range(args.num_permutations)
        ]

        arguments = []
        for i, seed in enumerate(seeds):
            # Print simple progress bar
            sys.stdout.write(
                "* Running CoMEt on permuted matrices... {}/{}\r".format(
                    i + 1, args.num_permutations))
            sys.stdout.flush()

            # Create a permuted dataset and save it to a temporary file.
            # The permuted mapping gets its own name so that the real
            # geneToCases / patientToGenes are still intact for the
            # visualization below.
            permuted = C.permute_mutation_data(G, genes, patients, seed,
                                               args.Q)
            _, _, _, _, _, permutedPatientToGenes = permuted
            adj_list = [
                p + "\t" + "\t".join(sorted(permutedPatientToGenes[p]))
                for p in patients
            ]

            permutation_file = "{}/permuted-matrix-{}.m2".format(
                directory, i + 1)
            with open(permutation_file, 'w') as outfile:
                outfile.write('\n'.join(adj_list))

            # Add the new arguments
            permuteArgs = [str(a) for a in cometArgs]
            permuteArgs += ["-m", permutation_file]
            permuteArgs += [
                "-o",
                "{}/comet-results-on-permutation-{}".format(directory, i + 1)
            ]
            arguments.append(permuteArgs)

        # Run CoMEt on every permuted matrix; only the files it writes are used
        if args.parallel:
            pool = mp.Pool(25)
            pool.map(runComet, arguments)
            pool.close()
            pool.join()
        else:
            for permuteArgs in arguments:
                runComet(permuteArgs)

        # Find the maximum test statistic on the permuted datasets: the score
        # is the second tab-separated field of the second line of each .tsv.
        maxStat = 0
        for rf in os.listdir(directory):
            if not rf.startswith("comet-results-on-permutation"):
                continue
            for df in os.listdir("{}/{}/results".format(directory, rf)):
                if not df.endswith(".tsv"):
                    continue
                with open("{}/{}/results/{}".format(directory, rf,
                                                    df)) as infile:
                    for line in islice(infile, 1, 2):
                        score = float(line.split("\t")[1])
                        if score > maxStat:
                            maxStat = score

        print("*" * 80)
        print("Number of permutations:", args.num_permutations)
        print("Max statistic:", maxStat)

        # Prepare comet results on real, mutation data, and output directory
        # for viz. When several .tsv files exist the last one listed is used,
        # as before; the file is now closed after reading.
        realResultFiles = [
            rf for rf in os.listdir("{}/results/".format(realOutputDir))
            if (not rf.startswith('.') and rf.endswith(".tsv"))
        ]
        if not realResultFiles:
            raise FileNotFoundError(
                "No CoMEt .tsv results found in {}/results/".format(
                    realOutputDir))
        with open("{}/results/{}".format(realOutputDir,
                                         realResultFiles[-1])) as infile:
            resultsTable = [l.rstrip() for l in infile]

        realMutations = (m, n, genes, patients, geneToCases, patientToGenes)
        outputDirViz = realOutputDir + "/viz/"
        C.ensure_dir(outputDirViz)

        # Perform visualization
        C.output_comet_viz(RC.get_parser().parse_args(realCometArgs),
                           realMutations, resultsTable, maxStat,
                           args.num_permutations)
    finally:
        # Destroy the temporary directory if necessary, even after a failure
        if not args.keep_temp_files:
            shutil.rmtree(directory)
Beispiel #12
0
def run( args ):
    """Run the CoMEt MCMC search and write its results.

    Loads the mutation data, optionally extends the subtype set with core
    events and loads precomputed scores, then either runs a single chain
    (``args.num_initial <= 1``) or several chains that are repeatedly extended
    until ``C.discrete_convergence`` drops below
    ``args.total_distance_cutoff`` or the next iteration budget would exceed
    ``args.n_stop``. The merged results are handed to ``C.output_comet``.

    Args:
        args: parsed command-line namespace.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        decreasing total weight.
    """
    ###########################################################################
    # Short handles for the command-line options
    matrixPath = args.mutation_matrix
    genePath = args.gene_file
    patientPath = args.patient_file
    freqFloor = args.min_freq
    subtypePath = args.subtype
    numInitial = args.num_initial          # read up front, as before
    ks = args.gene_set_sizes               # size of each pathway
    t = len(ks)                            # number of pathways
    N = args.num_iterations                # number of iterations
    s = args.step_length                   # step
    iterationCap = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    growth = 1.5                           # budget multiplier for non-converged chains

    # Load the mutation data; the subtypes are kept apart from the 6-tuple
    # that the rest of the pipeline consumes.
    loaded = C.load_mutation_data(matrixPath, patientPath, genePath,
                                  freqFloor, subtypePath)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = loaded
    mutations = ( m, n, genes, patients, geneToCases, patientToGenes )

    ###########################################################################
    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Subtypes, plus the core events listed one per line when a file is given
    if args.core_events:
        with open(args.core_events) as f:
            coreEvents = set( [ line.rstrip() for line in f ] )
            subSet = list( subtypes.union( coreEvents ) )
    else:
        subSet = list( subtypes )

    # Precompute factorials
    C.precompute_factorials(max(m, n))
    C.set_random_seed(args.seed)

    # Hand the scores of pre-computed collections over to C
    if args.precomputed_scores:
        C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

    if args.num_initial > 1:
        # Convergence pipeline: initial solutions come from the user,
        # multidendrix and random draws.
        initialSolns, totalOut = C.initial_solns_generator(
            args.num_initial, mutations, ks, args.initial_soln, subSet,
            importMultidendrix, multi_dendrix)
        budget = N
        while True:
            lastSolns = []
            for idx, start in enumerate(initialSolns):
                chainOut, chainEnd = comet(
                    mutations, n, t, ks, budget, s, start, acc, subSet, nt,
                    hybridCutoff, args.exact_cut, args.verbose)
                C.merge_runs(totalOut[idx], chainOut)
                lastSolns.append(chainEnd)

            finalTv = C.discrete_convergence(totalOut, int(N/s))
            print(finalTv, N)

            # Stop once converged or when growing the budget would pass the cap
            grownN = int(N*growth)
            if grownN > iterationCap or finalTv < args.total_distance_cutoff:
                break

            # Otherwise continue every chain from where it stopped, running
            # only the additional iterations.
            budget, N = grownN - N, grownN
            initialSolns = lastSolns

        runNum = len(totalOut)
        results = C.merge_results(totalOut)

    else:
        # Single run from an empty initial solution
        results, _ = comet(
            mutations, n, t, ks, N, s, list(), acc, subSet, nt,
            hybridCutoff, args.exact_cut, args.verbose)
        runNum = 1

    C.free_factorials()

    # Output comet results to TSV and website
    def total_weight(S):
        return results[S]["total_weight"]

    collections = sorted(results, key=total_weight, reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N*(runNum), 0, 0)

    summary = []
    for S in collections:
        summary.append( (S, results[S]["freq"], results[S]["total_weight"]) )
    return summary