Beispiel #1
0
def load_precomputed_scores(infile, mutations, subt):
    """Read precomputed gene-set scores from a TSV file and hand them to ``C``.

    The number of gene sets per row is derived from the file name: it must
    contain a ``.k<digits>.`` component, and the *count* of digits is used
    as the number of gene sets (presumably one digit per gene-set size --
    TODO confirm against the code that writes these files).

    Every non-comment, non-blank row is split on tabs.  The first three
    columns (sampling freq., total weight, target weight) are skipped; after
    them each gene set occupies three columns: the ``", "``-separated gene
    names, a score, and a weight-function id.  Each triple is forwarded to
    ``C.load_precomputed_scores(score, set_size, weight_function, indices)``.

    Args:
        infile: Path of the score file; its name must match ``.k<digits>.``.
        mutations: Tuple passed positionally to
            ``C.convert_mutations_to_C_format``.
        subt: Optional extra element appended to ``mutations`` when truthy.

    Raises:
        ValueError: If ``infile`` does not contain a ``.k<digits>.`` tag.
        KeyError: If a gene name in the file is missing from the
            gene-to-index mapping.
    """
    baseI = 3  # sampling freq., total weight, target weight
    setI = 3  # gene set, score, weight function

    # Validate the file name first so a bad path fails fast with a clear
    # message instead of an AttributeError on ``None.group``.
    matchObj = re.match(r'.+\.k(\d+)\..+?', infile)
    if matchObj is None:
        raise ValueError(
            "cannot determine the number of gene sets from file name %r: "
            "expected a '.k<digits>.' component" % (infile,))
    loadingT = len(matchObj.group(1))  # determine t: the number of gene sets.

    if subt:
        mutations = mutations + (subt,)
    # Only the gene -> index mapping is needed from the converted data.
    _, _, _, geneToIndex, _ = C.convert_mutations_to_C_format(*mutations)

    # The context manager guarantees the file is closed, even on error.
    with open(infile) as scoreFile:
        for line in scoreFile:
            # Skip comment lines and blank lines (the latter would otherwise
            # fail with an IndexError below).
            if line.startswith("#") or not line.strip():
                continue
            v = line.rstrip().split("\t")
            for i in range(loadingT):
                offset = baseI + i * setI
                geneNames = v[offset].split(", ")
                gSet = [geneToIndex[g] for g in geneNames]
                C.load_precomputed_scores(float(v[offset + 1]),
                                          len(geneNames),
                                          int(v[offset + 2]), gSet)
Beispiel #2
0
def run(args):
    """Run comet on the mutation data described by ``args`` and output results.

    Steps performed, in order:

    1. Load the mutation data via ``C.load_mutation_data``.
    2. Build ``subSet`` from the loaded subtypes, unioned with the lines of
       ``args.core_events`` when that file is given.
    3. Precompute factorials, seed the random generator, and optionally
       load precomputed scores.
    4. If ``args.num_initial > 1``, run ``comet`` once per initial solution
       and repeat with more iterations until ``C.discrete_convergence``
       drops below ``args.total_distance_cutoff`` or the next iteration
       count would exceed ``args.n_stop``.  Otherwise run ``comet`` once
       with an empty initial solution.
    5. Free the factorial table and write the results with
       ``C.output_comet``.

    Args:
        args: Parsed argument namespace.  Attributes read here:
            ``mutation_matrix``, ``gene_file``, ``patient_file``,
            ``min_freq``, ``subtype``, ``num_initial``, ``gene_set_sizes``,
            ``num_iterations``, ``step_length``, ``n_stop``,
            ``accelerator``, ``nt``, ``binom_cut``, ``exact_cut``,
            ``verbose``, ``core_events``, ``seed``, ``precomputed_scores``,
            ``initial_soln`` and ``total_distance_cutoff``.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples, sorted by
        ``total_weight`` in descending order.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    subtypeFile = args.subtype
    t = len(args.gene_set_sizes)  # number of pathways
    ks = args.gene_set_sizes  # size of each pathway
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of N for non-converged chains

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = mutations
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    ###########################################################################
    if args.verbose:
        print(f'Mutation data: {m} genes x {n} patients')

    if args.core_events:
        with open(args.core_events) as f:
            subSet = list(subtypes.union(set([l.rstrip() for l in f])))
    else:
        subSet = list(subtypes)

    # Precompute factorials; the try/finally below makes sure the table is
    # released again even when one of the runs raises.
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # store the scores of pre-computed collections into C
        if args.precomputed_scores:
            C.load_precomputed_scores(args.precomputed_scores, mutations,
                                      subSet)

        # num_initial > 1: perform the convergence pipeline; otherwise
        # perform one run only
        if args.num_initial > 1:
            # collect initial solns from users, multidendrix and random.
            initialSolns, totalOut = C.initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet,
                importMultidendrix, multi_dendrix)
            runN = N
            while True:
                lastSolns = list()
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, args.verbose)
                    C.merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print(finalTv, N)

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Continue every chain from where it stopped, running only
                # the additional iterations needed to reach newN in total.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = C.merge_results(totalOut)

        else:
            results, lastSoln = comet(
                mutations, n, t, ks, N, s, list(), acc, subSet, nt,
                hybridCutoff, args.exact_cut, args.verbose)
            runNum = 1
    finally:
        C.free_factorials()

    # Output comet results to TSV and website
    collections = sorted(results.keys(),
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N * (runNum), 0,
                   0)

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
Beispiel #3
0
def run( args ):
    """Run comet on the mutation data described by ``args`` and output results.

    The mutation data is loaded with ``C.load_mutation_data``, the subtype
    set is optionally extended with the lines of ``args.core_events``, and
    ``comet`` is then executed either once (``args.num_initial <= 1``) or
    once per initial solution, repeatedly, until the value returned by
    ``C.discrete_convergence`` falls below ``args.total_distance_cutoff``
    or the next total iteration count would exceed ``args.n_stop``.
    Results are written with ``C.output_comet``.

    Args:
        args: Parsed argument namespace.  Attributes read here:
            ``mutation_matrix``, ``gene_file``, ``patient_file``,
            ``min_freq``, ``subtype``, ``num_initial``, ``gene_set_sizes``,
            ``num_iterations``, ``step_length``, ``n_stop``,
            ``accelerator``, ``nt``, ``binom_cut``, ``exact_cut``,
            ``verbose``, ``core_events``, ``seed``, ``precomputed_scores``,
            ``initial_soln`` and ``total_distance_cutoff``.

    Returns:
        A list of ``(collection, freq, total_weight)`` tuples ordered by
        descending ``total_weight``.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    subtypeFile = args.subtype
    t     = len(args.gene_set_sizes) # number of pathways
    ks    = args.gene_set_sizes      # size of each pathway
    N     = args.num_iterations      # number of iterations
    s     = args.step_length         # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5                 # growth factor of N for non-converged chains

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile, minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = mutations
    mutations = ( m, n, genes, patients, geneToCases, patientToGenes )

    ###########################################################################
    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    if args.core_events:
        with open(args.core_events) as f:
            subSet = list( subtypes.union( set( [ l.rstrip() for l in f ] ) ) )
    else:
        subSet = list( subtypes )

    # Precompute factorials.  Everything up to the matching free call is
    # wrapped in try/finally so the table is released even if a run fails.
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # store the scores of pre-computed collections into C
        if args.precomputed_scores:
            C.load_precomputed_scores(args.precomputed_scores, mutations, subSet)

        # num_initial > 1, perform convergence pipeline, otherwise, perform one run only
        if args.num_initial > 1:
            # collect initial solns from users, multidendrix and random.
            initialSolns, totalOut = C.initial_solns_generator(args.num_initial,
                mutations, ks, args.initial_soln, subSet,
                importMultidendrix, multi_dendrix)
            runN = N
            while True:
                lastSolns = list()
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(mutations, n, t, ks, runN, s,
                        init, acc, subSet, nt, hybridCutoff, args.exact_cut, args.verbose)
                    C.merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N/s))
                print(finalTv, N)

                newN = int(N*NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Resume each chain from its last solution and run only the
                # extra iterations needed to bring the total up to newN.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = C.merge_results(totalOut)

        else:
            init = list()
            results, lastSoln = comet(mutations, n, t, ks, N, s,
                init, acc, subSet, nt, hybridCutoff, args.exact_cut, args.verbose)
            runNum = 1
    finally:
        C.free_factorials()

    # Output comet results to TSV and website
    collections = sorted(results, key=lambda S: results[S]["total_weight"], reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N*(runNum), 0, 0)

    return [ (S, results[S]["freq"], results[S]["total_weight"]) for S in collections ]