def run(args): mdictfile = args.mdictfile cdictfile = args.cdictfile mprob = args.mprob cprob = args.cprob cooccur_distance_threshold = args.cooccur_distance_threshold bin_distance_threshold = args.bin_distance_threshold mutationmatrix = args.mutation_matrix newmutationmatrix = args.newmutationmatrix file_prefix = args.output_prefix if not file_prefix: file_prefix = newmutationmatrix geneFile = args.gene_file patientFile = args.patient_file gene_blacklist = args.gene_blacklist_file patient_blacklist = args.patient_blacklist_file minFreq = args.min_freq minCooccur = args.min_cooccur min_cooccurrence_ratio = args.min_cooccurrence_ratio top_percentile = args.top_percentile top_number = args.top_number parallel_compute_number = args.parallel_compute_number filter_cooccur_same_segment = args.filter_cooccur_same_segment fcss_cratiothresh = args.fcss_cratiothresh fcss_mutfreqdiffthresh = args.fcss_mutfreqdiffthresh fcss_mutfreqdiffratiothresh = args.fcss_mutfreqdiffratiothresh fcss_coveragethresh = args.fcss_coveragethresh fcss_probabilitythresh = args.fcss_probabilitythresh gene_segment_file = args.gene_segment_file load_gene_segments = args.load_gene_segments is_gene2seg = args.is_gene2seg gene_bin_entries_file = args.gene_bin_entries_file no_throw_out_extras = args.no_throw_out_extras segment_info_file = args.segment_info_file if not gene_bin_entries_file: gene_bin_entries_file = file_prefix + '_binnedgenes.tsv' if not segment_info_file: segment_info_file = file_prefix + '_SEGMENTINFO.tsv' #----------------------------------------------------- mutations = mex.remove_blacklists(gene_blacklist, patient_blacklist, *mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)) numGenes, numCases, genes, patients, geneToCases, patientToGenes = mutations print 'Filtered Mutation data: %s genes x %s patients' % (numGenes, numCases) # Load segment info if load_gene_segments: # extra_genes is the genes not found in the segment file. # If throw_out_extras is False, extra_genes will be empty. geneToBin, extra_genes = load_gene_to_bin(gene_segment_file, geneToCases, no_throw_out_extras=no_throw_out_extras, is_gene2seg=is_gene2seg) numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.remove_extra_genes(extra_genes, numGenes, numCases, genes, patients, geneToCases, patientToGenes) else: print "Beginning bin genes by co-occurring pairs. " genepairs = getgenepairs(geneToCases, genes, closer_than_distance=bin_distance_threshold) print "Pairs retrieved. Calculating cooccurring pairs to make bins." cpairsdict, cgenedict = met.complete_cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, fcss_probabilitythresh, minCooccur, cooccur_distance_threshold, fcss_cratiothresh, parallel_compute_number, filter_cooccur_same_segment, fcss_cratiothresh, fcss_mutfreqdiffratiothresh, fcss_coveragethresh, fcss_probabilitythresh) print "Cooccurring pairs calculated." geneToBin = get_gene_bins_cooccur_same_segment(cpairsdict, geneToCases, fcss_cratiothresh, fcss_mutfreqdiffthresh, fcss_mutfreqdiffratiothresh, fcss_coveragethresh, fcss_probabilitythresh, bin_distance_threshold=bin_distance_threshold) # Write these new bins out new_bins = convert_genes_to_bins(genes, geneToBin) write_segment_infos(new_bins, filename=segment_info_file) print "New SEGMENTINFO written to ", segment_info_file write_gene_positions(new_bins) print "New segment positions appended to gene_positions.txt" # Update to the new mutation matrix. geneToBinSet, bin_setToBin = bin_sets_from_geneToBin(genes, geneToBin) newGeneToCases, newPatientToGenes = update_geneToCases_patientToGenes(geneToCases, patientToGenes, bin_setToBin, at_least_half=True) gene_bin_entries = get_gene_bin_entries(geneToCases, newGeneToCases, geneToBinSet, bin_setToBin) if gene_bin_entries_file: met.writeanydict(gene_bin_entries, gene_bin_entries_file) print "Gene bin entries written to ", gene_bin_entries_file # Write the new mutation matrix out. writemutationmatrix(newPatientToGenes, filename=newmutationmatrix)
def run(args): mdictfile = args.mdictfile cdictfile = args.cdictfile mprob = args.mprob cprob = args.cprob cooccur_distance_threshold = args.cooccur_distance_threshold bin_distance_threshold = args.bin_distance_threshold mutationmatrix = args.mutation_matrix newmutationmatrix = args.newmutationmatrix file_prefix = args.output_prefix if not file_prefix: file_prefix = newmutationmatrix geneFile = args.gene_file patientFile = args.patient_file gene_blacklist = args.gene_blacklist_file patient_blacklist = args.patient_blacklist_file minFreq = args.min_freq minCooccur = args.min_cooccur min_cooccurrence_ratio = args.min_cooccurrence_ratio top_percentile = args.top_percentile top_number = args.top_number parallel_compute_number = args.parallel_compute_number filter_cooccur_same_segment = args.filter_cooccur_same_segment fcss_cratiothresh = args.fcss_cratiothresh fcss_mutfreqdiffthresh = args.fcss_mutfreqdiffthresh fcss_mutfreqdiffratiothresh = args.fcss_mutfreqdiffratiothresh fcss_coveragethresh = args.fcss_coveragethresh fcss_probabilitythresh = args.fcss_probabilitythresh gene_segment_file = args.gene_segment_file load_gene_segments = args.load_gene_segments is_gene2seg = args.is_gene2seg gene_bin_entries_file = args.gene_bin_entries_file no_throw_out_extras = args.no_throw_out_extras segment_info_file = args.segment_info_file if not gene_bin_entries_file: gene_bin_entries_file = file_prefix + '_binnedgenes.tsv' if not segment_info_file: segment_info_file = file_prefix + '_SEGMENTINFO.tsv' #----------------------------------------------------- mutations = mex.remove_blacklists( gene_blacklist, patient_blacklist, *mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)) numGenes, numCases, genes, patients, geneToCases, patientToGenes = mutations print 'Filtered Mutation data: %s genes x %s patients' % (numGenes, numCases) # Load segment info if load_gene_segments: # extra_genes is the genes not found in the segment file. # If throw_out_extras is False, extra_genes will be empty. geneToBin, extra_genes = load_gene_to_bin( gene_segment_file, geneToCases, no_throw_out_extras=no_throw_out_extras, is_gene2seg=is_gene2seg) numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.remove_extra_genes( extra_genes, numGenes, numCases, genes, patients, geneToCases, patientToGenes) else: print "Beginning bin genes by co-occurring pairs. " genepairs = getgenepairs(geneToCases, genes, closer_than_distance=bin_distance_threshold) print "Pairs retrieved. Calculating cooccurring pairs to make bins." cpairsdict, cgenedict = met.complete_cooccurpairs( numCases, geneToCases, patientToGenes, genepairs, fcss_probabilitythresh, minCooccur, cooccur_distance_threshold, fcss_cratiothresh, parallel_compute_number, filter_cooccur_same_segment, fcss_cratiothresh, fcss_mutfreqdiffratiothresh, fcss_coveragethresh, fcss_probabilitythresh) print "Cooccurring pairs calculated." geneToBin = get_gene_bins_cooccur_same_segment( cpairsdict, geneToCases, fcss_cratiothresh, fcss_mutfreqdiffthresh, fcss_mutfreqdiffratiothresh, fcss_coveragethresh, fcss_probabilitythresh, bin_distance_threshold=bin_distance_threshold) # Write these new bins out new_bins = convert_genes_to_bins(genes, geneToBin) write_segment_infos(new_bins, filename=segment_info_file) print "New SEGMENTINFO written to ", segment_info_file write_gene_positions(new_bins) print "New segment positions appended to gene_positions.txt" # Update to the new mutation matrix. geneToBinSet, bin_setToBin = bin_sets_from_geneToBin(genes, geneToBin) newGeneToCases, newPatientToGenes = update_geneToCases_patientToGenes( geneToCases, patientToGenes, bin_setToBin, at_least_half=True) gene_bin_entries = get_gene_bin_entries(geneToCases, newGeneToCases, geneToBinSet, bin_setToBin) if gene_bin_entries_file: met.writeanydict(gene_bin_entries, gene_bin_entries_file) print "Gene bin entries written to ", gene_bin_entries_file # Write the new mutation matrix out. writemutationmatrix(newPatientToGenes, filename=newmutationmatrix)
def main(): mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2' patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst' cpairfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Binomial/OV_broad-cna-jl-cpairs-min_cohort.txt' partitionfile = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf' load_partitions = True do_min_cohort = True geneFile = None minFreq = 0 test_minFreq = 100 compute_mutex = True include_cohort_info = False num_cohorts_list = [1,3, 5, 7] numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq) print "number of genes is ", numGenes if do_min_cohort: cohort_dict, clusterToProp, min_cohort = partition.load_patient_cohorts(partitionfile, patientToGenes) min_cohort_genes = set.union(*(patientToGenes[p] for p in min_cohort)) print "getting pairs" genepairs = met.getgenepairs(geneToCases, min_cohort_genes, test_minFreq=test_minFreq) print "Number of pairs ", len(genepairs) print "Normal cooccur test" t = time.time() cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex) print "Normal cooccur done in ", time.time() - t print "Beginning cohorts" t = time.time() cpairsdict = add_BinomP_min_cohort_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict, min_cohort) print "Cohorts done in ", time.time() - t else: genepairs = met.getgenepairs(geneToCases, genes, test_minFreq=test_minFreq) print "Number of pairs ", len(genepairs) print "Normal cooccur test" cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex) # print "Add binomial probability" # cpairsdict = add_BinomP_all_pairs(cpairsdict, geneToCases, patientToGenes) # undo print "Beginning cohorts" if load_partitions: cohort_dict = partition.load_patient_cohorts(partitionfile) cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict) else: for num_cohorts in num_cohorts_list: # get cohorts cohort_dict = generate_patient_cohorts(patientToGenes, num_cohorts) cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict) if include_cohort_info: cpairsdict = add_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict) print "Writing to file..." met.writeanydict(cpairsdict, cpairfile)
def main(): mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2' patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst' cpairfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Binomial/OV_broad-cna-jl-cpairs-min_cohort.txt' partitionfile = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf' load_partitions = True do_min_cohort = True geneFile = None minFreq = 0 test_minFreq = 100 compute_mutex = True include_cohort_info = False num_cohorts_list = [1, 3, 5, 7] numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data( mutationmatrix, patientFile, geneFile, minFreq) print "number of genes is ", numGenes if do_min_cohort: cohort_dict, clusterToProp, min_cohort = partition.load_patient_cohorts( partitionfile, patientToGenes) min_cohort_genes = set.union(*(patientToGenes[p] for p in min_cohort)) print "getting pairs" genepairs = met.getgenepairs(geneToCases, min_cohort_genes, test_minFreq=test_minFreq) print "Number of pairs ", len(genepairs) print "Normal cooccur test" t = time.time() cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex) print "Normal cooccur done in ", time.time() - t print "Beginning cohorts" t = time.time() cpairsdict = add_BinomP_min_cohort_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict, min_cohort) print "Cohorts done in ", time.time() - t else: genepairs = met.getgenepairs(geneToCases, genes, test_minFreq=test_minFreq) print "Number of pairs ", len(genepairs) print "Normal cooccur test" cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex) # print "Add binomial probability" # cpairsdict = add_BinomP_all_pairs(cpairsdict, geneToCases, patientToGenes) # undo print "Beginning cohorts" if load_partitions: cohort_dict = partition.load_patient_cohorts(partitionfile) cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict) else: for num_cohorts in num_cohorts_list: # get cohorts cohort_dict = generate_patient_cohorts(patientToGenes, num_cohorts) cpairsdict = add_BinomP_cohorts_all_pairs( cpairsdict, geneToCases, patientToGenes, cohort_dict) if include_cohort_info: cpairsdict = add_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict) print "Writing to file..." met.writeanydict(cpairsdict, cpairfile)