def run(args):
    """Recommend delta values and write them out as JSON.

    Reads the matrix stored under ``args.infmat_name`` in the MAT-file
    ``args.infmat_file``, its index and the heat JSON named by ``args``, then
    obtains permutation-derived deltas via ``get_deltas_for_heat`` or
    ``get_deltas_for_mutations`` depending on ``args.perm_type``.  The median
    of ``deltas[MIN_CC_SIZE]`` is scaled by the multiples 1..10 until the
    largest connected component of the real data has size <= ``MAX_CC_SIZE``;
    that multiple and the next four are recommended.

    The result (parameters, heat parameters and recommended deltas) is dumped
    as JSON to ``args.output_file`` or, when that is unset, to standard output.

    Raises:
        ValueError: if ``args.perm_type`` is neither "heat" nor "mutations".
    """
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        addtl_genes = (hnio.load_genes(args.permutation_genes_file)
                       if args.permutation_genes_file else None)
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.parallel)
    elif args.perm_type == "mutations":
        deltas = get_deltas_for_mutations(args, infmat, infmat_index, heat_params)
    else:
        raise ValueError("Invalid mutation permutation type: %s" % args.perm_type)

    # find the multiple of the median delta s.t. the size of the largest CC in
    # the real data is <= MAX_CC_SIZE
    median_delta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # If no multiple in 1..10 satisfies the size bound, the loop simply ends
    # with multiple == 10 and the recommendation starts from there.
    for multiple in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, multiple * median_delta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break

    # and recommend running HotNet with that multiple and the next 4 multiples
    # (a distinct comprehension variable avoids shadowing the loop index)
    recommended_deltas = [k * median_delta for k in range(multiple, multiple + 5)]

    output_file = open(args.output_file, 'w') if args.output_file else sys.stdout
    try:
        json.dump({"parameters": vars(args),
                   "heat_parameters": heat_params,
                   "recommended_deltas": recommended_deltas},
                  output_file, indent=4)
    finally:
        # Only close handles we opened ourselves; never close sys.stdout.
        if args.output_file:
            output_file.close()
def run(args):
    """Draw one mutation-matrix SVG per subnetwork into ``args.output_directory``.

    Creates the output directory if needed (warning when it already contains
    files), loads whichever of the sample, gene, SNV, CNA, inactivating-SNV,
    fusion and sample-type inputs have a path set on ``args``, and writes a
    ``styles.json`` via ``write_style_file``.  For every subnetwork returned
    by ``get_subnetworks`` it writes ``data.json``, runs
    ``node drawMutationMatrix.js`` on it and renames the resulting
    ``mutation_matrix.svg`` to ``mutation_matrix_<index>.svg``.  The temporary
    ``data.json`` and ``styles.json`` are removed at the end.
    """
    # Local import so the block is self-contained; only this function spawns node.
    import subprocess

    out_dir = args.output_directory

    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if len(os.listdir(out_dir)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    samples = hnio.load_samples(args.sample_file) if args.sample_file else None
    genes = hnio.load_genes(args.gene_file) if args.gene_file else None
    snvs = hnio.load_snvs(args.snv_file, genes, samples) if args.snv_file else []
    cnas = hnio.load_cnas(args.cna_file, genes, samples) if args.cna_file else []
    inactivating_snvs = hnio.load_inactivating_snvs(args.inactivating_snvs_file, genes, samples) \
                        if args.inactivating_snvs_file else []
    fusions = hnio.load_fusions(args.fusions_file, genes, samples) if args.fusions_file else []
    sample2type = hnio.load_sample_types(args.type_file) if args.type_file else None

    data_path = '%s/data.json' % out_dir
    style_path = '%s/styles.json' % out_dir

    subnetworks = get_subnetworks(args.subnetworks_file)
    write_style_file(args.style_file, args.color_file, style_path)

    for subnetwork in subnetworks:
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("Generating mutation matrix for subnetwork %s" % subnetwork.index)
        data = get_data_for_cc(subnetwork.genes, snvs, cnas, inactivating_snvs,
                               fusions, sample2type)

        # Close (and therefore flush) the file before node reads it.
        with open(data_path, 'w') as data_file:
            json.dump(data, data_file, indent=4)

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to node verbatim.  As before, the
        # exit status is not inspected; a failed render surfaces as an OSError
        # from the rename below.
        subprocess.call(['node', 'drawMutationMatrix.js',
                         '--json=%s' % data_path,
                         '--outdir=%s' % out_dir,
                         '--width=%s' % args.width,
                         '--style=%s' % style_path])
        os.rename('%s/mutation_matrix.svg' % out_dir,
                  '%s/mutation_matrix_%s.svg' % (out_dir, subnetwork.index))

    # data.json is only created inside the loop, so it is absent when there
    # were no subnetworks.
    if os.path.exists(data_path):
        os.remove(data_path)
    os.remove(style_path)
def heat_permutation_significance(args, heat, infmat, infmat_index, G):
    """Compute significance using permuted heat scores.

    Prints a progress line, optionally loads extra genes from
    ``args.permutation_genes_file``, asks ``permutations.permute_heat`` for
    ``args.num_permutations`` permutations of ``heat`` and returns whatever
    ``calculate_significance`` yields for them.
    """
    print("* Performing permuted heat statistical significance...")

    # Extra genes are only loaded when a file was supplied.
    if args.permutation_genes_file:
        extra_genes = hnio.load_genes(args.permutation_genes_file)
    else:
        extra_genes = None

    permuted_heats = permutations.permute_heat(heat, args.num_permutations,
                                               extra_genes, args.parallel)
    return calculate_significance(args, infmat, infmat_index, G, permuted_heats)
def load_mutation_heat(args):
    """Build heat scores from the SNV and CNA files named on ``args``.

    Sample and gene lists are loaded only when their files are set.  CNAs are
    passed through ``hnheat.filter_cnas`` when ``args.cna_filter_threshold``
    is truthy.  If no samples were loaded, the sample set is taken from the
    samples seen in the SNVs and CNAs.

    Returns a ``(heat, None)`` pair; the second element is always ``None``.
    """
    samples = None
    if args.sample_file:
        samples = hnio.load_samples(args.sample_file)

    genes = None
    if args.gene_file:
        genes = hnio.load_genes(args.gene_file)

    snvs = hnio.load_snvs(args.snv_file, genes, samples)
    cnas = hnio.load_cnas(args.cna_file, genes, samples)

    threshold = args.cna_filter_threshold
    if threshold:
        cnas = hnheat.filter_cnas(cnas, threshold)

    # Fall back to every sample that carries at least one mutation.
    if not samples:
        samples = set(mutation.sample for mutation in snvs + cnas)

    num_samples = len(samples)
    return hnheat.mut_heat(num_samples, snvs, cnas, args.min_freq), None
def load_direct_heat(args):
    """Load heat scores from a TSV file and apply the configured filters.

    The scores from ``args.heat_file`` are passed through
    ``hnheat.filter_heat`` with ``args.min_heat_score`` and, when
    ``args.gene_filter_file`` is set, through ``hnheat.expr_filter_heat``.

    Side effect: ``args.min_heat_score`` is overwritten with the value
    returned by ``hnheat.filter_heat``.

    Returns:
        ``(heat, excluded_genes)`` where ``excluded_genes`` is the
        de-duplicated list of genes dropped by either filter.

    Raises:
        ValueError: if any remaining gene has a negative heat score.
    """
    heat = hnio.load_heat_tsv(args.heat_file)
    heat, score_excluded_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    filter_excluded_genes = []
    if args.gene_filter_file:
        heat, filter_excluded_genes = hnheat.expr_filter_heat(
            heat, hnio.load_genes(args.gene_filter_file))

    # ensure that all heat scores are non-negative
    bad_genes = [gene for gene in heat if heat[gene] < 0]
    if bad_genes:
        # Adjacent literals rather than a backslash continuation inside the
        # string, which would embed the source indentation in the message.
        raise ValueError("ERROR: All gene heat scores must be non-negative. There are %s genes with "
                         "negative heat scores: %s" % (len(bad_genes), bad_genes))

    return heat, list(set(score_excluded_genes + filter_excluded_genes))
def run(args):
    """Pick recommended delta values and emit them as JSON.

    Loads the matrix ``args.infmat_name`` from the MAT-file
    ``args.infmat_file``, the matching index and the heat JSON, then computes
    deltas from permuted data: ``get_deltas_for_heat`` when
    ``args.perm_type == "heat"`` and ``get_deltas_for_mutations`` when it is
    ``"mutations"``.  Starting from the median of ``deltas[MIN_CC_SIZE]``, the
    multiples 1..10 are tried until the largest connected component of the
    real data has size <= ``MAX_CC_SIZE``.  That multiple and the following
    four are reported.

    Output goes to ``args.output_file`` if set, otherwise to standard output.

    Raises:
        ValueError: for any other value of ``args.perm_type``.
    """
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        addtl_genes = hnio.load_genes(
            args.permutation_genes_file
        ) if args.permutation_genes_file else None
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.parallel)
    elif args.perm_type == "mutations":
        deltas = get_deltas_for_mutations(args, infmat, infmat_index,
                                          heat_params)
    else:
        raise ValueError("Invalid mutation permutation type: %s" %
                         args.perm_type)

    # find the multiple of the median delta s.t. the size of the largest CC in
    # the real data is <= MAX_CC_SIZE
    median_delta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index,
                                     sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # When none of the ten multiples meets the bound the loop runs to
    # completion and first_multiple stays at 10.
    for first_multiple in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, first_multiple * median_delta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break

    # and recommend running HotNet with that multiple and the next 4 multiples;
    # the comprehension uses its own variable instead of reusing the loop index
    recommended_deltas = [
        factor * median_delta
        for factor in range(first_multiple, first_multiple + 5)
    ]

    result = {
        "parameters": vars(args),
        "heat_parameters": heat_params,
        "recommended_deltas": recommended_deltas
    }

    output_file = open(args.output_file, 'w') if args.output_file else sys.stdout
    try:
        json.dump(result, output_file, indent=4)
    finally:
        # Close the file even if serialisation fails, but leave stdout open.
        if args.output_file:
            output_file.close()
def load_direct_heat(args):
    """Read heat scores from ``args.heat_file`` and filter them.

    Scores are filtered by ``hnheat.filter_heat`` using
    ``args.min_heat_score`` and, if ``args.gene_filter_file`` is set, by
    ``hnheat.expr_filter_heat`` using the genes loaded from that file.

    Side effect: ``args.min_heat_score`` is replaced by the value that
    ``hnheat.filter_heat`` returns.

    Returns:
        A ``(heat, excluded_genes)`` tuple; ``excluded_genes`` is a list with
        duplicates removed, combining the genes excluded by both filters.

    Raises:
        ValueError: if a gene that survived filtering has a negative score.
    """
    heat = hnio.load_heat_tsv(args.heat_file)
    heat, score_excluded_genes, args.min_heat_score = hnheat.filter_heat(
        heat, args.min_heat_score)

    filter_excluded_genes = []
    if args.gene_filter_file:
        heat, filter_excluded_genes = hnheat.expr_filter_heat(
            heat, hnio.load_genes(args.gene_filter_file))

    # ensure that all heat scores are non-negative
    bad_genes = [gene for gene in heat if heat[gene] < 0]
    if bad_genes:
        # Implicit literal concatenation keeps the message on one logical line;
        # a backslash continuation inside the string would pull the next
        # line's indentation into the text.
        raise ValueError(
            "ERROR: All gene heat scores must be non-negative. There are %s genes with "
            "negative heat scores: %s" % (len(bad_genes), bad_genes))

    return heat, list(set(score_excluded_genes + filter_excluded_genes))
def run(args):
    """Annotate HotNet components with mutation counts and write them as TSV.

    Reads the JSON at ``args.hotnet_output_json``, reloads the mutation data
    named in its ``heat_parameters`` and, for every component, builds a row
    whose first cell is ``"<mutated samples>(<percentage>%)"`` followed by one
    ``"<gene>(<mutated samples in gene>)"`` cell per gene.  Rows are written
    with ``hnio.write_components_as_tsv`` to ``ANNOTATION_TSV`` inside the
    ``output_directory`` recorded in the JSON's ``parameters``.

    Raises:
        ValueError: if the recorded ``heat_fn`` is not "load_mutation_heat".
    """
    # Context manager closes the handle even when the JSON is malformed.
    with open(args.hotnet_output_json) as output_f:
        output_blob = json.load(output_f)

    heat_parameters = output_blob["heat_parameters"]
    if heat_parameters["heat_fn"] != "load_mutation_heat":
        raise ValueError(
            "Heat scores must have been calculated from mutation data to annotate output."
        )

    components = output_blob["components"]

    # The gene and sample files may be unset in the recorded parameters; only
    # load them when a path is present.
    gene_file = heat_parameters["gene_file"]
    sample_file = heat_parameters["sample_file"]
    genes = hnio.load_genes(gene_file) if gene_file else None
    samples = hnio.load_samples(sample_file) if sample_file else None

    snvs = hnio.load_snvs(heat_parameters["snv_file"], genes, samples)
    cnas = hnio.load_cnas(heat_parameters["cna_file"], genes, samples)
    if not samples:
        samples = set([snv.sample for snv in snvs] +
                      [cna.sample for cna in cnas])
    num_samples = len(samples)

    # gene -> set of samples in which that gene has an SNV or CNA
    gene2mutsam = defaultdict(set)
    for mut in snvs + cnas:
        gene2mutsam[mut.gene].add(mut.sample)

    annotated_ccs = []
    for component in components:
        annotated_cc = []
        cc_mutated_samples = set()
        for gene in component:
            cc_mutated_samples.update(gene2mutsam[gene])
            annotated_cc.append("%s(%s)" % (gene, len(gene2mutsam[gene])))

        num_mutated = len(cc_mutated_samples)
        # Guard the division: with no samples at all the percentage is 0.
        percent = num_mutated / float(num_samples) * 100 if num_samples else 0.0
        annotated_cc.insert(0, "%s(%s%%)" % (num_mutated, percent))
        annotated_ccs.append(annotated_cc)

    output_directory = output_blob["parameters"]["output_directory"]
    output_file = os.path.join(os.path.abspath(output_directory), ANNOTATION_TSV)
    hnio.write_components_as_tsv(output_file, annotated_ccs)
def run(args):
    """Write a TSV of HotNet components annotated with mutation counts.

    The JSON at ``args.hotnet_output_json`` supplies the components, the
    ``heat_parameters`` used to reload the SNV/CNA data and the
    ``output_directory``.  Each output row starts with
    ``"<mutated samples>(<percentage>%)"`` for the component, followed by
    ``"<gene>(<mutated samples in gene>)"`` for each of its genes.  The rows
    are handed to ``hnio.write_components_as_tsv`` with the path
    ``<output_directory>/ANNOTATION_TSV``.

    Raises:
        ValueError: if the recorded ``heat_fn`` is not "load_mutation_heat".
    """
    # ``with`` guarantees the handle is released if json.load raises.
    with open(args.hotnet_output_json) as output_f:
        output_blob = json.load(output_f)

    heat_parameters = output_blob["heat_parameters"]
    if heat_parameters["heat_fn"] != "load_mutation_heat":
        raise ValueError("Heat scores must have been calculated from mutation data to annotate output.")

    components = output_blob["components"]

    # Only call the loaders when a file path was actually recorded.
    gene_file = heat_parameters["gene_file"]
    sample_file = heat_parameters["sample_file"]
    genes = hnio.load_genes(gene_file) if gene_file else None
    samples = hnio.load_samples(sample_file) if sample_file else None

    snvs = hnio.load_snvs(heat_parameters["snv_file"], genes, samples)
    cnas = hnio.load_cnas(heat_parameters["cna_file"], genes, samples)
    if not samples:
        samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas])
    sample_count = len(samples)

    # Map each gene to the samples that carry an SNV or CNA in it.
    gene2mutsam = defaultdict(set)
    for mut in snvs + cnas:
        gene2mutsam[mut.gene].add(mut.sample)

    annotated_ccs = []
    for component in components:
        cc_mutated_samples = set()
        gene_cells = []
        for gene in component:
            gene_samples = gene2mutsam[gene]
            cc_mutated_samples.update(gene_samples)
            gene_cells.append("%s(%s)" % (gene, len(gene_samples)))

        mutated_count = len(cc_mutated_samples)
        # Avoid ZeroDivisionError when there are no samples whatsoever.
        if sample_count:
            percent = mutated_count / float(sample_count) * 100
        else:
            percent = 0.0
        summary_cell = "%s(%s%%)" % (mutated_count, percent)
        annotated_ccs.append([summary_cell] + gene_cells)

    output_directory = output_blob["parameters"]["output_directory"]
    output_file = os.path.join(os.path.abspath(output_directory), ANNOTATION_TSV)
    hnio.write_components_as_tsv(output_file, annotated_ccs)
def generate_mutation_permutation_heat(heat_fn, sample_file, gene_file, genes_in_network, snv_file,
                                       gene_length_file, bmr, bmr_file, cna_file, gene_order_file,
                                       cna_filter_threshold, min_freq, num_permutations, num_cores=1):
    """Return a list of num_permutation dicts, each mapping gene names to heat scores calculated
    from permuted mutation data.

    Arguments:
    heat_fn -- the name of the function used to calculate heat scores for the real data.
               A RuntimeError is raised if this is not "load_mutation_heat"
    sample_file -- path to TSV file containing tested sample IDs as the first column. If None, the
                   set of samples is assumed to be all samples that are provided in the SNV or CNA
                   data.
    gene_file -- path to file containing names of tested genes, one per line. If None, the set of
                 tested genes is assumed to be all genes that have mutations in either the SNV or
                 CNA data.
    genes_in_network -- iterable of names of genes in the PPI network
    snv_file -- path to TSV file containing SNVs where the first column of each line is a sample ID
                and subsequent columns contain the names of SNVs with mutations in that sample.
    gene_length_file -- path to TSV file containing gene names in the first column and the length
                        of the gene in base pairs in the second column
    bmr -- default background mutation rate
    bmr_file -- path to TSV file with gene names in the first column and the background mutation
                rate for the gene in the second column. The default BMR will be used for any gene
                without a BMR in this file. If None, the default BMR will be used for all genes.
    cna_file -- path to TSV file containing CNAs where the first column of each line is a sample ID
                and subsequent columns contain gene names followed by "(A)" or "(D)" indicating an
                amplification or deletion in that gene for the sample. Lines starting with '#'
                will be ignored.
    gene_order_file -- path to file containing tab-separated lists of genes on each chromosome,
                       one chromosome per line
    cna_filter_threshold -- proportion of CNAs in a gene across samples that must share the same
                            CNA type in order for the CNAs to be included. Must be > .5 if not
                            None. If None, all CNAs will be included.
    min_freq -- the minimum number of samples in which a gene must have an SNV to be considered
                mutated in the heat score calculation.
    num_permutations -- the number of permuted data sets to be generated
    num_cores -- number of cores to use for running in parallel. 1 runs serially in this process;
                 -1 creates the pool with its default number of worker processes.

    """
    if heat_fn != "load_mutation_heat":
        # Adjacent literals instead of a backslash continuation inside the string, which would
        # embed the source indentation in the message.
        raise RuntimeError("Heat scores must be based on mutation data to perform "
                           "delta selection based on mutation data permutation.")

    samples = hnio.load_samples(sample_file) if sample_file else None
    genes = hnio.load_genes(gene_file) if gene_file else None
    cnas = hnio.load_cnas(cna_file, genes, samples)
    gene2length = hnio.load_gene_lengths(gene_length_file)
    gene2bmr = hnio.load_gene_specific_bmrs(bmr_file) if bmr_file else {}
    gene2chromo, chromo2genes = hnio.load_gene_order(gene_order_file)

    # The SNVs are needed to infer either missing list, so load them whenever samples OR genes
    # are absent (a sample file without a gene file previously left snvs undefined).
    if not samples or not genes:
        snvs = hnio.load_snvs(snv_file, genes, samples)
        if not samples:
            samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas])
        if not genes:
            genes = set([snv.gene for snv in snvs] + [cna.gene for cna in cnas])

    # only generate mutations for genes that are in the network
    genes = set(genes).intersection(genes_in_network)

    # One identical argument tuple per requested permutation.
    args = [(samples, genes, cnas, gene2length, bmr, gene2bmr, gene2chromo, chromo2genes,
             cna_filter_threshold, min_freq)] * num_permutations

    if num_cores != 1:
        pool = mp.Pool(None if num_cores == -1 else num_cores)
        try:
            heat_permutations = pool.map(mutation_permuation_heat_wrapper, args)
        finally:
            # Release the worker processes even if a permutation fails.
            pool.close()
            pool.join()
    else:
        # list() keeps the documented list return type where map() is lazy.
        heat_permutations = list(map(mutation_permuation_heat_wrapper, args))

    return heat_permutations
def generate_mutation_permutation_heat(heat_fn, sample_file, gene_file, genes_in_network, snv_file,
                                       gene_length_file, bmr, bmr_file, cna_file, gene_order_file,
                                       cna_filter_threshold, min_freq, num_permutations,
                                       parallel=True):
    """Return a list of num_permutation dicts, each mapping gene names to heat scores calculated
    from permuted mutation data.

    Arguments:
    heat_fn -- the name of the function used to calculate heat scores for the real data.
               A RuntimeError is raised if this is not "load_mutation_heat"
    sample_file -- path to TSV file containing tested sample IDs as the first column. If None, the
                   set of samples is assumed to be all samples that are provided in the SNV or CNA
                   data.
    gene_file -- path to file containing names of tested genes, one per line. If None, the set of
                 tested genes is assumed to be all genes that have mutations in either the SNV or
                 CNA data.
    genes_in_network -- iterable of names of genes in the PPI network
    snv_file -- path to TSV file containing SNVs where the first column of each line is a sample ID
                and subsequent columns contain the names of SNVs with mutations in that sample.
    gene_length_file -- path to TSV file containing gene names in the first column and the length
                        of the gene in base pairs in the second column
    bmr -- default background mutation rate
    bmr_file -- path to TSV file with gene names in the first column and the background mutation
                rate for the gene in the second column. The default BMR will be used for any gene
                without a BMR in this file. If None, the default BMR will be used for all genes.
    cna_file -- path to TSV file containing CNAs where the first column of each line is a sample ID
                and subsequent columns contain gene names followed by "(A)" or "(D)" indicating an
                amplification or deletion in that gene for the sample. Lines starting with '#'
                will be ignored.
    gene_order_file -- path to file containing tab-separated lists of genes on each chromosome,
                       one chromosome per line
    cna_filter_threshold -- proportion of CNAs in a gene across samples that must share the same
                            CNA type in order for the CNAs to be included. Must be > .5 if not
                            None. If None, all CNAs will be included.
    min_freq -- the minimum number of samples in which a gene must have an SNV to be considered
                mutated in the heat score calculation.
    num_permutations -- the number of permuted data sets to be generated
    parallel -- whether to generate permuted data sets in parallel

    """
    if heat_fn != "load_mutation_heat":
        # Implicit literal concatenation: a backslash continuation inside the string would carry
        # the next line's indentation into the message.
        raise RuntimeError("Heat scores must be based on mutation data to perform "
                           "delta selection based on mutation data permutation.")

    samples = hnio.load_samples(sample_file) if sample_file else None
    genes = hnio.load_genes(gene_file) if gene_file else None
    cnas = hnio.load_cnas(cna_file, genes, samples)
    gene2length = hnio.load_gene_lengths(gene_length_file)
    gene2bmr = hnio.load_gene_specific_bmrs(bmr_file) if bmr_file else {}
    gene2chromo, chromo2genes = hnio.load_gene_order(gene_order_file)

    # Inferring samples or genes both require the SNVs, so load them when either list is missing
    # (previously they were loaded only when samples were missing).
    if not samples or not genes:
        snvs = hnio.load_snvs(snv_file, genes, samples)
        if not samples:
            samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas])
        if not genes:
            genes = set([snv.gene for snv in snvs] + [cna.gene for cna in cnas])

    # only generate mutations for genes that are in the network
    genes = set(genes).intersection(genes_in_network)

    # Every permutation receives the same argument tuple.
    args = [(samples, genes, cnas, gene2length, bmr, gene2bmr, gene2chromo, chromo2genes,
             cna_filter_threshold, min_freq)] * num_permutations

    if parallel:
        pool = mp.Pool()
        try:
            heat_permutations = pool.map(mutation_permuation_heat_wrapper, args)
        finally:
            # Shut the pool down even when a worker raises.
            pool.close()
            pool.join()
    else:
        # list() keeps the documented list return type where map() is lazy.
        heat_permutations = list(map(mutation_permuation_heat_wrapper, args))

    return heat_permutations
def heat_permutation_significance(args, heat, infmat, infmat_index, G):
    """Compute significance from permuted heat scores.

    Prints a progress line, loads additional genes from
    ``args.permutation_genes_file`` when that is set, generates
    ``args.num_permutations`` heat permutations with
    ``permutations.permute_heat`` and returns the result of
    ``calculate_significance`` for them.
    """
    # Parenthesised single-argument form prints identically on Python 2 and is
    # valid syntax on Python 3, unlike the bare print statement.
    print("* Performing permuted heat statistical significance...")

    addtl_genes = (hnio.load_genes(args.permutation_genes_file)
                   if args.permutation_genes_file else None)
    heat_permutations = permutations.permute_heat(heat, args.num_permutations,
                                                  addtl_genes, args.parallel)
    return calculate_significance(args, infmat, infmat_index, G, heat_permutations)