Example #1
0
def run(args):
    """Recommend HotNet delta values and write them out as JSON.

    Loads the influence matrix, its index and the heat scores named by args,
    obtains deltas from permuted data, and then tries multiples 1..10 of the
    median delta (taken from deltas[MIN_CC_SIZE]) until the largest connected
    component in the real data has at most MAX_CC_SIZE members. That multiple
    and the next four multiples are reported as "recommended_deltas".

    Arguments:
    args -- parsed command-line arguments. The attributes read here are
            infmat_file, infmat_name, infmat_index_file, heat_file, perm_type,
            permutation_genes_file, num_permutations, parallel and output_file.
            If output_file is empty/None the JSON is written to stdout.

    Raises:
    ValueError -- if args.perm_type is neither "heat" nor "mutations".

    """
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        addtl_genes = hnio.load_genes(args.permutation_genes_file) if args.permutation_genes_file else None
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.parallel)
    elif args.perm_type == "mutations":
        deltas = get_deltas_for_mutations(args, infmat, infmat_index, heat_params)
    else:
        raise ValueError("Invalid mutation permutation type: %s" % args.perm_type)

    #find the multiple of the median delta s.t. the size of the largest CC in the real data
    #is <= MAX_CC_SIZE
    median_delta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    for multiple in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, multiple * median_delta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break
    #if no multiple met the bound, `multiple` is left at 10, the last value tried

    #and recommend running HotNet with that multiple and the next 4 multiples
    recommended_deltas = [k * median_delta for k in range(multiple, multiple + 5)]

    output_file = open(args.output_file, 'w') if args.output_file else sys.stdout
    try:
        json.dump({"parameters": vars(args), "heat_parameters": heat_params,
                   "recommended_deltas": recommended_deltas}, output_file, indent=4)
    finally:
        #close only a file opened here, even if serialization fails; never close stdout
        if args.output_file:
            output_file.close()
Example #2
0
def run(args):
    """Draw one mutation-matrix SVG per subnetwork into args.output_directory.

    For every subnetwork returned by get_subnetworks, the mutation data for
    its genes is written to <output_directory>/data.json, the external
    `node drawMutationMatrix.js` script is run on it, and the resulting
    mutation_matrix.svg is renamed to mutation_matrix_<index>.svg. The
    temporary data.json and styles.json files are removed at the end.

    Arguments:
    args -- parsed command-line arguments. The attributes read here are
            output_directory, sample_file, gene_file, snv_file, cna_file,
            inactivating_snvs_file, fusions_file, type_file, subnetworks_file,
            style_file, color_file and width.

    """
    # Imported locally so this function does not rely on a module-level import.
    import subprocess

    out_dir = args.output_directory

    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if len(os.listdir(out_dir)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    samples = hnio.load_samples(args.sample_file) if args.sample_file else None
    genes = hnio.load_genes(args.gene_file) if args.gene_file else None
    snvs = hnio.load_snvs(args.snv_file, genes, samples) if args.snv_file else []
    cnas = hnio.load_cnas(args.cna_file, genes, samples) if args.cna_file else []
    inactivating_snvs = hnio.load_inactivating_snvs(args.inactivating_snvs_file, genes, samples) \
                            if args.inactivating_snvs_file else []
    fusions = hnio.load_fusions(args.fusions_file, genes, samples) if args.fusions_file else []
    sample2type = hnio.load_sample_types(args.type_file) if args.type_file else None
    subnetworks = get_subnetworks(args.subnetworks_file)

    data_path = '%s/data.json' % out_dir
    styles_path = '%s/styles.json' % out_dir
    write_style_file(args.style_file, args.color_file, styles_path)

    for subnetwork in subnetworks:
        # Single-argument call form prints identically on Python 2 and Python 3.
        print("Generating mutation matrix for subnetwork %s" % subnetwork.index)
        data = get_data_for_cc(subnetwork.genes, snvs, cnas, inactivating_snvs, fusions, sample2type)

        # Close (and therefore flush) the file before the external process reads it.
        with open(data_path, 'w') as data_file:
            json.dump(data, data_file, indent=4)

        # Argument list instead of a shell string, so paths containing spaces or
        # shell metacharacters are passed through verbatim. As before, the exit
        # status is not inspected; a failed run surfaces in os.rename below.
        subprocess.call(['node', 'drawMutationMatrix.js',
                         '--json=%s' % data_path,
                         '--outdir=%s' % out_dir,
                         '--width=%s' % args.width,
                         '--style=%s' % styles_path])
        os.rename('%s/mutation_matrix.svg' % out_dir,
                  '%s/mutation_matrix_%s.svg' % (out_dir, subnetwork.index))

    # data.json is only created inside the loop, so it is absent when there are
    # no subnetworks.
    if os.path.exists(data_path):
        os.remove(data_path)
    os.remove(styles_path)
Example #3
0
def heat_permutation_significance(args, heat, infmat, infmat_index, G):
    """Compute statistical significance using permuted heat scores.

    Permutes `heat` args.num_permutations times via permutations.permute_heat
    (passing along the optional additional genes and args.parallel) and hands
    the permutations to calculate_significance, whose result is returned.

    Arguments:
    args -- parsed arguments; permutation_genes_file, num_permutations and
            parallel are read here, and args is forwarded unchanged.
    heat -- heat scores to permute
    infmat, infmat_index, G -- forwarded unchanged to calculate_significance

    """
    print("* Performing permuted heat statistical significance...")

    # The additional-genes file is optional; without it None is passed on.
    if args.permutation_genes_file:
        extra_genes = hnio.load_genes(args.permutation_genes_file)
    else:
        extra_genes = None

    permuted_heats = permutations.permute_heat(
        heat, args.num_permutations, extra_genes, args.parallel)
    return calculate_significance(
        args, infmat, infmat_index, G, permuted_heats)
Example #4
0
def load_mutation_heat(args):
    """Compute heat scores from SNV and CNA mutation data.

    Arguments:
    args -- parsed arguments; sample_file, gene_file, snv_file, cna_file,
            cna_filter_threshold and min_freq are read here. sample_file and
            gene_file are optional.

    Returns a tuple (heat, None), where heat is the result of hnheat.mut_heat.

    """
    samples = None
    if args.sample_file:
        samples = hnio.load_samples(args.sample_file)

    genes = None
    if args.gene_file:
        genes = hnio.load_genes(args.gene_file)

    snvs = hnio.load_snvs(args.snv_file, genes, samples)
    cnas = hnio.load_cnas(args.cna_file, genes, samples)

    # CNA filtering is skipped when no threshold is given.
    if args.cna_filter_threshold:
        cnas = hnheat.filter_cnas(cnas, args.cna_filter_threshold)

    # With no explicit sample list, use every sample seen in the mutation data.
    if not samples:
        samples = set(mutation.sample for mutation in snvs + cnas)

    num_samples = len(samples)
    return hnheat.mut_heat(num_samples, snvs, cnas, args.min_freq), None
Example #5
0
def load_mutation_heat(args):
    """Build gene heat scores from the SNV and CNA files named in args.

    Arguments:
    args -- parsed arguments. Reads sample_file and gene_file (both optional),
            snv_file, cna_file, cna_filter_threshold and min_freq.

    Returns the pair (heat, None); heat comes from hnheat.mut_heat.

    """
    if args.sample_file:
        samples = hnio.load_samples(args.sample_file)
    else:
        samples = None

    if args.gene_file:
        genes = hnio.load_genes(args.gene_file)
    else:
        genes = None

    snvs = hnio.load_snvs(args.snv_file, genes, samples)
    cnas = hnio.load_cnas(args.cna_file, genes, samples)

    cna_threshold = args.cna_filter_threshold
    if cna_threshold:
        cnas = hnheat.filter_cnas(cnas, cna_threshold)

    # Fall back to the samples that actually appear in the mutation data.
    if not samples:
        all_mutations = snvs + cnas
        samples = set([m.sample for m in all_mutations])

    heat = hnheat.mut_heat(len(samples), snvs, cnas, args.min_freq)
    return heat, None
Example #6
0
def load_direct_heat(args):
    """Load gene heat scores from a TSV file and apply the configured filters.

    The scores are first passed through hnheat.filter_heat with
    args.min_heat_score and then, if args.gene_filter_file is given, through
    hnheat.expr_filter_heat with the genes loaded from that file.

    Side effect: args.min_heat_score is overwritten with the third value
    returned by hnheat.filter_heat.

    Arguments:
    args -- parsed arguments; heat_file, min_heat_score and gene_filter_file
            are read here.

    Returns a tuple (heat, excluded_genes) where excluded_genes is the
    de-duplicated list of genes reported as excluded by either filter.

    Raises:
    ValueError -- if any remaining gene has a negative heat score.

    """
    heat = hnio.load_heat_tsv(args.heat_file)
    heat, score_excluded_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    filter_excluded_genes = []
    if args.gene_filter_file:
        heat, filter_excluded_genes = hnheat.expr_filter_heat(heat,
                                                              hnio.load_genes(args.gene_filter_file))

    #ensure that all heat scores are non-negative
    bad_genes = [gene for gene in heat if heat[gene] < 0]
    if bad_genes:
        #adjacent literals are joined at compile time, so the message has no embedded indentation
        raise ValueError("ERROR: All gene heat scores must be non-negative. There are %s genes with "
                         "negative heat scores: %s" % (len(bad_genes), bad_genes))

    return heat, list(set(score_excluded_genes + filter_excluded_genes))
Example #7
0
def run(args):
    """Select recommended HotNet deltas and emit them as JSON.

    Deltas are computed from permuted data (heat or mutation permutation,
    chosen by args.perm_type). The median of deltas[MIN_CC_SIZE] is then
    scaled by 1, 2, ..., 10 until the real data's largest connected component
    has at most MAX_CC_SIZE members; that multiple and the following four are
    written as "recommended_deltas" along with the run parameters.

    Arguments:
    args -- parsed command-line arguments. The attributes read here are
            infmat_file, infmat_name, infmat_index_file, heat_file, perm_type,
            permutation_genes_file, num_permutations, parallel and output_file.
            Output goes to stdout when output_file is empty/None.

    Raises:
    ValueError -- if args.perm_type is neither "heat" nor "mutations".

    """
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        addtl_genes = hnio.load_genes(
            args.permutation_genes_file
        ) if args.permutation_genes_file else None
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.parallel)
    elif args.perm_type == "mutations":
        deltas = get_deltas_for_mutations(args, infmat, infmat_index,
                                          heat_params)
    else:
        raise ValueError("Invalid mutation permutation type: %s" %
                         args.perm_type)

    #find the multiple of the median delta s.t. the size of the largest CC in the real data
    #is <= MAX_CC_SIZE
    median_delta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    for multiple in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, multiple * median_delta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break
    #when the bound is never met, `multiple` stays at 10 (the last value tried)

    #and recommend running HotNet with that multiple and the next 4 multiples
    recommended_deltas = [
        k * median_delta for k in range(multiple, multiple + 5)
    ]

    output_file = open(args.output_file,
                       'w') if args.output_file else sys.stdout
    try:
        json.dump(
            {
                "parameters": vars(args),
                "heat_parameters": heat_params,
                "recommended_deltas": recommended_deltas
            },
            output_file,
            indent=4)
    finally:
        #release the handle even if serialization fails; stdout is left open
        if args.output_file:
            output_file.close()
Example #8
0
def load_direct_heat(args):
    """Read heat scores from args.heat_file and filter them.

    Applies hnheat.filter_heat with args.min_heat_score, and additionally
    hnheat.expr_filter_heat when args.gene_filter_file is set.

    Side effect: args.min_heat_score is replaced by the third value that
    hnheat.filter_heat returns.

    Arguments:
    args -- parsed arguments; heat_file, min_heat_score and gene_filter_file
            are read here.

    Returns a tuple (heat, excluded_genes); excluded_genes is the list of
    distinct genes reported as excluded by either filter.

    Raises:
    ValueError -- if a gene with a negative heat score remains after filtering.

    """
    heat = hnio.load_heat_tsv(args.heat_file)
    heat, score_excluded_genes, args.min_heat_score = hnheat.filter_heat(
        heat, args.min_heat_score)

    filter_excluded_genes = []
    if args.gene_filter_file:
        heat, filter_excluded_genes = hnheat.expr_filter_heat(
            heat, hnio.load_genes(args.gene_filter_file))

    #ensure that all heat scores are non-negative
    bad_genes = [gene for gene in heat if heat[gene] < 0]
    if bad_genes:
        #implicit literal concatenation keeps source indentation out of the message
        raise ValueError(
            "ERROR: All gene heat scores must be non-negative. There are %s genes with "
            "negative heat scores: %s" % (len(bad_genes), bad_genes))

    return heat, list(set(score_excluded_genes + filter_excluded_genes))
Example #9
0
def run(args):
    """Annotate HotNet output components with mutation counts and write a TSV.

    Reads the HotNet output JSON named by args.hotnet_output_json, reloads the
    mutation data referenced by its "heat_parameters", and for every component
    builds a row whose first cell is "<#mutated samples>(<percent>%)" followed
    by one "<gene>(<#samples mutated in gene>)" cell per gene. The rows are
    written to ANNOTATION_TSV inside the run's recorded output directory.

    Arguments:
    args -- parsed arguments; only hotnet_output_json is read here.

    Raises:
    ValueError -- if the heat scores were not produced by "load_mutation_heat".

    """
    # Context manager closes the file even if the JSON fails to parse.
    with open(args.hotnet_output_json) as output_f:
        output_blob = json.load(output_f)

    heat_parameters = output_blob["heat_parameters"]

    if heat_parameters["heat_fn"] != "load_mutation_heat":
        raise ValueError(
            "Heat scores must have been calculated from mutation data to annotate output."
        )

    components = output_blob["components"]

    # The gene and sample files are optional when heat is computed, so only
    # load them when a path was actually recorded.
    gene_file = heat_parameters["gene_file"]
    sample_file = heat_parameters["sample_file"]
    genes = hnio.load_genes(gene_file) if gene_file else None
    samples = hnio.load_samples(sample_file) if sample_file else None
    snvs = hnio.load_snvs(heat_parameters["snv_file"], genes, samples)
    cnas = hnio.load_cnas(heat_parameters["cna_file"], genes, samples)

    if not samples:
        samples = set([snv.sample
                       for snv in snvs] + [cna.sample for cna in cnas])
    num_samples = len(samples)

    gene2mutsam = defaultdict(set)
    for mut in snvs + cnas:
        gene2mutsam[mut.gene].add(mut.sample)

    annotated_ccs = list()
    for component in components:
        annotated_cc = list()
        annotated_ccs.append(annotated_cc)
        cc_mutated_samples = set()
        for gene in component:
            cc_mutated_samples.update(gene2mutsam[gene])
            annotated_cc.append("%s(%s)" % (gene, len(gene2mutsam[gene])))

        # With no samples at all there is nothing to divide by; report 0%.
        if num_samples:
            percent_mutated = len(cc_mutated_samples) / float(num_samples) * 100
        else:
            percent_mutated = 0.0
        annotated_cc.insert(
            0, "%s(%s%%)" % (len(cc_mutated_samples), percent_mutated))

    output_directory = output_blob["parameters"]["output_directory"]
    output_file = os.path.abspath(output_directory) + "/" + ANNOTATION_TSV
    hnio.write_components_as_tsv(output_file, annotated_ccs)
Example #10
0
def run(args):
    """Write a TSV annotating each HotNet component with its mutation counts.

    The HotNet output JSON at args.hotnet_output_json supplies the components
    and the "heat_parameters" used to reload the SNV/CNA data. Each output row
    starts with "<#mutated samples>(<percent>%)" for the component and then
    lists "<gene>(<#samples mutated in gene>)" for every gene in it. The file
    is written as ANNOTATION_TSV in the output directory recorded in the JSON.

    Arguments:
    args -- parsed arguments; only hotnet_output_json is read here.

    Raises:
    ValueError -- if the heat scores were not produced by "load_mutation_heat".

    """
    # The with-block guarantees the handle is released if parsing fails.
    with open(args.hotnet_output_json) as output_f:
        output_blob = json.load(output_f)

    heat_parameters = output_blob["heat_parameters"]

    if heat_parameters["heat_fn"] != "load_mutation_heat":
        raise ValueError("Heat scores must have been calculated from mutation data to annotate output.")

    components = output_blob["components"]

    # Gene and sample files are optional inputs to the heat calculation, so
    # load each one only when a path was recorded.
    gene_file = heat_parameters["gene_file"]
    sample_file = heat_parameters["sample_file"]
    genes = hnio.load_genes(gene_file) if gene_file else None
    samples = hnio.load_samples(sample_file) if sample_file else None
    snvs = hnio.load_snvs(heat_parameters["snv_file"], genes, samples)
    cnas = hnio.load_cnas(heat_parameters["cna_file"], genes, samples)

    if not samples:
        samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas])
    num_samples = len(samples)

    gene2mutsam = defaultdict(set)
    for mut in snvs + cnas:
        gene2mutsam[mut.gene].add(mut.sample)

    annotated_ccs = list()
    for component in components:
        annotated_cc = list()
        annotated_ccs.append(annotated_cc)
        cc_mutated_samples = set()
        for gene in component:
            cc_mutated_samples.update(gene2mutsam[gene])
            annotated_cc.append("%s(%s)" % (gene, len(gene2mutsam[gene])))

        # Avoid dividing by zero when no samples are known; report 0% instead.
        if num_samples:
            percent_mutated = len(cc_mutated_samples) / float(num_samples) * 100
        else:
            percent_mutated = 0.0
        annotated_cc.insert(0, "%s(%s%%)" % (len(cc_mutated_samples), percent_mutated))

    output_directory = output_blob["parameters"]["output_directory"]
    output_file = os.path.abspath(output_directory) + "/" + ANNOTATION_TSV
    hnio.write_components_as_tsv(output_file, annotated_ccs)
Example #11
0
def generate_mutation_permutation_heat(heat_fn,
                                       sample_file,
                                       gene_file,
                                       genes_in_network,
                                       snv_file,
                                       gene_length_file,
                                       bmr,
                                       bmr_file,
                                       cna_file,
                                       gene_order_file,
                                       cna_filter_threshold,
                                       min_freq,
                                       num_permutations,
                                       num_cores=1):
    """Return a list of num_permutation dicts, each mapping gene names to heat scores calculated
    from permuted mutation data.
    
    Arguments:
    heat_fn -- the name of the function used to calculate heat scores for the real data.
               A RuntimeError is raised if this is not "load_mutation_heat"
    sample_file -- path to TSV file containing tested sample IDs as the first column. If None, the
                   set of samples is assumed to be all samples that are provided in the SNV or CNA data.
    gene_file -- path to file containing names of tested genes, one per line. If None, the set of
                 tested genes is assumed to be all genes that have mutations in either the SNV or
                 CNA data.
    genes_in_network -- iterable of names of genes in the PPI network
    snv_file -- path to TSV file containing SNVs where the first column of each line is a sample ID
                and subsequent columns contain the names of SNVs with mutations in that sample.
    gene_length_file -- path to TSV file containing gene names in the first column and the length
                        of the gene in base pairs in the second column
    bmr -- default background mutation rate
    bmr_file -- path to TSV file with gene names in the first column and the background mutation rate
                for the gene in the second column. The default BMR will be used for any gene without
                a BMR in this file. If None, the default BMR will be used for all genes.
    cna_file -- path to TSV file containing CNAs where the first column of each line is a sample ID
                and subsequent columns contain gene names followed by "(A)" or "(D)" indicating an
                amplification or deletion in that gene for the sample. Lines starting with '#'
                will be ignored.
    gene_order_file -- path to file containing tab-separated lists of genes on each chromosome,
                       one chromosome per line
    cna_filter_threshold -- proportion of CNAs in a gene across samples that must share the same
                            CNA type in order for the CNAs to be included. Must be > .5 if not None.
                            If None, all CNAs will be included.
    min_freq -- the minimum number of samples in which a gene must have an SNV to be considered
                mutated in the heat score calculation.
    num_permutations -- the number of permuted data sets to be generated
    num_cores -- number of cores to use for running in parallel. 1 runs serially in this process;
                 -1 creates the pool with its default number of workers.
    
    """
    if heat_fn != "load_mutation_heat":
        # Implicit literal concatenation keeps source indentation out of the message.
        raise RuntimeError(
            "Heat scores must be based on mutation data to perform "
            "delta selection based on mutation data permutation.")

    samples = hnio.load_samples(sample_file) if sample_file else None
    genes = hnio.load_genes(gene_file) if gene_file else None
    cnas = hnio.load_cnas(cna_file, genes, samples)
    gene2length = hnio.load_gene_lengths(gene_length_file)
    gene2bmr = hnio.load_gene_specific_bmrs(bmr_file) if bmr_file else {}
    gene2chromo, chromo2genes = hnio.load_gene_order(gene_order_file)

    # The SNVs are needed to infer whichever of the sample/gene sets was not
    # supplied, so load them when either is missing (not only for samples).
    if not samples or not genes:
        snvs = hnio.load_snvs(snv_file, genes, samples)
        if not samples:
            samples = set([snv.sample
                           for snv in snvs] + [cna.sample for cna in cnas])
        if not genes:
            genes = set([snv.gene for snv in snvs] + [cna.gene for cna in cnas])

    #only generate mutations for genes that are in the network
    genes = set(genes).intersection(genes_in_network)

    # Every permutation receives the same argument tuple.
    args = [(samples, genes, cnas, gene2length, bmr, gene2bmr, gene2chromo,
             chromo2genes, cna_filter_threshold, min_freq)] * num_permutations

    if num_cores != 1:
        pool = mp.Pool(None if num_cores == -1 else num_cores)
        try:
            heat_permutations = pool.map(mutation_permuation_heat_wrapper, args)
        finally:
            # Shut the workers down even if a permutation raised.
            pool.close()
            pool.join()
    else:
        # list() so a list is returned on Python 3 too, where map is lazy.
        heat_permutations = list(map(mutation_permuation_heat_wrapper, args))

    return heat_permutations
Example #12
0
def generate_mutation_permutation_heat(heat_fn, sample_file, gene_file, genes_in_network, snv_file,
                                       gene_length_file, bmr, bmr_file, cna_file, gene_order_file,
                                       cna_filter_threshold, min_freq, num_permutations, parallel=True):
    """Return a list of num_permutation dicts, each mapping gene names to heat scores calculated
    from permuted mutation data.
    
    Arguments:
    heat_fn -- the name of the function used to calculate heat scores for the real data.
               A RuntimeError is raised if this is not "load_mutation_heat"
    sample_file -- path to TSV file containing tested sample IDs as the first column. If None, the
                   set of samples is assumed to be all samples that are provided in the SNV or CNA data.
    gene_file -- path to file containing names of tested genes, one per line. If None, the set of
                 tested genes is assumed to be all genes that have mutations in either the SNV or
                 CNA data.
    genes_in_network -- iterable of names of genes in the PPI network
    snv_file -- path to TSV file containing SNVs where the first column of each line is a sample ID
                and subsequent columns contain the names of SNVs with mutations in that sample.
    gene_length_file -- path to TSV file containing gene names in the first column and the length
                        of the gene in base pairs in the second column
    bmr -- default background mutation rate
    bmr_file -- path to TSV file with gene names in the first column and the background mutation rate
                for the gene in the second column. The default BMR will be used for any gene without
                a BMR in this file. If None, the default BMR will be used for all genes.
    cna_file -- path to TSV file containing CNAs where the first column of each line is a sample ID
                and subsequent columns contain gene names followed by "(A)" or "(D)" indicating an
                amplification or deletion in that gene for the sample. Lines starting with '#'
                will be ignored.
    gene_order_file -- path to file containing tab-separated lists of genes on each chromosome,
                       one chromosome per line
    cna_filter_threshold -- proportion of CNAs in a gene across samples that must share the same
                            CNA type in order for the CNAs to be included. Must be > .5 if not None.
                            If None, all CNAs will be included.
    min_freq -- the minimum number of samples in which a gene must have an SNV to be considered
                mutated in the heat score calculation.
    num_permutations -- the number of permuted data sets to be generated
    parallel -- whether to generate permuted data sets in parallel
    
    """
    if heat_fn != "load_mutation_heat":
        # Implicit literal concatenation keeps source indentation out of the message.
        raise RuntimeError("Heat scores must be based on mutation data to perform "
                           "delta selection based on mutation data permutation.")

    samples = hnio.load_samples(sample_file) if sample_file else None
    genes = hnio.load_genes(gene_file) if gene_file else None
    cnas = hnio.load_cnas(cna_file, genes, samples)
    gene2length = hnio.load_gene_lengths(gene_length_file)
    gene2bmr = hnio.load_gene_specific_bmrs(bmr_file) if bmr_file else {}
    gene2chromo, chromo2genes = hnio.load_gene_order(gene_order_file)

    # The SNVs are needed to infer whichever of the sample/gene sets was not
    # supplied, so load them when either is missing (not only for samples).
    if not samples or not genes:
        snvs = hnio.load_snvs(snv_file, genes, samples)
        if not samples:
            samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas])
        if not genes:
            genes = set([snv.gene for snv in snvs] + [cna.gene for cna in cnas])

    #only generate mutations for genes that are in the network
    genes = set(genes).intersection(genes_in_network)

    # Every permutation receives the same argument tuple.
    args = [(samples, genes, cnas, gene2length, bmr, gene2bmr, gene2chromo, chromo2genes,
             cna_filter_threshold, min_freq)] * num_permutations

    if parallel:
        pool = mp.Pool()
        try:
            heat_permutations = pool.map(mutation_permuation_heat_wrapper, args)
        finally:
            # Shut the workers down even if a permutation raised.
            pool.close()
            pool.join()
    else:
        # list() so a list is returned on Python 3 too, where map is lazy.
        heat_permutations = list(map(mutation_permuation_heat_wrapper, args))

    return heat_permutations
Example #13
0
def heat_permutation_significance(args, heat, infmat, infmat_index, G):
    """Compute statistical significance from permuted heat scores.

    Generates args.num_permutations permutations of `heat` with
    permutations.permute_heat and returns the result of passing them to
    calculate_significance.

    Arguments:
    args -- parsed arguments; permutation_genes_file (optional),
            num_permutations and parallel are read here, and args is forwarded.
    heat -- heat scores to permute
    infmat, infmat_index, G -- forwarded unchanged to calculate_significance

    """
    # Single-argument call form: valid and prints identically on Python 2 and 3.
    print("* Performing permuted heat statistical significance...")

    addtl_genes = hnio.load_genes(args.permutation_genes_file) if args.permutation_genes_file else None
    heat_permutations = permutations.permute_heat(heat, args.num_permutations, addtl_genes, args.parallel)
    return calculate_significance(args, infmat, infmat_index, G, heat_permutations)