Example #1
def get_deltas_for_heat(infmat, index2gene, gene2heat, addtl_genes, num_permutations, test_statistic,
                        sizes, classic, num_cores):
    print "* Performing permuted heat delta selection..."
    heat_permutations = permutations.permute_heat(gene2heat, index2gene.values(), num_permutations,
                                                  addtl_genes, num_cores)
    return get_deltas_from_heat_permutations(infmat, index2gene, heat_permutations, test_statistic,
                                             sizes, classic, num_cores)
Example #2
def get_deltas_for_heat(
    infmat, index2gene, gene2heat, addtl_genes, num_permutations, test_statistic, sizes, classic, num_cores
):
    print "* Performing permuted heat delta selection..."
    heat_permutations = permutations.permute_heat(
        gene2heat, index2gene.values(), num_permutations, addtl_genes, num_cores
    )
    return get_deltas_from_heat_permutations(
        infmat, index2gene, heat_permutations, test_statistic, sizes, classic, num_cores
    )
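
Both variants above perform the same two steps: permute the heat scores with permutations.permute_heat, then derive candidate deltas from the permuted heats. The sketch below shows how such a function might be called; it is illustrative only, and the import path, file names, statistic name, and argument values are all assumptions rather than values from the source.

# Hypothetical usage sketch; import path, file names, and argument values are
# assumptions for illustration, not taken from the source.
from hotnet2 import hnio   # assumed package layout

infmat = hnio.load_infmat("influence_matrix.mat", "PPR")       # assumed files and matrix name
index2gene = hnio.load_index("influence_matrix_index.tsv")
gene2heat = hnio.load_heat_tsv("heat_scores.tsv")

deltas = get_deltas_for_heat(
    infmat, index2gene, gene2heat,
    addtl_genes=None,              # no extra genes added to the permutation pool
    num_permutations=100,          # illustrative permutation count
    test_statistic="max_cc_size",  # assumed name of the test statistic
    sizes=[5, 10, 15, 20],         # component-size thresholds to evaluate
    classic=False,
    num_cores=4,
)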
Example #3
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print(
            "WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
            "(Ctrl-c to cancel).")

    # load data
    infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat,
                                           not args.classic)

    # only calculate permuted data sets for significance testing once
    if args.permutation_type != "none":
        if args.permutation_type == "heat":
            print "* Generating heat permutations for statistical significance testing"
            extra_genes = hnio.load_genes(args.permutation_genes_file) \
                            if args.permutation_genes_file else None
            heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                               args.num_permutations,
                                               extra_genes, args.num_cores)
        elif args.permutation_type == "mutations":
            if heat_params["heat_fn"] != "load_mutation_heat":
                raise RuntimeError(
                    "Heat scores must be based on mutation data to perform "
                    "significance testing based on mutation data permutation."
                )
            print("* Generating mutation permutations for statistical significance testing")
            heat_permutations = p.generate_mutation_permutation_heat(
                heat_params["heat_fn"],
                heat_params["sample_file"], heat_params["gene_file"],
                full_index2gene.values(), heat_params["snv_file"],
                args.gene_length_file, args.bmr, args.bmr_file,
                heat_params["cna_file"], args.gene_order_file,
                heat_params["cna_filter_threshold"], heat_params["min_freq"],
                args.num_permutations, args.num_cores)
        elif args.permutation_type == "network":
            pass  #nothing to do right now
        elif args.permutation_type == "precomputed":
            heat_file_paths = [
                args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                for i in range(1, args.num_permutations + 1)
            ]
            heat_permutations = [
                hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths
            ]
        else:
            raise ValueError("Unrecognized permutation type %s" %
                             (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        G = hn.weighted_graph(sim, index2gene, delta, not args.classic)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if args.permutation_type != "none":
            if args.permutation_type == "network":
                sizes2stats = calculate_significance_network(
                    args, args.permuted_networks_path, full_index2gene, G,
                    heat, delta, args.num_permutations)
            else:
                sizes2stats = calculate_significance(args, infmat,
                                                     full_index2gene, G, delta,
                                                     heat_permutations)

        #sort ccs list such that genes within components are sorted alphanumerically, and components
        #are sorted first by length, then alphanumerically by name of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        #write output
        hnio.write_components_as_tsv(
            os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)
        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {
            "parameters": vars(args),
            "heat_parameters": heat_params,
            "sizes": hn.component_sizes(ccs),
            "components": ccs
        }
        if args.permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(
                os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                sizes2stats)

        json_out = open(
            os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
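
The two-pass sort near the end of run relies on Python's stable sort: ordering components alphabetically by their first gene and then re-sorting by length in descending order leaves equal-length components in alphabetical order. A self-contained sketch of the same idiom, using made-up gene lists:

# Stand-alone illustration of the component-sorting idiom used above;
# the gene symbols are made up.
ccs = [["TP53", "MDM2"], ["KRAS", "BRAF", "NRAS"], ["EGFR", "ERBB2"]]

ccs = [sorted(cc) for cc in ccs]      # genes within each component: alphabetical
ccs.sort(key=lambda comp: comp[0])    # components: alphabetical by first gene
ccs.sort(key=len, reverse=True)       # stable sort keeps that order within each size

print(ccs)
# [['BRAF', 'KRAS', 'NRAS'], ['EGFR', 'ERBB2'], ['MDM2', 'TP53']]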
Example #4
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")
    
    # load data
    infmat = np.array(scipy.io.loadmat(args.infmat_file)[args.infmat_name])
    full_index2gene = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)
  
    # compute similarity matrix
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, not args.classic)
    
    # only calculate permuted data sets for significance testing once
    if args.permutation_type != "none":
        if args.permutation_type == "heat":
            print "* Generating heat permutations for statistical significance testing" 
            extra_genes = hnio.load_genes(args.permutation_genes_file) \
                            if args.permutation_genes_file else None
            heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                               args.num_permutations, extra_genes, args.num_cores)
        elif args.permutation_type == "mutations":
            if heat_params["heat_fn"] != "load_mutation_heat":
                raise RuntimeError("Heat scores must be based on mutation data to perform "
                                   "significance testing based on mutation data permutation.")
            print("* Generating mutation permutations for statistical significance testing")
            heat_permutations = p.generate_mutation_permutation_heat(
                                    heat_params["heat_fn"], heat_params["sample_file"],
                                    heat_params["gene_file"], full_index2gene.values(),
                                    heat_params["snv_file"], args.gene_length_file, args.bmr,
                                    args.bmr_file, heat_params["cna_file"], args.gene_order_file,
                                    heat_params["cna_filter_threshold"], heat_params["min_freq"],
                                    args.num_permutations, args.num_cores)
        elif args.permutation_type == "network":
            pass    #nothing to do right now
        elif args.permutation_type == "precomputed":
            heat_file_paths = [args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                               for i in range(1, args.num_permutations+1)]
            heat_permutations = [hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths]
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))
    
    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        
        G = hn.weighted_graph(sim, index2gene, delta, not args.classic)
        ccs = hn.connected_components(G, args.min_cc_size)
        
        # calculate significance
        if args.permutation_type != "none":
            if args.permutation_type == "network":
                sizes2stats = calculate_significance_network(args, args.permuted_networks_path,
                                                             full_index2gene, G, heat, delta,
                                                             args.num_permutations)
            else:
                sizes2stats = calculate_significance(args, infmat, full_index2gene, G, delta,
                                                     heat_permutations)
        
        #sort ccs list such that genes within components are sorted alphanumerically, and components
        #are sorted first by length, then alphanumerically by name of the first gene in the component 
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)
        
        #write output
        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)
        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if args.permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                           sizes2stats)
        
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
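
For the "precomputed" permutation type, Examples #3 and #4 build one heat-file path per permutation by substituting the iteration index into a template path. A minimal self-contained sketch, assuming a hypothetical token value and template path (the real ITERATION_REPLACEMENT_TOKEN constant is defined elsewhere in the source and may differ):

# Hypothetical token value and template path, purely for illustration.
ITERATION_REPLACEMENT_TOKEN = "##NUM##"
datasets_path = "permuted/heat_##NUM##.tsv"
num_permutations = 3

heat_file_paths = [datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                   for i in range(1, num_permutations + 1)]
print(heat_file_paths)
# ['permuted/heat_1.tsv', 'permuted/heat_2.tsv', 'permuted/heat_3.tsv']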
Example #5
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)
    
    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find smallest delta 
    deltas = ft.get_deltas_for_network(args.permuted_networks_path, heat, INFMAT_NAME,
                                       infmat_index, MAX_CC_SIZES,
                                       args.num_permutations, args.parallel)
    
    # and run HotNet with the median delta for each size
    run_deltas = [np.median(deltas[size]) for size in deltas]
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # load interaction network edges and determine location of static HTML files for visualization
    edges = hnio.load_ppi_edges(args.edge_file) if args.edge_file else None
    index_file = '%s/viz_files/%s' % (hotnet2.__file__.rsplit('/', 1)[0], VIZ_INDEX)
    subnetworks_file = '%s/viz_files/%s' % (hotnet2.__file__.rsplit('/', 1)[0], VIZ_SUBNETWORKS)
    gene2index = dict([(gene, index) for index, gene in infmat_index.items()])

    for delta in run_deltas: 
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        
        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)
        
        # calculate significance (using all genes with heat scores)
        print "* Performing permuted heat statistical significance..."
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes, args.parallel)
        sizes = range(2, 11)
        print "\t- Using no. of components >= k (k \\in",
        print "[%s, %s]) as statistic" % (min(sizes), max(sizes))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index, heat_permutations,
                                                          delta, sizes, args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts, args.num_permutations)
    
        # sort ccs list such that genes within components are sorted alphanumerically, and components
        # are sorted first by length, then alphanumerically by name of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)
            
        # write output
        heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
        heat_out = open(os.path.abspath(delta_out_dir) + "/" + HEAT_JSON, 'w')
        json.dump(heat_dict, heat_out, indent=4)
        heat_out.close()
        
        args.heat_file = os.path.abspath(delta_out_dir) + "/" + HEAT_JSON
        args.delta = delta
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                       sizes2stats)
        
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
        
        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)
        
        # write visualization output if edge file given
        if args.edge_file:
            viz_data = {"delta": delta, 'subnetworks': list()}
            for cc in ccs:
                viz_data['subnetworks'].append(viz.get_component_json(cc, heat, edges, gene2index,
                                                                      args.network_name))
                
            delta_viz_dir = '%s/viz/delta%s' % (args.output_directory, delta)
            if not os.path.isdir(delta_viz_dir):
                os.makedirs(delta_viz_dir)
            viz_out = open('%s/subnetworks.json' % delta_viz_dir, 'w')
            json.dump(viz_data, viz_out, indent=4)
            viz_out.close()
    
            shutil.copy(subnetworks_file, delta_viz_dir)
    
    if args.edge_file:
        viz.write_index_file(index_file, '%s/viz/%s' % (args.output_directory, VIZ_INDEX), run_deltas)
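
Example #5 selects one delta to run per maximum-component-size threshold by taking the median of the deltas found across the permuted networks. A self-contained sketch of that selection step, with fabricated delta values:

import numpy as np

# Fabricated deltas keyed by max component size, purely for illustration.
deltas = {
    5:  [0.012, 0.015, 0.011],
    10: [0.006, 0.007, 0.005],
}

run_deltas = [np.median(deltas[size]) for size in deltas]
# -> medians 0.012 (size 5) and 0.006 (size 10)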
Example #6
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")
    
    # load data
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]  
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)
  
    # compute similarity matrix and extract connected components
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)
    
    # only calculate permuted data sets for significance testing once
    if args.permutation_type != "none":
        if args.permutation_type == "heat":
            print "* Generating heat permutations for statistical significance testing" 
            extra_genes = hnio.load_genes(args.permutation_genes_file) if args.permutation_genes_file \
                            else None
            heat_permutations = permutations.permute_heat(heat, args.num_permutations, extra_genes,
                                                          args.parallel)
        elif args.permutation_type == "precomputed":
            heat_file_paths = [args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                               for i in range(1, args.num_permutations+1)]
            heat_permutations = [hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths]
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))
    
    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)
        
        # calculate significance
        if args.permutation_type != "none":
            sizes2stats = calculate_significance(args, infmat, infmat_index, G, delta, heat_permutations)
        
        #sort ccs list such that genes within components are sorted alphanumerically, and components
        #are sorted first by length, then alphanumerically by name of the first gene in the component 
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)
        
        #write output
        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)
        args.delta = delta
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if args.permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                           sizes2stats)
        
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
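
Every run variant finishes by dumping the results dictionary to JSON with an explicit open/close pair. The sketch below shows an equivalent final step using a with block; the contents and the JSON_OUTPUT filename are made up, since the real constant is defined elsewhere in the source:

import json
import os

JSON_OUTPUT = "results.json"         # assumed filename; the real constant may differ
delta_out_dir = "output/delta_0.01"  # made-up per-delta output directory

# Made-up results dictionary mirroring the keys written above.
output_dict = {"parameters": {"delta": 0.01},
               "sizes": [3, 2],
               "components": [["BRAF", "KRAS", "NRAS"], ["MDM2", "TP53"]]}

if not os.path.isdir(delta_out_dir):
    os.makedirs(delta_out_dir)

# The with block closes the file even if json.dump raises.
with open(os.path.join(delta_out_dir, JSON_OUTPUT), "w") as json_out:
    json.dump(output_dict, json_out, indent=4)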