Beispiel #1
0
def run(args):
    """Compute deltas for the permutation scheme in ``args.perm_type`` and dump them as JSON.

    The JSON document holds the run parameters (``vars(args)``), the heat
    parameters read from ``args.heat_file`` and the computed deltas.  It goes
    to ``args.output_file`` when that is set, otherwise to stdout.

    Raises:
        ValueError: if ``args.classic`` is false and the test statistic is not
            ``MAX_CC_SIZE``, or if ``args.perm_type`` is not one of "heat",
            "mutations" or "network".
    """
    # No sizes (l) were given: pick the default that fits the test statistic.
    if not args.sizes:
        if args.test_statistic == MAX_CC_SIZE:
            args.sizes = [5, 10, 15, 20]
        else:
            args.sizes = [3]

    # Finding delta by the number of CCs of size >= l is not currently
    # implemented correctly for HotNet2 (and is non-trivial to implement),
    # so that combination is refused.
    if not args.classic:
        if args.test_statistic != MAX_CC_SIZE:
            raise ValueError("For HotNet2, the largest CC size test statistic must be used.")

    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    perm_type = args.perm_type
    if perm_type == "heat":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        addtl_genes = None
        if args.permutation_genes_file:
            addtl_genes = hnio.load_genes(args.permutation_genes_file)
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.test_statistic,
                                     args.sizes, args.classic, args.num_cores)
    elif perm_type == "mutations":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        deltas = get_deltas_for_mutations(args, infmat, infmat_index, heat_params)
    elif perm_type == "network":
        deltas = get_deltas_for_network(args.permuted_networks_path, heat, args.infmat_name,
                                        infmat_index, args.test_statistic, args.sizes,
                                        args.classic, args.num_permutations, args.num_cores)
    else:
        raise ValueError("Invalid mutation permutation type: %s" % args.perm_type)

    # Write to the requested file, or to stdout when no file was named.
    destination = open(args.output_file, 'w') if args.output_file else sys.stdout
    report = {"parameters": vars(args), "heat_parameters": heat_params, "deltas": deltas}
    json.dump(report, destination, indent=4)
    if args.output_file:
        destination.close()
Beispiel #2
0
def run(args):
    """Generate heat with ``args.heat_fn`` and write it, with the parameters, as JSON.

    Heat that did not come from ``load_mutation_heat`` is reconciled against
    the genes in ``args.gene_filter_file`` when such a file is given.  Output
    goes to ``args.output_file`` if set, otherwise to stdout.
    """
    heat_loader = args.heat_fn
    heat = heat_loader(args)

    # args.gene_filter_file is only consulted for non-mutation heat.
    if heat_loader != load_mutation_heat and args.gene_filter_file:
        tested_genes = hnio.load_genes(args.gene_filter_file)
        heat = hnheat.reconcile_heat_with_tested_genes(heat, tested_genes)

    # Replace the function object by its name before vars(args) is dumped
    # (presumably because a function cannot be serialized to JSON).
    args.heat_fn = heat_loader.__name__
    payload = {"parameters": vars(args), "heat": heat}

    if args.output_file:
        sink = open(args.output_file, 'w')
        json.dump(payload, sink, indent=4)
        sink.close()
    else:
        json.dump(payload, sys.stdout, indent=4)
def load_mutation_heat(args):
    """Load SNV/CNA data named in ``args`` and turn it into heat via ``hnheat.mut_heat``.

    Returns:
        A 2-tuple ``(heat, None)``, where ``heat`` is the value returned by
        ``hnheat.mut_heat``.
    """
    samples = None
    if args.sample_file:
        samples = hnio.load_samples(args.sample_file)
    genes = None
    if args.gene_file:
        genes = hnio.load_genes(args.gene_file)

    snvs = hnio.load_snvs(args.snv_file, genes, samples)
    if args.cna_file:
        cnas = hnio.load_cnas(args.cna_file, genes, samples)
    else:
        cnas = []
    if args.cna_filter_threshold:
        cnas = hnheat.filter_cnas(cnas, args.cna_filter_threshold)

    if not samples:
        # No (non-empty) sample list was loaded: use the samples that occur
        # in the mutation data itself.
        samples = set([snv.sample for snv in snvs])
        samples.update([cna.sample for cna in cnas])

    heat = hnheat.mut_heat(len(samples), snvs, cnas, args.min_freq)
    return heat, None
Beispiel #4
0
def run(args):
    """Create heat through ``args.heat_fn`` and emit it as a JSON document.

    When the heat function is not ``load_mutation_heat`` and a gene filter
    file is configured, the heat is reconciled with the genes from that file.
    The JSON is written to ``args.output_file`` or, without one, to stdout.
    """
    heat = args.heat_fn(args)

    if args.heat_fn != load_mutation_heat:
        # the filter file attribute is only read for non-mutation heat
        if args.gene_filter_file:
            filter_genes = hnio.load_genes(args.gene_filter_file)
            heat = hnheat.reconcile_heat_with_tested_genes(heat, filter_genes)

    # From here on args.heat_fn holds the function's name, not the function.
    args.heat_fn = args.heat_fn.__name__
    output_dict = dict(parameters=vars(args), heat=heat)

    writes_to_file = bool(args.output_file)
    stream = open(args.output_file, 'w') if writes_to_file else sys.stdout
    json.dump(output_dict, stream, indent=4)
    if writes_to_file:
        stream.close()
Beispiel #5
0
def load_mutation_heat(args):
    """Load SNV/CNA data named in ``args`` and return ``hnheat.mut_heat`` of it.

    Genes and samples are read from ``args.gene_file`` / ``args.sample_file``
    when given; whichever is missing (or empty) is taken from the loaded
    mutations instead.
    """
    genes = None
    if args.gene_file:
        genes = hnio.load_genes(args.gene_file)
    samples = None
    if args.sample_file:
        samples = hnio.load_samples(args.sample_file)

    snvs = hnio.load_snvs(args.snv_file, genes, samples)
    cnas = []
    if args.cna_file:
        cnas = hnio.load_cnas(args.cna_file, genes, samples)
    if args.cna_filter_threshold:
        cnas = hnheat.filter_cnas(cnas, args.cna_filter_threshold)

    # Fall back to what the mutation data contains.
    if not samples:
        samples = set([snv.sample for snv in snvs])
        samples.update([cna.sample for cna in cnas])
    if not genes:
        genes = set([snv.gene for snv in snvs])
        genes.update([cna.gene for cna in cnas])

    num_samples = len(samples)
    return hnheat.mut_heat(genes, num_samples, snvs, cnas, args.min_freq)
Beispiel #6
0
def run(args):
    """Compute deltas with the selected permutation type and write them as JSON.

    ``args.perm_type`` selects between heat, mutation and network
    permutations.  The parameters, heat parameters and deltas are dumped to
    ``args.output_file`` or, if that is unset, to stdout.

    Raises:
        ValueError: for a non-``MAX_CC_SIZE`` test statistic without
            ``args.classic``, or for an unknown ``args.perm_type``.
    """
    if not args.sizes:
        # l was not specified, so the default depends on the test statistic
        if args.test_statistic == MAX_CC_SIZE:
            default_sizes = [5, 10, 15, 20]
        else:
            default_sizes = [3]
        args.sizes = default_sizes

    # Choosing delta by the number of CCs of size >= l is not currently
    # implemented correctly for HotNet2 (and is non-trivial to implement).
    if not args.classic and args.test_statistic != MAX_CC_SIZE:
        raise ValueError("For HotNet2, the largest CC size test statistic must be used.")

    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        if args.permutation_genes_file:
            addtl_genes = hnio.load_genes(args.permutation_genes_file)
        else:
            addtl_genes = None
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.test_statistic,
                                     args.sizes, args.classic, args.num_cores)
    elif args.perm_type == "mutations":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        deltas = get_deltas_for_mutations(args, infmat, infmat_index, heat_params)
    elif args.perm_type == "network":
        deltas = get_deltas_for_network(args.permuted_networks_path, heat, args.infmat_name,
                                        infmat_index, args.test_statistic, args.sizes,
                                        args.classic, args.num_permutations, args.num_cores)
    else:
        raise ValueError("Invalid mutation permutation type: %s" % args.perm_type)

    if args.output_file:
        output_file = open(args.output_file, "w")
    else:
        output_file = sys.stdout
    result = {"parameters": vars(args), "heat_parameters": heat_params, "deltas": deltas}
    json.dump(result, output_file, indent=4)
    if args.output_file:
        output_file.close()
def run(args):
    """Generate heat via ``args.heat_fn`` and write heat plus parameters as JSON.

    ``args.heat_fn`` returns the heat and the genes it excluded.  Non-mutation
    heat is additionally filtered with ``hnheat.expr_filter_heat`` when a gene
    filter file is given.  For ``load_direct_heat`` the excluded genes are
    added to the JSON and, if requested, the genes excluded by the heat
    function are written to ``args.excluded_genes_output_file``.
    """
    heat_loader = args.heat_fn
    heat, heat_excluded_genes = heat_loader(args)

    filter_excluded_genes = []
    if heat_loader != load_mutation_heat and args.gene_filter_file:
        filter_genes = hnio.load_genes(args.gene_filter_file)
        heat, filter_excluded_genes = hnheat.expr_filter_heat(heat, filter_genes)

    # vars(args) below carries the function name rather than the function.
    args.heat_fn = heat_loader.__name__
    output_dict = {"parameters": vars(args), "heat": heat}

    if args.heat_fn == "load_direct_heat":
        all_excluded = set().union(heat_excluded_genes, filter_excluded_genes)
        output_dict["excluded_genes"] = list(all_excluded)
        if args.excluded_genes_output_file:
            # only the genes excluded by the heat function go to this file
            hnio.write_gene_list(args.excluded_genes_output_file, heat_excluded_genes)

    if args.output_file:
        output_file = open(args.output_file, 'w')
        json.dump(output_dict, output_file, indent=4)
        output_file.close()
    else:
        json.dump(output_dict, sys.stdout, indent=4)
Beispiel #8
0
def run(args):
    """Run HotNet for every delta in ``args.deltas`` and write the results.

    Loads the influence matrix, its index and the heat scores, computes the
    similarity matrix once and, unless ``args.permutation_type`` is "none",
    prepares the permuted data used for significance testing.  For each delta
    the connected components (TSV), the significance statistics (TSV, only
    with permutations) and a JSON summary are written to
    ``<args.output_directory>/delta_<delta>``.

    Raises:
        RuntimeError: if mutation permutation is requested but the heat file
            was not produced by ``load_mutation_heat``.
        ValueError: if ``args.permutation_type`` is not recognized.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    # load data
    infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, not args.classic)

    # only calculate permuted data sets for significance testing once
    heat_permutations = None  # stays None for "none" and "network"
    if args.permutation_type != "none":
        if args.permutation_type == "heat":
            # single-argument call form prints the same text under Python 2 and 3
            print("* Generating heat permutations for statistical significance testing")
            extra_genes = hnio.load_genes(args.permutation_genes_file) \
                            if args.permutation_genes_file else None
            heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                               args.num_permutations,
                                               extra_genes, args.num_cores)
        elif args.permutation_type == "mutations":
            if heat_params["heat_fn"] != "load_mutation_heat":
                # Adjacent literals instead of a backslash continuation inside
                # the string, which used to embed the source indentation in
                # the message.
                raise RuntimeError(
                    "Heat scores must be based on mutation data to perform "
                    "significance testing based on mutation data permutation.")
            print("* Generating mutation permutations for statistical significance testing")
            heat_permutations = p.generate_mutation_permutation_heat(
                heat_params["heat_fn"],
                heat_params["sample_file"], heat_params["gene_file"],
                full_index2gene.values(), heat_params["snv_file"],
                args.gene_length_file, args.bmr, args.bmr_file,
                heat_params["cna_file"], args.gene_order_file,
                heat_params["cna_filter_threshold"], heat_params["min_freq"],
                args.num_permutations, args.num_cores)
        elif args.permutation_type == "network":
            pass  # nothing to do right now
        elif args.permutation_type == "precomputed":
            heat_file_paths = [
                args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                for i in range(1, args.num_permutations + 1)
            ]
            heat_permutations = [
                hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths
            ]
        else:
            raise ValueError("Unrecognized permutation type %s" %
                             (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        G = hn.weighted_graph(sim, index2gene, delta, not args.classic)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if args.permutation_type != "none":
            if args.permutation_type == "network":
                sizes2stats = calculate_significance_network(
                    args, args.permuted_networks_path, full_index2gene, G,
                    heat, delta, args.num_permutations)
            else:
                sizes2stats = calculate_significance(args, infmat,
                                                     full_index2gene, G, delta,
                                                     heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component (two stable sorts, secondary key first)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)
        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {
            "parameters": vars(args),
            "heat_parameters": heat_params,
            "sizes": hn.component_sizes(ccs),
            "components": ccs
        }
        if args.permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV,
                                           sizes2stats)

        # the context manager closes the file even if serialization fails
        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)
Beispiel #9
0
def run(args):
    """Collect HotNet2 results files into ``subnetworks.json`` for visualization.

    For each file in ``args.results_files`` the components, their JSON
    representation from ``viz.get_component_json`` and the run statistics are
    stored under that run's delta.  When the referenced heat file was made by
    ``load_mutation_heat``, oncoprint data and sample/type mappings are added
    as well.  The result is written to
    ``<args.output_directory>/subnetworks.json`` and the ``VIZ_SUBNETWORKS``
    file from the ``hotnet2`` package's ``viz_files`` directory is copied
    into the output directory as ``VIZ_INDEX``.

    Raises:
        KeyError: if a results or heat file lacks an expected key (for
            example ``'statistics'``).
        ValueError: if no statistics sizes were collected, e.g. when
            ``args.results_files`` is empty.
    """
    subnetworks_file = '%s/viz_files/%s' % (str(hotnet2.__file__).rsplit('/', 1)[0], VIZ_SUBNETWORKS)

    # create output directory if doesn't exist; warn if it exists and is not empty
    outdir = args.output_directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if len(os.listdir(outdir)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    ks = set()
    output = dict(deltas=[], subnetworks=dict(), mutation_matrices=dict(), stats=dict())
    for results_file in args.results_files:
        with open(results_file) as results_in:
            results = json.load(results_in)
        ccs = results['components']

        with open(results['parameters']['heat_file']) as heat_in:
            heat_file = json.load(heat_in)
        gene2heat = heat_file['heat']
        heat_parameters = heat_file['parameters']
        d_score = hnio.load_display_score_tsv(args.display_score_file) if args.display_score_file else None
        d_name = hnio.load_display_name_tsv(args.display_name_file) if args.display_name_file else dict()
        edges = hnio.load_ppi_edges(args.edge_file, hnio.load_index(results['parameters']['infmat_index_file']))
        delta = format(results['parameters']['delta'], 'g')
        output['deltas'].append(delta)

        output['subnetworks'][delta] = [
            viz.get_component_json(cc, gene2heat, edges, args.network_name, d_score, d_name)
            for cc in ccs
        ]

        # make oncoprints if heat file was generated from mutation data
        if heat_parameters.get('heat_fn') == 'load_mutation_heat':
            samples = hnio.load_samples(heat_parameters['sample_file']) if heat_parameters['sample_file'] else None
            genes = hnio.load_genes(heat_parameters['gene_file']) if heat_parameters['gene_file'] else None
            snvs = hnio.load_snvs(heat_parameters['snv_file'], genes, samples) if heat_parameters['snv_file'] else []
            cnas = hnio.load_cnas(heat_parameters['cna_file'], genes, samples) if heat_parameters['cna_file'] else []

            output['mutation_matrices'][delta] = [
                viz.get_oncoprint_json(cc, snvs, cnas, d_name) for cc in ccs
            ]

            if heat_parameters.get('sample_type_file'):
                with open(heat_parameters['sample_type_file']) as f:
                    # comment lines are skipped; blank lines too, because they
                    # cannot be split into a (sample, type) pair
                    sample_to_type = dict(l.rstrip().split() for l in f
                                          if l.strip() and not l.startswith("#"))
                type_to_samples = dict((t, []) for t in set(sample_to_type.values()))
                for s, ty in sample_to_type.items():
                    type_to_samples[ty].append(s)
                output['sampleToTypes'] = sample_to_type
                output['typeToSamples'] = type_to_samples
            else:
                # Without a sample file `samples` is None and cannot be
                # iterated; take the samples from the mutations instead.
                if not samples:
                    samples = set(m.sample for m in snvs) | set(m.sample for m in cnas)
                output['sampleToTypes'] = dict((s, "Cancer") for s in samples)
                output['typeToSamples'] = dict(Cancer=list(samples))

        output['stats'][delta] = results['statistics']
        ks.update(int(k) for k in results['statistics'])

    # list(...) keeps the value JSON-serializable when range() is lazy
    output['ks'] = list(range(min(ks), max(ks) + 1))
    with open('%s/subnetworks.json' % outdir, 'w') as out:
        json.dump(output, out, indent=4)

    shutil.copy(subnetworks_file, '%s/%s' % (outdir, VIZ_INDEX))
Beispiel #10
0
def run(args):
    """Run HotNet for every delta in ``args.deltas`` and write the results.

    The influence matrix is read with ``scipy.io.loadmat`` and converted to a
    NumPy array.  The similarity matrix is computed once; unless
    ``args.permutation_type`` is "none", permuted data for significance
    testing is prepared once as well.  For each delta the connected
    components (TSV), the significance statistics (TSV, only with
    permutations) and a JSON summary are written to
    ``<args.output_directory>/delta_<delta>``.

    Raises:
        RuntimeError: if mutation permutation is requested but the heat file
            was not produced by ``load_mutation_heat``.
        ValueError: if ``args.permutation_type`` is not recognized.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    # load data
    infmat = np.array(scipy.io.loadmat(args.infmat_file)[args.infmat_name])
    full_index2gene = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, not args.classic)

    # only calculate permuted data sets for significance testing once
    heat_permutations = None  # stays None for "none" and "network"
    if args.permutation_type != "none":
        if args.permutation_type == "heat":
            # single-argument call form prints the same text under Python 2 and 3
            print("* Generating heat permutations for statistical significance testing")
            extra_genes = hnio.load_genes(args.permutation_genes_file) \
                            if args.permutation_genes_file else None
            heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                               args.num_permutations, extra_genes, args.num_cores)
        elif args.permutation_type == "mutations":
            if heat_params["heat_fn"] != "load_mutation_heat":
                # Adjacent literals instead of a backslash continuation inside
                # the string, which used to embed the source indentation in
                # the message.
                raise RuntimeError("Heat scores must be based on mutation data to perform "
                                   "significance testing based on mutation data permutation.")
            print("* Generating mutation permutations for statistical significance testing")
            heat_permutations = p.generate_mutation_permutation_heat(
                                    heat_params["heat_fn"], heat_params["sample_file"],
                                    heat_params["gene_file"], full_index2gene.values(),
                                    heat_params["snv_file"], args.gene_length_file, args.bmr,
                                    args.bmr_file, heat_params["cna_file"], args.gene_order_file,
                                    heat_params["cna_filter_threshold"], heat_params["min_freq"],
                                    args.num_permutations, args.num_cores)
        elif args.permutation_type == "network":
            pass    # nothing to do right now
        elif args.permutation_type == "precomputed":
            heat_file_paths = [args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                               for i in range(1, args.num_permutations+1)]
            heat_permutations = [hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths]
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        G = hn.weighted_graph(sim, index2gene, delta, not args.classic)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if args.permutation_type != "none":
            if args.permutation_type == "network":
                sizes2stats = calculate_significance_network(args, args.permuted_networks_path,
                                                             full_index2gene, G, heat, delta,
                                                             args.num_permutations)
            else:
                sizes2stats = calculate_significance(args, infmat, full_index2gene, G, delta,
                                                     heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component (two stable sorts, secondary key first)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)
        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if args.permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        # the context manager closes the file even if serialization fails
        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)
Beispiel #11
0
def run(args):
    """Collect HotNet2 results files into ``subnetworks.json`` for visualization.

    For each file in ``args.results_files`` the components, their JSON
    representation from ``viz.get_component_json`` and the run statistics are
    stored under that run's delta.  The genes found in any component are
    gathered in ``predictions`` and the heat scores of the last heat file are
    stored as ``geneToHeat``.  When a heat file was made by
    ``load_mutation_heat``, oncoprint data and sample/type mappings are added.
    The result is written to ``<args.output_directory>/subnetworks.json`` and
    the ``VIZ_SUBNETWORKS`` file from the ``hotnet2`` package's ``viz_files``
    directory is copied into the output directory as ``VIZ_INDEX``.

    A warning is written to stderr when the results files refer to heat files
    with differing scores.

    Raises:
        KeyError: if a results or heat file lacks an expected key (for
            example ``'statistics'``).
        ValueError: if no statistics sizes were collected, e.g. when
            ``args.results_files`` is empty.
    """
    subnetworks_file = '%s/viz_files/%s' % (str(hotnet2.__file__).rsplit('/', 1)[0], VIZ_SUBNETWORKS)

    # create output directory if doesn't exist; warn if it exists and is not empty
    outdir = args.output_directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if len(os.listdir(outdir)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    ks = set()
    output = dict(deltas=[], subnetworks=dict(), mutation_matrices=dict(), stats=dict())
    predictions = set()
    multipleHeatFiles = False
    for results_file in args.results_files:
        with open(results_file, 'r') as IN:
            results = json.load(IN)
        ccs = results['components']

        with open(results['parameters']['heat_file']) as heat_in:
            heat_file = json.load(heat_in)
        gene2heat = heat_file['heat']
        heat_parameters = heat_file['parameters']
        d_score = hnio.load_display_score_tsv(args.display_score_file) if args.display_score_file else None
        d_name = hnio.load_display_name_tsv(args.display_name_file) if args.display_name_file else dict()
        edges = hnio.load_ppi_edges(args.edge_file, hnio.load_index(results['parameters']['infmat_index_file']))
        delta = format(results['parameters']['delta'], 'g')
        output['deltas'].append(delta)

        predictions.update(g for cc in ccs for g in cc)
        output['subnetworks'][delta] = [
            viz.get_component_json(cc, gene2heat, edges, args.network_name, d_score, d_name)
            for cc in ccs
        ]

        # Record the heat scores.  Comparing the dicts as a whole flags both
        # differing scores and differing gene sets, without looking up genes
        # that the previously seen heat file does not contain.
        if 'geneToHeat' in output and output['geneToHeat'] != gene2heat:
            multipleHeatFiles = True
        output['geneToHeat'] = gene2heat

        # make oncoprints if heat file was generated from mutation data
        if heat_parameters.get('heat_fn') == 'load_mutation_heat':
            samples = hnio.load_samples(heat_parameters['sample_file']) if heat_parameters['sample_file'] else None
            genes = hnio.load_genes(heat_parameters['gene_file']) if heat_parameters['gene_file'] else None
            snvs = hnio.load_snvs(heat_parameters['snv_file'], genes, samples) if heat_parameters['snv_file'] else []
            cnas = hnio.load_cnas(heat_parameters['cna_file'], genes, samples) if heat_parameters['cna_file'] else []

            # Get the samples from the mutations directly if they weren't provided
            if not samples:
                samples = set(m.sample for m in snvs) | set(m.sample for m in cnas)

            output['mutation_matrices'][delta] = [
                viz.get_oncoprint_json(cc, snvs, cnas, d_name) for cc in ccs
            ]

            if heat_parameters.get('sample_type_file'):
                with open(heat_parameters['sample_type_file']) as f:
                    # comment lines are skipped; blank lines too, because they
                    # cannot be split into a (sample, type) pair
                    sample_to_type = dict(l.rstrip().split() for l in f
                                          if l.strip() and not l.startswith("#"))
                type_to_samples = dict((t, []) for t in set(sample_to_type.values()))
                for s, ty in sample_to_type.items():
                    type_to_samples[ty].append(s)
                output['sampleToTypes'] = sample_to_type
                output['typeToSamples'] = type_to_samples
            else:
                output['sampleToTypes'] = dict((s, "Cancer") for s in samples)
                output['typeToSamples'] = dict(Cancer=list(samples))

        output['stats'][delta] = results['statistics']
        ks |= set(map(int, results['statistics'].keys()))

    # Print a warning if there were multiple heat files referenced by
    # the results files
    if multipleHeatFiles:
        sys.stderr.write('Warning: results files used multiple heat files. Only the last heat file will be used to tabulate scores.\n')

    # Output to file
    output['predictions'] = sorted(predictions)  # list of nodes found in any run
    # list(...) keeps the value JSON-serializable when range() is lazy
    output['ks'] = list(range(min(ks), max(ks) + 1))
    with open('%s/subnetworks.json' % outdir, 'w') as out:
        json.dump(output, out, indent=4)

    shutil.copy(subnetworks_file, '%s/%s' % (outdir, VIZ_INDEX))
def run(args):
    """Collect HotNet2 results files into ``subnetworks.json`` for visualization.

    For each file in ``args.results_files`` the components, their JSON
    representation from ``viz.get_component_json`` and the run statistics are
    stored under that run's delta.  When the referenced heat file was made by
    ``load_mutation_heat``, oncoprint data and sample/type mappings are added
    as well.  The result is written to
    ``<args.output_directory>/subnetworks.json`` and the ``VIZ_SUBNETWORKS``
    file from the ``hotnet2`` package's ``viz_files`` directory is copied
    into the output directory as ``VIZ_INDEX``.

    Raises:
        KeyError: if a results or heat file lacks an expected key (for
            example ``'statistics'``).
        ValueError: if no statistics sizes were collected, e.g. when
            ``args.results_files`` is empty.
    """
    subnetworks_file = '%s/viz_files/%s' % (str(hotnet2.__file__).rsplit(
        '/', 1)[0], VIZ_SUBNETWORKS)

    # create output directory if doesn't exist; warn if it exists and is not empty
    outdir = args.output_directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if len(os.listdir(outdir)) > 0:
        print(
            "WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
            "(Ctrl-c to cancel).")

    ks = set()
    output = dict(deltas=[],
                  subnetworks=dict(),
                  mutation_matrices=dict(),
                  stats=dict())
    for results_file in args.results_files:
        # read both JSON inputs through context managers so the handles are
        # closed as soon as they have been parsed
        with open(results_file) as results_in:
            results = json.load(results_in)
        ccs = results['components']

        with open(results['parameters']['heat_file']) as heat_in:
            heat_file = json.load(heat_in)
        gene2heat = heat_file['heat']
        heat_parameters = heat_file['parameters']
        d_score = hnio.load_display_score_tsv(
            args.display_score_file) if args.display_score_file else None
        d_name = hnio.load_display_name_tsv(
            args.display_name_file) if args.display_name_file else dict()
        edges = hnio.load_ppi_edges(
            args.edge_file,
            hnio.load_index(results['parameters']['infmat_index_file']))
        delta = format(results['parameters']['delta'], 'g')
        output['deltas'].append(delta)

        output['subnetworks'][delta] = [
            viz.get_component_json(cc, gene2heat, edges, args.network_name,
                                   d_score, d_name) for cc in ccs
        ]

        # make oncoprints if heat file was generated from mutation data
        if heat_parameters.get('heat_fn') == 'load_mutation_heat':
            samples = hnio.load_samples(
                heat_parameters['sample_file']
            ) if heat_parameters['sample_file'] else None
            genes = hnio.load_genes(heat_parameters['gene_file']
                                    ) if heat_parameters['gene_file'] else None
            snvs = hnio.load_snvs(
                heat_parameters['snv_file'], genes,
                samples) if heat_parameters['snv_file'] else []
            cnas = hnio.load_cnas(
                heat_parameters['cna_file'], genes,
                samples) if heat_parameters['cna_file'] else []

            # Get the samples from the mutations directly if they weren't provided
            if not samples:
                samples = set(m.sample for m in snvs) | set(m.sample
                                                            for m in cnas)

            output['mutation_matrices'][delta] = [
                viz.get_oncoprint_json(cc, snvs, cnas, d_name) for cc in ccs
            ]

            if heat_parameters.get('sample_type_file'):
                with open(heat_parameters['sample_type_file']) as f:
                    # comment lines are skipped; blank lines too, because they
                    # cannot be split into a (sample, type) pair
                    sample_to_type = dict(
                        l.rstrip().split() for l in f
                        if l.strip() and not l.startswith("#"))
                type_to_samples = dict(
                    (t, []) for t in set(sample_to_type.values()))
                for s, ty in sample_to_type.items():
                    type_to_samples[ty].append(s)
                output['sampleToTypes'] = sample_to_type
                output['typeToSamples'] = type_to_samples
            else:
                output['sampleToTypes'] = dict((s, "Cancer") for s in samples)
                output['typeToSamples'] = dict(Cancer=list(samples))

        output['stats'][delta] = results['statistics']
        ks |= set(map(int, results['statistics'].keys()))

    # list(...) keeps the value JSON-serializable when range() is lazy
    output['ks'] = list(range(min(ks), max(ks) + 1))
    with open('%s/subnetworks.json' % outdir, 'w') as out:
        json.dump(output, out, indent=4)

    shutil.copy(subnetworks_file, '%s/%s' % (outdir, VIZ_INDEX))
Beispiel #13
0
def run(args):
    """Run HotNet for each delta in ``args.deltas`` and write per-delta output.

    The influence matrix is read with ``scipy.io.loadmat``, induced on the
    genes that have heat, and turned into a similarity matrix.  Unless
    ``args.permutation_type`` is "none", permuted heat is generated or loaded
    once and used for significance testing at every delta.  Components (TSV),
    statistics (TSV, only with permutations) and a JSON summary are written
    to ``<args.output_directory>/delta_<delta>``.

    Raises:
        ValueError: if ``args.permutation_type`` is neither "none", "heat"
            nor "precomputed".
    """
    # Make sure the output directory exists; warn when it already holds files.
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    # load data
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix on the genes that have heat
    heat_genes = sorted(heat.keys())
    M, gene_index = hn.induce_infmat(infmat, infmat_index, heat_genes)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # permuted data sets for significance testing are prepared only once
    test_significance = args.permutation_type != "none"
    if test_significance:
        if args.permutation_type == "heat":
            print("* Generating heat permutations for statistical significance testing")
            extra_genes = None
            if args.permutation_genes_file:
                extra_genes = hnio.load_genes(args.permutation_genes_file)
            heat_permutations = permutations.permute_heat(heat, args.num_permutations,
                                                          extra_genes, args.parallel)
        elif args.permutation_type == "precomputed":
            heat_file_paths = []
            for i in range(1, args.num_permutations + 1):
                heat_file_paths.append(
                    args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i)))
            heat_permutations = []
            for heat_file in heat_file_paths:
                heat_permutations.append(hnio.load_heat_tsv(heat_file))
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if test_significance:
            sizes2stats = calculate_significance(args, infmat, infmat_index, G, delta,
                                                 heat_permutations)

        # Genes inside each component are sorted alphanumerically; components
        # are ordered by length (largest first) and, for equal length, by the
        # name of their first gene.  Two stable sorts, secondary key first.
        ccs = sorted((sorted(cc) for cc in ccs), key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        out_prefix = os.path.abspath(delta_out_dir) + "/"
        hnio.write_components_as_tsv(out_prefix + COMPONENTS_TSV, ccs)
        args.delta = delta  # so the delta shows up in the dumped parameters
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if test_significance:
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(out_prefix + SIGNIFICANCE_TSV, sizes2stats)

        json_out = open(out_prefix + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()