Example #1
0
def main():
    """
    Example GUILD workflow on a toy data set.

    Prepares the scoring input files, runs the "netcombo" scoring, writes
    3-fold cross validation node / edge score files and finally runs
    "netscore" on those cross validation files.
    """
    network_file = "interactions.sif"
    seed_file = "seeds.txt"
    scoring_folder = "./test/"
    executable_path = "/home/emre/arastirma/netzcore/src/scoreNetwork/scoreN"

    # Step 1: create the input files required by the scoring executable
    prepare_scoring(
        network_file,
        seed_file,
        scoring_folder,
        non_seed_score=0.01,
        seed_score=1.0,
        edge_score=1.0,
        n_sample=100,
        delim=" ",
    )

    # Step 2: run GUILD (Netcombo case). A netzcore run would instead pass
    # scoring_type="netzcore" together with n_iteration / n_sample /
    # sampling_prefix parameters.
    run_scoring(scoring_folder, executable_path, scoring_type="netcombo")

    # Step 3: generate the cross validation files
    node_scores_file = scoring_folder + "node_scores.sif"
    edge_scores_file = scoring_folder + "edge_scores_netshort.sif"

    g = network_utilities.create_network_from_sif_file(network_file, use_edge_data=True)
    seeds = guild_utilities.get_nodes(seed_file)
    nodes = g.nodes()
    edges = g.edges()
    # Every seed and every edge gets the same unit score
    seed_to_score = {node: 1 for node in seeds}
    edge_to_score = {(u, v): 1 for u, v in edges}

    # Options shared by both cross validation generators
    xval_options = dict(xval=3, default_score=0.01, replicable=123)
    guild_utilities.generate_cross_validation_node_score_files(
        nodes, seed_to_score, node_scores_file, **xval_options)
    guild_utilities.generate_cross_validation_edge_score_as_node_score_files(
        edges, seed_to_score, edge_to_score, edge_scores_file, **xval_options)

    # Step 4: run NetScore on the cross validation files
    guild_utilities.run_scoring(
        scoring_folder,
        executable_path,
        scoring_type="netscore",
        parameters={"n_iteration": 2, "n_repetition": 3},
        qname=None,
        calculate_pvalue=True,
        xval=3,
    )
Example #2
0
def get_number_of_seed_connecting_edges():
    """
    Count network edges by the number of seed end points, per phenotype.

    The network is read from DATA_DIR + "input/goh/edge_scores.sif". For every
    phenotype in omim_phenotypes the seeds are the first column of
    <phenotype>/seed_scores.sif (blank lines are ignored). One line per
    phenotype is written to edge_counts.txt in the same folder:

        <phenotype> <n seed-seed> <n seed-nonseed> <n nonseed-nonseed>

    Returns None.
    """
    from toolbox import network_utilities as gu
    #base_dir = DATA_DIR + "input/biana_no_tap_no_reliability/"
    base_dir = DATA_DIR + "input/goh/"
    network_file = base_dir + "edge_scores.sif"
    output_file = base_dir + "edge_counts.txt"
    g = gu.create_network_from_sif_file(network_file, use_edge_data=True)
    # The edge list does not change between phenotypes, fetch it once
    edges = list(g.edges())
    with open(output_file, 'w') as f:
        for phenotype in omim_phenotypes:
            node_file = base_dir + phenotype + "/seed_scores.sif"
            with open(node_file) as seed_f:
                # A set gives constant time membership tests in the edge loop
                seeds = set(line.split()[0] for line in seed_f if line.strip())
            n_seed = 0
            n_seed_nonseed = 0
            n_nonseed = 0
            for u, v in edges:
                if u in seeds and v in seeds:
                    n_seed += 1
                elif u in seeds or v in seeds:
                    n_seed_nonseed += 1
                else:
                    n_nonseed += 1
            f.write("%s %d %d %d\n" % (phenotype, n_seed, n_seed_nonseed, n_nonseed))
    return
Example #3
0
def get_number_of_seed_connecting_paths():
    """
    Count the shortest paths connecting seed pairs, per phenotype.

    The network is read from DATA_DIR + "input/goh/edge_scores.sif". For every
    phenotype in omim_phenotypes the seeds are the first column of
    <phenotype>/seed_scores.sif (blank lines are ignored). For each unordered
    seed pair all paths yielded by all_shortest_paths are counted. One line
    per phenotype is written to path_counts.txt in the same folder:

        <phenotype> <n paths> <n paths / n seeds> <mean path length in edges>

    The two ratios are written as "nan" when their denominator is zero
    (no seeds, or no path between any seed pair).

    Returns None.
    """
    from itertools import combinations
    from toolbox import network_utilities as gu
    #base_dir = DATA_DIR + "input/biana_no_tap_no_reliability/"
    base_dir = DATA_DIR + "input/goh/"
    network_file = base_dir + "edge_scores.sif"
    output_file = base_dir + "path_counts.txt"
    g = gu.create_network_from_sif_file(network_file, use_edge_data=True)
    undefined = float("nan")
    with open(output_file, 'w') as f:
        for phenotype in omim_phenotypes:
            print(phenotype)
            node_file = base_dir + phenotype + "/seed_scores.sif"
            with open(node_file) as seed_f:
                seeds = [line.split()[0] for line in seed_f if line.strip()]
            count = 0
            path_length = 0
            # combinations() yields each pair once, in the same order as the
            # former nested enumerate loops restricted to i < j
            for seed1, seed2 in combinations(seeds, 2):
                # NOTE(review): depending on the all_shortest_paths
                # implementation this may raise for unreachable pairs - confirm
                for path in all_shortest_paths(g, seed1, seed2):
                    count += 1
                    path_length += len(path) - 1
            paths_per_seed = count / float(len(seeds)) if seeds else undefined
            mean_path_length = float(path_length) / count if count else undefined
            f.write("%s %d %f %f\n" % (phenotype, count, paths_per_seed, mean_path_length))
    return
Example #4
0
def output_edge_pvalue_file(network_file, score_file, background_file, seed_file=None, background_seed_file=None, delim=" "):
    """
    Calculates and outputs edge p-values using GUILD scores (node scores are first converted to edge scores and then edge p-values are calculated)

    The score of an edge is the mean of the scores of its two end points.
    Background edges touching a node listed in background_seed_file are left
    out of the background distribution; when background_seed_file is None no
    background edge is excluded.

    network_file: network in sif format
    score_file: node scores; the output is written to score_file + ".edge_pval"
    background_file: node scores used to build the background edge scores
    seed_file: currently not used
    background_seed_file: optional node score file of the background seeds
    delim: column delimiter of the output file

    The output has a header line followed by one line per edge
    (Id1, Id2, Score, P-value), sorted by increasing p-value.
    """
    g = network_utilities.create_network_from_sif_file(network_file)
    node_to_score = get_node_to_score(score_file)
    background_to_score = get_node_to_score(background_file)
    # Default to an empty mapping so that the membership tests below work
    # when no background seed file is given
    background_seed_to_score = {}
    if background_seed_file is not None:
        background_seed_to_score = get_node_to_score(background_seed_file)
    edge_to_score = {}
    background_edge_to_score = {}
    for u, v in g.edges():
        edge_to_score[(u, v)] = (node_to_score[u] + node_to_score[v]) / 2
        if u in background_seed_to_score or v in background_seed_to_score:
            continue
        background_edge_to_score[(u, v)] = (background_to_score[u] + background_to_score[v]) / 2
    edge_to_significance = get_significance_among_node_scores(edge_to_score, background_edge_to_score)
    # (p-value, edge) tuples so that sorting orders by p-value first
    values = sorted((pval, edge) for edge, pval in edge_to_significance.items())
    with open(score_file + ".edge_pval", 'w') as f:
        f.write("Id1%sId2%sScore%sP-value\n" % (delim, delim, delim))
        for val, edge in values:
            f.write("%s%s%s%s%f%s%s\n" % (edge[0], delim, edge[1], delim, edge_to_score[edge], delim, str(val)))
    return
def calculate_proximity_multiple(parameter_file_prefix, i_start, i_end):
    """
    Run the proximity calculation for a range of parameter files.

    Parameter files are named parameter_file_prefix + "<i>.txt" for i in
    range(i_start, i_end). The network and the degree bins are built once,
    from the parameter file with index i_start, and reused for every index.

    For each index:
    - a missing parameter file aborts the whole loop,
    - an already existing output file makes the index be skipped,
    - otherwise the proximity is calculated and, unless the result is None,
      "z d m s" is written as a single line to the output file named in the
      parameter file.

    Returns None.
    """
    network_file, nodes_from, nodes_to, out_file, min_bin_size, n_random, n_seed = get_parameters_from_file(
        parameter_file_prefix + "%s.txt" % i_start)
    network = network_util.create_network_from_sif_file(
        network_file,
        use_edge_data=False,
        delim=None,
        include_unconnected=True)
    bins = network_util.get_degree_binning(network, min_bin_size, lengths=None)
    for i in range(i_start, i_end):
        parameter_file = parameter_file_prefix + "%s.txt" % i
        if not os.path.exists(parameter_file):
            print("File does not exists for index (aborting):", i)
            break
        network_file, nodes_from, nodes_to, out_file, min_bin_size, n_random, n_seed = get_parameters_from_file(
            parameter_file)
        if os.path.exists(out_file):
            print("Skipping existing file for index:", i)
            continue
        print(network_file, nodes_from, nodes_to, n_random, min_bin_size,
              n_seed, out_file)
        values = wrappers.calculate_proximity(network,
                                              nodes_from=nodes_from,
                                              nodes_to=nodes_to,
                                              bins=bins,
                                              n_random=n_random,
                                              min_bin_size=min_bin_size,
                                              seed=n_seed)
        if values is None:  # not in network
            continue
        d, z, (m, s) = values
        # Close the file explicitly so the result is on disk before the next
        # index is processed
        with open(out_file, 'w') as f:
            f.write("%f %f %f %f\n" % (z, d, m, s))
    return
Example #6
0
def get_differential_network(g_org, ueid_to_gene, auc_file, critical_auc):
    """
    Get the edges common to the top scoring pruned networks but missing in the
    edges common to the lowest scoring ones.

    The second whitespace separated column of each line of auc_file is read as
    the AUC of one sampled network; the value on line i (0-based) belongs to
    the sampled graph file with suffix i+1. The two networks with the highest
    and the two with the lowest AUC are loaded, intersected within each group,
    and the difference (max minus min) is taken.

    g_org and critical_auc are not used by this function.

    Returns the subgraph of the differential network induced by the nodes that
    are keys of ueid_to_gene.
    Raises ValueError if auc_file contains no AUC value.
    """
    # reduce is a builtin only in Python 2; functools provides it in both
    from functools import reduce
    from toolbox import network_utilities as gu

    # Get indices of min/max networks
    with open(auc_file) as f:
        aucs = [float(line.split()[1]) for line in f]
    if not aucs:
        raise ValueError("No AUC values found in %s" % auc_file)
    # Indices ordered by increasing AUC (ties broken by index)
    indices = [i for auc, i in sorted((auc, i) for i, auc in enumerate(aucs))]
    indices_max = indices[-2:]
    indices_min = indices[:2]

    def load_samples(sample_indices):
        # Real index in the file name is one higher than the line index
        graphs = []
        for i in sample_indices:
            network_file_pruned = DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.%d" % (i + 1)
            graphs.append(gu.create_network_from_sif_file(network_file_pruned, use_edge_data=False))
        return graphs

    g_maxs = load_samples(indices_max)
    g_mins = load_samples(indices_min)

    print("%s %s" % (len(g_maxs), len(g_mins)))

    # Get common edges in min/max networks
    g_max = reduce(lambda x, y: gu.networkx.intersection(x, y), g_maxs)
    g_min = reduce(lambda x, y: gu.networkx.intersection(x, y), g_mins)

    # Get differential edges
    g_diff = gu.networkx.difference(g_max, g_min)
    print("%s %s %s" % (len(g_max.edges()), len(g_min.edges()), len(g_diff.edges())))

    # Keep only the nodes that have a gene mapping
    nodes = set(node for node in g_diff.nodes() if node in ueid_to_gene)
    g_sub = g_diff.subgraph(nodes)
    return g_sub
Example #7
0
def analyze_network():
    """
    Print degree information for a toy network and create its R analysis script.

    For every node one line "<node> <degree> <degree related values>" is
    printed, then the R script is generated for the same network and the same
    two selected nodes ("v2" and "v3").
    """
    from toolbox import network_utilities as gu
    selected_nodes = ("v2", "v3")
    network = gu.create_network_from_sif_file(
        "/data/emre/toy_data/test_interactions_small.sif")
    node_to_degree = network.degree(with_labels=True)
    # Each helper receives its own fresh list, as in separate literal arguments
    node_to_values = gu.get_node_degree_related_values(network, list(selected_nodes))
    for node in network.nodes():
        print("%s %s %s" % (node, node_to_degree[node], node_to_values[node]))
    gu.create_R_analyze_network_script(network, list(selected_nodes))
Example #8
0
def get_neighbors_of_nodes_in_network(network_file, node_file, output_file):
    """
    Output the subnetwork induced by the given nodes and their direct neighbors.

    network_file: network in sif format (read with edge data)
    node_file: one node id per line; blank lines are ignored
    output_file: receives one line per edge of the subnetwork in the form
        "<node1> <edge data> <node2>"

    Returns None.
    """
    from toolbox import network_utilities as gu
    g = gu.create_network_from_sif_file(network_file, use_edge_data=True)
    with open(node_file) as f:
        nodes = [line.strip() for line in f if line.strip()]
    neighbors = []
    for node in nodes:
        # NOTE(review): g.neighbors presumably fails for ids missing from the
        # network - confirm that node_file only lists network nodes
        neighbors.extend(g.neighbors(node))
    neighbors.extend(nodes)
    g_sub = g.subgraph(neighbors)
    with open(output_file, 'w') as f:
        for u, v, data in g_sub.edges(data=True):
            # sif column order: the edge data goes between the two node ids
            f.write("%s %s %s\n" % (u, data, v))
    return
Example #9
0
def main():
    """
    LifeArc wrapper: drug proximity for the GBM gene lists of ULK1 and ULK2.

    Loads the interactome, reads the drug targets restricted to network nodes,
    merges the ULK1 and ULK2 gene lists into one disease gene set named 'GBM'
    and runs calculate_proximity_multiple from the drug target file to that
    gene set. Intermediate objects are printed along the way.
    """
    print("LifeArc wrapper")
    network_file = "/Users/woochanghwang/PycharmProjects/LifeArc/General/src_drug/Data/human_protein_interactome.sif"
    disease_gene_file = "/Users/woochanghwang/PycharmProjects/LifeArc/General/src_drug/Data/disease_genes.tsv"
    drug_target_file = "/Users/woochanghwang/PycharmProjects/LifeArc/General/src_drug/Data/drug_target_interactions.txt"

    network = nu.create_network_from_sif_file(
        network_file,
        use_edge_data=False,
        delim=None,
        include_unconnected=True,
    )
    nodes = set(network.nodes())
    print("network lengths",len(nodes))

    drug_to_targets = get_drug_target_drugbank(drug_target_file, nodes=nodes)
    print(drug_to_targets)

    #######################################
    ## Temporary set-up for ULK1, ULK2: the disease genes are the union of
    ## the two gene lists instead of a single gene list file
    #######################################
    disease_name = 'GBM'
    gene_list_file_ulk1 = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/GBM_ULK1_gene_score_by_RW_pvalue_FC_230119.tsv"
    gene_list_file_ulk2 = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/GBM_ULK2_gene_score_by_RW_pvalue_FC_230119.tsv"
    gene_list_ulk1 = get_gene_list_from_file(gene_list_file_ulk1)
    gene_list_ulk2 = get_gene_list_from_file(gene_list_file_ulk2)

    merged_genes = set(gene_list_ulk1) | set(gene_list_ulk2)
    gene_list = list(merged_genes)
    disease_to_genes, disease_to_category = get_diseasome_genes_from_selectedGenes(
        gene_list, disease_name, disease_category=None, nodes=nodes)
    print("disease_gene", disease_to_genes)

    print("network edges:", network.edges())
    output_file = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/drug/{}_drug_proximity_{}.tsv".format(disease_name,"ULK1_2")
    calculate_proximity_multiple(
        network,
        from_file=drug_target_file,
        to_file=gene_list,
        disease_mode=disease_name,
        out_file=output_file,
    )
Example #10
0
def case_study_pruned_networks_old():
    """
    Legacy case study comparing two pruned samples (sampled_graph.sif.58 and
    sampled_graph.sif.7) of the interactome for omim_breast_cancer.

    Only the set-up at the top of the function is executed: the networks are
    loaded, the edges exclusive to each pruned sample within the seed
    neighborhood are computed, a graph is built from the "strong" edges and
    the GO ontology is loaded. Every analysis section below that is wrapped
    in "if False:" and therefore disabled. Returns None.

    Notes on how the sample indices were picked (shell, vi and R commands):

    cat /sbi/users/emre/data/netzcore/from_gaudi_2011/output_runs_on_random/biana_no_tap_no_reliability_pruned_p50_*/omim_breast_cancer/ns/r3i2/auc.txt > arastirma/netzcore/data/summary_runs_on_random/breast_cancer_pruned_p80.txt
    vi %s/"//g
    d<-read.table("breast_cancer_pruned_p50.txt")
    e<-d$V2
    f<-(e-mean(e))/sd(e)
    which(e %in% sort(e)[98:100])
    > 22 54 58
    which(e %in% sort(e)[1:3])
    > 7 39 55 99
    """
    from toolbox import network_utilities as gu
    from toolbox import functional_enrichment

    network_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/edge_scores.sif"
    user_entity_id_mapping_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/node_mapping.tsv.genesymbol.single"
    seeds_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/seed_scores.sif"
    network_file_pruned = DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.58"
    network_file_pruned2 = DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.7"
    module_file = DATA_DIR + "module/biana_no_tap-omim/mcl/modules.txt"
    #network_file_permuted = DATA_DIR + "human_interactome_biana/permuted/50/sampled_graph.sif.46"
    ueid_to_gene = get_ues_gene_mapping(user_entity_id_mapping_file)
    # Seeds are the first column of the seed score file
    seeds = set([line.strip().split()[0] for line in open(seeds_file)])
    g = gu.create_network_from_sif_file(network_file, use_edge_data=False)
    g_neighborhood = gu.get_neighborhood_subgraph(g, seeds)
    neighborhood_edges = set(g_neighborhood.edges()) # edge node order may be different for the same edge
    #g_sub_pruned = gu.get_neighborhood_subgraph(g_pruned, seeds)
    #print len(g_sub.nodes()), len(g_sub.edges())
    #print len(g_sub_pruned.nodes()), len(g_sub_pruned.edges())
    g_pruned = gu.create_network_from_sif_file(network_file_pruned, use_edge_data=False)
    g_pruned2 = gu.create_network_from_sif_file(network_file_pruned2, use_edge_data=False)
    # "strong" edges come from the first pruned sample, "weak" edges from the
    # second one; edges shared by both are dropped and the rest is restricted
    # to the seed neighborhood of the original network
    #weak_edges = set(g.edges()) - set(g_pruned.edges())
    strong_edges = set(g_pruned.edges())
    #strong_edges = set(g.edges()) - set(g_pruned2.edges())
    weak_edges = set(g_pruned2.edges())
    common_edges = weak_edges & strong_edges
    weak_edges -= common_edges
    strong_edges -= common_edges
    weak_edges &= neighborhood_edges
    strong_edges &= neighborhood_edges
    #print len(weak_edges), len(strong_edges), len(common_edges)
    g_sub = gu.create_graph()
    #g_sub.add_edges_from(weak_edges | strong_edges)

    #strong_edges = weak_edges # To check differential network from the other side (edges in min but not in max)

    # Only the strong edges are kept; the weak edge set is emptied
    g_sub.add_edges_from(strong_edges) 
    weak_edges = set() 

    go = functional_enrichment.get_go_ontology("/home/emre/arastirma/celldiff/data/GO/gene_ontology.1_2.obo")

    # Run scoring on pruned networks
    # (disabled)
    if False:
	from toolbox import guild_utilities
	data_dir = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/" 
	executable_path = "scoreNetwork/scoreN"
	for network_type in ("pruned_max", "pruned_min"):
	    if network_type == "pruned_max":
		network_file = network_file_pruned
	    elif network_type == "pruned_min":
		network_file = network_file_pruned2
	    else:
		raise ValueError("Unknown network type!")
	    scoring_folder = data_dir + network_type + os.sep
	    # Create input files for scoring
	    guild_utilities.prepare_scoring(network_file, seeds_file, scoring_folder, non_seed_score=0.01, seed_score=1.0, edge_score=1.0, n_sample=1, delim=" ", name=None)
	    # Run GUILD and create output files, the case for Netcombo
	    guild_utilities.run_scoring(scoring_folder, executable_path, scoring_type="netscore", parameters={"n_iteration":2, "n_repetition":3}, qname=None, name=None, calculate_pvalue=False)

    # Get functions of high scoring portions in 3 networks
    # (disabled)
    network_types = ("original", "pruned_max", "pruned_min")
    if False:
	import analyze_results
	association_scores_file_identifier_type = "genesymbol"
	node_mapping_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/node_mapping.tsv"
	node_mapping_file += "."+association_scores_file_identifier_type
	node_scores_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/node_scores.sif"
	for network_type in network_types:
	    if network_type == "original":
		output_scores_file = DATA_DIR + "output_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/ns/r3i2/node_scores.sif"
	    else: # network_type in ("pruned_max", "pruned_min"):
		output_scores_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s/output_scores.sif.netscore" % network_type
	    enrichment_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s.txt" % network_type
	    file_enrichment = open(enrichment_file, 'w')
	    analyze_results.check_functional_enrichment_at_given_cutoff(output_scores_file, node_scores_file, node_mapping_file, "5%", association_scores_file_identifier_type, file_enrichment.write, 0.01, exclude_seeds=False, specie = "H**o sapiens")
	    file_enrichment.close()

    # Get functions of the top scoring portions for all diseases
    # (disabled)
    if False: 
    	phenotype_to_functions = get_go_function_counts() 
    	seed_terms = phenotype_to_functions["omim_breast_cancer"][0]

	all_terms = set()
	common_terms = None
	network_to_terms = {}
	#network_types = network_types[:2]
	for network_type in network_types:
	    enrichment_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s.txt" % network_type
	    go_terms = functional_enrichment.get_functional_enrichment(enrichment_file, go, remove_parents=False, only_biological_processes=True, only_slim=False)
	    network_to_terms[network_type] = set() | go_terms
	    print network_type, len(go_terms)
	    all_terms |= go_terms
	    if common_terms is None:
		common_terms = go_terms
	    else:
		common_terms &= go_terms
	all_terms |= seed_terms
	common_terms &= seed_terms
	#seed_terms = functional_enrichment.remove_parent_terms(seed_terms, go)
	#all_terms = functional_enrichment.remove_parent_terms(all_terms, go)
	#common_terms = functional_enrichment.remove_parent_terms(common_terms, go)
	print len(all_terms), len(common_terms)

	for network_type in network_types:
	    for network_type2 in network_types:
		if network_type == network_type2:
		    continue
		print network_type, network_type2, len(all_terms & network_to_terms[network_type] & network_to_terms[network_type2])

	# One row per GO term: 1/0 membership in the seed terms and in the
	# terms of each network type
	f = open(DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/functional_comparison.dat", 'w')
	f.write("seed terms\t%s\n" % "\t".join(network_types))
	for go_term in all_terms:
	    values = []
	    if go_term in seed_terms: 
		val = 1
	    else:
		val = 0
	    values.append(val)
	    for network_type in network_types:
		val = 0
		if go_term in network_to_terms[network_type]:
		    val= 1
		values.append(val)
	    f.write("%s\t%s\n" % (go.node[go_term]['n'], "\t".join(map(str, values))))
	f.close()

    # Shortest path statistics between seed pairs in the three networks
    # (disabled)
    if False:
	n = float(len(seeds))
	for graph in (g, g_pruned, g_pruned2):
	    count = 0
	    path_length = 0
	    n_path = 0
	    n_pair = 0.0
	    for i, seed1 in enumerate(seeds):
		for j, seed2 in enumerate(seeds):
		    if i<j:
			#count += len(find_all_paths(g, seed1, seed2, []))
			try:
			    paths = all_shortest_paths(graph, seed1, seed2)
			    for path in paths:
				count += 1
				path_length += len(path) - 1
			    n_pair += 1
			    # NOTE(review): uses the last path left over from
			    # the loop above - confirm this is intended
			    n_path += len(path) - 1
			except:
			    continue
	    print n, n_pair, n_path/n_pair, count, count/n, count/n_pair, path_length/float(count)
	# This return belongs to the disabled block above
	return

    # Check seed interaction counts on pruned max
    # (disabled)
    if False:
	output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_seeds.txt"
	f = open(output_file, 'w')
	for seed in seeds:
	    if seed not in ueid_to_gene:
		print seed
		continue
	    f.write("%s\n" % ueid_to_gene[seed])
	f.close()

	for network_type, graph in zip(network_types, [g, g_pruned, g_pruned2]):
	    output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s_seed_interaction_counts.txt" % network_type
	    g_neighborhood = gu.get_neighborhood_subgraph(graph, seeds)
	    f = open(output_file, 'w')
	    nodes = set()
	    for node in g_neighborhood.nodes():
		if node in ueid_to_gene:
		    nodes.add(node)
		    if node in seeds:
			f.write("%s\t%d\n" % (ueid_to_gene[node], g_neighborhood.degree(node)))
	    f.close()
	    #g_neighborhood = g_neighborhood.subgraph(nodes)
	    #output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p0.dot"
	    #gu.create_dot_network_file(g_neighborhood, output_file, seeds, ueid_to_gene, draw_type="all")
	    #os.system("twopi -Tgif -O %s" % output_file)

    # Check modules in pruned max
    # (disabled)
    if False:
	nodes = set()
	for node in g_sub.nodes():
	    if node in ueid_to_gene:
		nodes.add(node)
	g_sub = g_sub.subgraph(nodes)

	output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_strong.dot"
	#gu.create_dot_network_file(g_sub, output_file, seeds, ueid_to_gene, weaks=weaks, draw_type="weak")
	#gu.create_dot_network_file(g_sub_pruned, output_file, seeds, ueid_to_gene, weaks=weaks, draw_type="weak")
	gu.create_dot_network_file(g_sub, output_file, seeds, ueid_to_gene, weak_edges=weak_edges, draw_type="all")
	os.system("twopi -Tgif -O %s" % output_file)
	
	from toolbox import mcl_utilities as mcl
	modules = mcl.get_modules_from_file(module_file)
	
	output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_modularized_strong.nnf"
	f = open(output_file, 'w')

	network_name = "breast_cancer_pruned_p80_"
	module_sets = []
	included_nodes = set()
	included_edges = set()
	# Output modules
	for i, module in enumerate(modules):
	    # Restrict the module to the nodes of the differential network
	    m = set(module) & nodes
	    if len(m) > 0:
		module_sets.append(m)
		f.write("%s M%d_\n" % (network_name, i))
		for u, v in g_sub.edges(m):
		    if u == v:
			continue
		    if u in m and v in m:
			w = "pp"
			if (u,v) in weak_edges or (v,u) in weak_edges:
			    w = "weak"
			elif (u,v) in strong_edges or (v,u) in strong_edges:
			    w = "strong"
			included_nodes.add(u)
			included_nodes.add(v)
			included_edges.add((u,v))
			included_edges.add((v,u))
			u = ueid_to_gene[u]
			v = ueid_to_gene[v]
			f.write("M%d_ %s %s %s\n" % (i, u, w, v))
		for u in m:
		    if u not in included_nodes:
			included_nodes.add(u)
			u = ueid_to_gene[u]
			f.write("M%d_ %s\n" % (i, u))
	# Connect modules
	for i, module1 in enumerate(module_sets):
	    for j, module2 in enumerate(module_sets):
		if i<j:
		    connected_weak = False
		    connected_strong = False
		    for u in module1:
			for v in module2:
			    if (u,v) in strong_edges:
				connected_strong = True
				break
			    if (u,v) in weak_edges:
				connected_weak = True
		    if connected_strong:
			f.write("%s M%d_ %s M%d_\n" % (network_name, i, "strong", j))
		    elif connected_weak:
			f.write("%s M%d_ %s M%d_\n" % (network_name, i, "weak", j))
	# Output the rest
	for u,v in g_sub.edges():
	    if u == v:
		continue
	    if (u,v) in included_edges:
		continue
	    included_nodes.add(u)
	    included_nodes.add(v)
	    w = "pp"
	    if (u,v) in weak_edges or (v,u) in weak_edges:
		w = "weak"
	    elif (u,v) in strong_edges or (v,u) in strong_edges:
		w = "strong"
	    u = ueid_to_gene[u]
	    v = ueid_to_gene[v]
	    f.write("%s %s %s %s\n" % (network_name, u,w,v))
	for node in nodes - included_nodes:
	    f.write("%s %s\n" % (network_name, ueid_to_gene[node]))
	f.close()

	# Same network as a plain sif file (gene symbols, edge type in between)
	output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_strong.sif"
	f = open(output_file, 'w')
	included_nodes = set()
	for u,v in g_sub.edges():
	    if u == v:
		continue
	    included_nodes.add(u)
	    included_nodes.add(v)
	    w = "pp"
	    if (u,v) in weak_edges or (v,u) in weak_edges:
		w = "weak"
	    elif (u,v) in strong_edges or (v,u) in strong_edges:
		w = "strong"
	    u = ueid_to_gene[u]
	    v = ueid_to_gene[v]
	    f.write("%s %s %s\n" % (u,w,v))
	for node in nodes - included_nodes:
	    f.write("%s\n" % ueid_to_gene[node])
	f.close()

	# Write the genes of each module and check their functional enrichment
	for i, module in enumerate(module_sets):
	    output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/M%d_.txt" % i
	    #g_sub_sub = g_sub.subgraph(module) # For checking the functions enriched in the largest connected component of the module
	    #module = gu.get_connected_components(g_sub_sub, return_as_graph_list=False)[0] 
	    f = open(output_file, 'w')
	    for node in module:
		f.write("%s\n" % ueid_to_gene[node])
	    f.close()
	    functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file, output_file+".funcassoc")

    # Print the enriched GO terms of the first four modules as R vectors
    # (disabled)
    if False:
	for i in range(4):
	    enrichment_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/" + "%s/M%d_.txt.funcassoc" % ("strong/func-all", i) # strong/func-all strong weak
	    go_terms = functional_enrichment.get_functional_enrichment(enrichment_file, go, remove_parents=False, only_biological_processes=True)
	    print "m%d<-c(\"%s\")" % (i, "\", \"".join(go_terms))
    return
Example #11
0
def case_study_pruned_networks():
    """
    Case study on the differential network of the pruned omim_breast_cancer runs.

    Steps:
    - load the node to gene mapping, the seeds and the original network,
    - get the differential network via get_differential_network,
    - write the genes of the original network to network_genes.txt and read
      the GO terms from the existing network_genes.txt.funcassoc file,
    - for every connected component of the differential network with at least
      10 nodes: draw it (dot / sif / fdp), write its genes, run the functional
      enrichment and print the overlap of its GO terms with the seed GO terms
      together with a hypergeometric p-value; for the component with index 0
      the same is additionally done separately for seed and non-seed genes,
    - draw the whole differential network,
    - write functional_comparison.dat: for every seed GO term whether it is
      enriched among the seeds / non-seeds of component 0.

    If component 0 is skipped (fewer than 10 nodes) its seed / non-seed GO
    term sets are treated as empty in functional_comparison.dat.

    Returns None.
    """
    from toolbox import network_utilities as gu
    from toolbox import functional_enrichment
    from scipy.stats import hypergeom

    input_dir = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/"
    output_dir = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/"
    network_file = input_dir + "edge_scores.sif"
    user_entity_id_mapping_file = input_dir + "node_mapping.tsv.genesymbol.single"
    seeds_file = input_dir + "omim_breast_cancer/seed_scores.sif"
    auc_file = output_dir + "breast_cancer_pruned_p80.txt"

    # Get node mapping
    ueid_to_gene = get_ues_gene_mapping(user_entity_id_mapping_file)
    # Get seeds (first column of the seed score file)
    with open(seeds_file) as f:
        seeds = set([line.strip().split()[0] for line in f])
    # Get neighborhood in the original network
    g_org = gu.create_network_from_sif_file(network_file, use_edge_data=False)
    # NOTE(review): the neighborhood graph is not used below; the call is kept
    # because its side effects cannot be judged from here - confirm and remove
    g_neighborhood = gu.get_neighborhood_subgraph(g_org, seeds)

    critical_auc = 0.634
    g_sub = get_differential_network(g_org, ueid_to_gene, auc_file, critical_auc)

    # Get seed GOs to check their coverage in top connected component
    phenotype_to_functions = get_go_function_counts()
    seed_terms = phenotype_to_functions["omim_breast_cancer"][0]

    go = functional_enrichment.get_go_ontology("/home/emre/arastirma/celldiff/data/GO/gene_ontology.1_2.obo")

    # Get network genes
    network_genes = set()
    for node in g_org.nodes():
        if node in ueid_to_gene:
            network_genes.add(ueid_to_gene[node])

    def write_enrichment(genes, funcassoc_file):
        # The result file is closed before returning since it is parsed
        # right afterwards
        with open(funcassoc_file, 'w') as out:
            functional_enrichment.check_functional_enrichment(list(genes), list(network_genes), "genesymbol", out.write)

    def read_enrichment(funcassoc_file):
        return functional_enrichment.get_functional_enrichment(funcassoc_file, go, remove_parents=False, only_biological_processes=True)

    def report_seed_term_coverage(terms):
        # Overlap with the seed GO terms and the hypergeometric probability
        # of observing at least that many common terms
        n_common = len(seed_terms & terms)
        print("%s %s %s %s" % (len(seed_terms), len(terms), n_common, n_common / float(len(seed_terms))))
        print("p_value: %s" % sum(hypergeom.pmf(range(n_common, len(terms)+1), len(network_go_terms), len(seed_terms), len(terms))))

    # Get all functions enriched in the network
    output_file = output_dir + "network_genes.txt"
    with open(output_file, 'w') as f:
        for gene in network_genes:
            f.write("%s\n" % gene)
    # The .funcassoc file of the network genes is expected to exist already
    #functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file, output_file+".funcassoc")
    network_go_terms = read_enrichment(output_file + ".funcassoc")
    # NOTE(review): 23928 is a hard coded number printed next to the GO term
    # count - its meaning is not visible here, confirm
    print("%s %s" % (23928, len(network_go_terms)))

    # Defaults in case the component with index 0 is skipped below
    go_terms_seeds = set()
    go_terms_nonseeds = set()

    # Check the functions enriched in the largest connected component of each module
    for i, module in enumerate(gu.get_connected_components(g_sub, return_as_graph_list=False)):
        if len(module) < 10:
            continue
        output_file = output_dir + "M%d" % i

        if i == 0:
            # Seed and non-seed genes of the component are analyzed separately
            module_seed_genes = set()
            module_nonseed_genes = set()
            with open(output_file + ".txt.seeds", 'w') as f:
                with open(output_file + ".txt.nonseeds", 'w') as f2:
                    for node in module:
                        if node in seeds:
                            f.write("%s\n" % ueid_to_gene[node])
                            module_seed_genes.add(ueid_to_gene[node])
                        else:
                            f2.write("%s\n" % ueid_to_gene[node])
                            module_nonseed_genes.add(ueid_to_gene[node])
            write_enrichment(module_seed_genes, output_file + ".txt.seeds.funcassoc")
            write_enrichment(module_nonseed_genes, output_file + ".txt.nonseeds.funcassoc")
            go_terms_seeds = read_enrichment(output_file + ".txt.seeds.funcassoc")
            go_terms_nonseeds = read_enrichment(output_file + ".txt.nonseeds.funcassoc")
            report_seed_term_coverage(go_terms_nonseeds)

        # Draw diff network component
        weak_edges = set()
        g_sub_sub = g_sub.subgraph(module)
        gu.create_dot_network_file(g_sub_sub, output_file + ".dot", seeds, ueid_to_gene, weak_edges=weak_edges, draw_type="all")
        gu.output_network_in_sif(g_sub_sub, output_file + ".sif", ueid_to_gene, delim = " ", include_unconnected=True)
        os.system("fdp -Tgif -O %s" % (output_file + ".dot"))
        # Get functions
        module_genes = set()
        with open(output_file + ".txt", 'w') as f:
            for node in module:
                f.write("%s\n" % ueid_to_gene[node])
                module_genes.add(ueid_to_gene[node])
        write_enrichment(module_genes, output_file + ".txt.funcassoc")
        go_terms = read_enrichment(output_file + ".txt.funcassoc")
        report_seed_term_coverage(go_terms)

    weak_edges = set()
    # Draw diff neighborhood network
    output_file = output_dir + "breast_cancer_pruned_p80_diff"
    gu.create_dot_network_file(g_sub, output_file + ".dot", seeds, ueid_to_gene, weak_edges=weak_edges, draw_type="all")
    gu.output_network_in_sif(g_sub, output_file + ".sif", ueid_to_gene, delim = " ", include_unconnected=True)
    os.system("fdp -Tgif -O %s" % (output_file + ".dot"))

    all_terms = seed_terms #(go_terms_seeds | seed_terms | go_terms_nonseeds)
    file_name = output_dir + "functional_comparison"
    functional_enrichment.output_go_terms_and_levels(all_terms, go, file_name+"_goids.dat")
    term_list = [go_terms_seeds, go_terms_nonseeds] #[seed_terms, go_terms_seeds, go_terms_nonseeds]
    with open(file_name+".dat", 'w') as f:
        f.write("GO id\tGO term\tModule seeds\tModule non-seeds\n")
        for go_term in all_terms:
            # 1 if the term is enriched in the corresponding gene set, else 0
            values = [1 if go_term in terms else 0 for terms in term_list]
            f.write("%s\t%s\t%s\n" % (go_term, go.node[go_term]['n'], "\t".join(map(str, values))))
    return
Example #12
0
def get_differential_network_using_all_networks(g_org, ueid_to_gene, auc_file, critical_auc):
    """
    Build the network of edges enriched in the high-AUC sampled networks.

    The sampled networks are split into a "max" group (AUC >= critical_auc)
    and a "min" group (all the others). For every non-self edge of g_org, the
    presence (1) / absence (0) of the edge in each sampled network is collected
    per group and the two groups are compared with a t-test. An edge is kept
    when p-value <= 0.05 and the test statistic is positive (the "max" group
    is the first sample given to the test).

    g_org: graph whose edges are tested
    ueid_to_gene: container of node ids; only nodes found in it are kept in the result
    auc_file: text file with one line per sampled network, the AUC being the second
              whitespace-separated column; line i (0-based) is matched with the
              file sampled_graph.sif.(i+1)
    critical_auc: AUC threshold separating the "max" group from the "min" group

    Returns the subgraph of the differential network induced by the nodes
    that are present in ueid_to_gene.
    """
    from toolbox import network_utilities as gu
    from statsmodels.stats.weightstats import ttest_ind

    def load_sampled_networks(indices):
        # Sampled graph files are numbered from 1 whereas AUC lines are 0-based
        networks = []
        for i in indices:
            network_file_pruned = DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.%d" % (i + 1)
            networks.append(gu.create_network_from_sif_file(network_file_pruned, use_edge_data=False))
        return networks

    # Get indices of min/max networks (the file is closed as soon as it is read)
    with open(auc_file) as auc_handle:
        aucs = [float(line.split()[1]) for line in auc_handle]
    indices_max = []
    indices_min = []
    for i, auc in enumerate(aucs):
        if auc >= critical_auc:
            indices_max.append(i)
        else:
            indices_min.append(i)

    # Get max / min neighborhood networks
    g_maxs = load_sampled_networks(indices_max)
    g_mins = load_sampled_networks(indices_min)

    # Single-argument form prints the same text under Python 2 and Python 3
    print("%d %d" % (len(g_maxs), len(g_mins)))

    g_diff = networkx.Graph()
    for u, v in g_org.edges():
        if u == v:
            # Self loops are not tested
            continue
        values_max = [1 if g.has_edge(u, v) else 0 for g in g_maxs]
        values_min = [1 if g.has_edge(u, v) else 0 for g in g_mins]
        # NOTE(review): recent statsmodels releases document only "pooled" and
        # "unequal" for usevar -- confirm "separate" is accepted by the
        # installed version before upgrading.
        stat, pval = ttest_ind(values_max, values_min, usevar="separate")[:2]
        # Keep edges that are significantly more frequent in the "max" group
        if pval <= 0.05 and stat > 0:
            g_diff.add_edge(u, v)

    # Restrict the result to the nodes that have an entry in ueid_to_gene
    nodes = set(node for node in g_diff.nodes() if node in ueid_to_gene)
    g_sub = g_diff.subgraph(nodes)
    return g_sub
Example #13
0
def prepare_scoring(network_file,
                    seed_file,
                    scoring_folder="./",
                    non_seed_score=0.01,
                    seed_score=1.0,
                    edge_score=1.0,
                    n_sample=100,
                    delim=" ",
                    name=None):
    """
    Generate the input files consumed by the GUILD executable.

    Paths are built by plain string concatenation, so scoring_folder is
    expected to end with a path separator (e.g., "./test/").

    network_file: network in sif-like format, the edge type being the edge score (e.g., "A 0.5 B" or "A pp B")
    seed_file: text file listing seeds, optionally with their scores (e.g., "A 0.1" or "A")
    scoring_folder: directory in which the input/output files are created ("./", by default)
    non_seed_score: initial score given to non-seed nodes (0.01, by default)
    seed_score: initial score given to seeds when seed_file carries no scores (1.0, by default)
    edge_score: fallback edge weight handed to the edge score writer (1.0, by default)
    n_sample: number of random graphs generated for netzcore (100, by default)
    delim: column delimiter of the input/output files (" ", by default)
    name: optional phenotype name; node/seed/background/netshort files go under this
          sub-directory of scoring_folder (for multiple phenotype analysis)
    """
    if not os.path.exists(scoring_folder):
        os.mkdir(scoring_folder)
    if name is None:
        name = ""
    else:
        phenotype_dir = scoring_folder + name
        if not os.path.exists(phenotype_dir):
            os.mkdir(phenotype_dir)
        name += os.sep
    # Prefix of all phenotype specific files (equals scoring_folder when no name is given)
    phenotype_prefix = scoring_folder + name

    # Edge scores: the network file itself provides the edges and their data
    print("Creating edge score file")
    edge_score_file = scoring_folder + "edge_scores.sif"
    if os.path.exists(edge_score_file):
        print("\tEdge score file exists, overwriting!")
    nodes, edges, _, edge_to_data = network_utilities.get_nodes_and_edges_from_sif_file(
        network_file, store_edge_type=True, delim=delim, data_to_float=False)
    edge_to_weight = create_edge_score_file(edge_score_file, edges,
                                            edge_to_data, edge_score, delim)

    # Node scores: seeds read from the seed file, remaining nodes handled by the writer
    print("Creating node score file")
    seeds, _, seed_to_data, _ = network_utilities.get_nodes_and_edges_from_sif_file(
        seed_file, store_edge_type=False, delim=delim, data_to_float=True)
    if seed_to_data is None:
        # Seed file has no score column, fall back to the default seed score
        seed_to_data = {}
        for seed in seeds:
            seed_to_data[seed] = seed_score
    node_to_data = create_node_score_file(phenotype_prefix + "node_scores.sif",
                                          phenotype_prefix + "seed_scores.sif",
                                          nodes, seeds, seed_to_data,
                                          non_seed_score, seed_score, delim)

    # Background node scores
    print("Creating background node score file")
    create_background_score_file(phenotype_prefix + "node_scores_background.sif",
                                 phenotype_prefix + "seed_scores_background.sif",
                                 nodes, seeds, seed_to_data, non_seed_score,
                                 delim)

    # Edge scores derived from node scores, used by netshort
    print("Creating node score converted edge file (for netshort)")
    create_node_score_converted_edge_score_file(
        phenotype_prefix + "edge_scores_netshort.sif", edges, edge_to_weight,
        node_to_data, delim)

    # Random networks used by netzcore, written next to (one level above) scoring_folder
    print("Creating random networks (for netzcore)")
    sampling_prefix = scoring_folder + "../" + "sampled_graph."
    if os.path.exists(sampling_prefix + "%s" % n_sample):
        # The last sampled graph is already there, nothing to generate
        print("\tSampled networks exists, skipping this step!")
        return
    g = network_utilities.create_network_from_sif_file(
        network_file_in_sif=edge_score_file,
        use_edge_data=True,
        delim=delim)
    for i in xrange(1, n_sample + 1):
        g_sampled = network_utilities.randomize_graph(
            graph=g,
            randomization_type="preserve_topology_and_node_degree")
        network_utilities.output_network_in_sif(g_sampled,
                                                sampling_prefix + "%s" % i)
    return
def main_pree():
    """
    Command line entry point for proximity calculation.

    Two modes are supported:
    - batch mode, selected with -p/--parameter_file_prefix: the prefix and the
      -i / -j indices are handed over to calculate_proximity_multiple
    - single run mode (default): the network given with -e is loaded and the
      proximity between the comma separated node lists given with -s and -t is
      calculated; when a result is available, "z d mean sd" is written to the
      file given with -o

    In single run mode -e, -s, -t and -o are mandatory; a usage error is
    reported (exit status 2) before any computation when one of them is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--network_file')  #, required=True)
    parser.add_argument('-s', '--nodes_from')  #, required=True)
    parser.add_argument('-t', '--nodes_to')  #, required=True)
    parser.add_argument('-o', '--out_file')  #, required=True)
    parser.add_argument('-n', '--n_random', type=int, default=1000)
    parser.add_argument('-m', '--min_bin_size', type=int, default=100)
    parser.add_argument('-x', '--n_seed', type=int, default=452456)
    parser.add_argument('-f', '--parameter_file', type=str, default=None)
    parser.add_argument('-p',
                        '--parameter_file_prefix',
                        type=str,
                        default=None)
    parser.add_argument('-i',
                        '--parameter_file_start_index',
                        type=int,
                        default=None)
    parser.add_argument('-j',
                        '--parameter_file_end_index',
                        type=int,
                        default=None)
    args = parser.parse_args()

    # Run more than once for given input files
    if args.parameter_file_prefix is not None:
        calculate_proximity_multiple(args.parameter_file_prefix,
                                     args.parameter_file_start_index,
                                     args.parameter_file_end_index)
        return

    # Run once with provided arguments. These options can not be declared
    # required=True on the parser since batch mode does not use them, so they
    # are checked here before the costly network loading / randomization.
    single_run_options = (
        ("-e/--network_file", args.network_file),
        ("-s/--nodes_from", args.nodes_from),
        ("-t/--nodes_to", args.nodes_to),
        ("-o/--out_file", args.out_file),
    )
    missing = [flag for flag, value in single_run_options if value is None]
    if missing:
        parser.error("missing argument(s) for a single run: %s" %
                     ", ".join(missing))

    nodes_from = args.nodes_from.split(",")
    nodes_to = args.nodes_to.split(",")
    network_file = args.network_file
    n_random = args.n_random
    min_bin_size = args.min_bin_size
    n_seed = args.n_seed
    out_file = args.out_file
    network = network_util.create_network_from_sif_file(
        network_file,
        use_edge_data=False,
        delim=None,
        include_unconnected=True)
    print(network_file, nodes_from, nodes_to, n_random, min_bin_size, n_seed,
          out_file)
    values = wrappers.calculate_proximity(network,
                                          nodes_from=nodes_from,
                                          nodes_to=nodes_to,
                                          n_random=n_random,
                                          min_bin_size=min_bin_size,
                                          seed=n_seed)
    if values is not None:  # None: not in network
        d, z, (m, s) = values
        # Context manager guarantees the result is flushed and the file closed
        with open(out_file, 'w') as out:
            out.write("%f %f %f %f\n" % (z, d, m, s))
    return
def main():
    """
    Command line entry point running proximity calculation for multiple inputs.

    Loads the network given with -e and hands it over to
    wrappers.calculate_proximity_multiple together with the values of
    -s (from_file), -t (to_file), -d (disease_mode) and -o (out_file), e.g.:

        -e interactome.sif -s drug_targets.txt -t disease_genes.tsv -d GBM -o proximity.tsv

    -e is mandatory; a usage error is reported (exit status 2) when it is
    missing. The remaining values are forwarded as given, their validation is
    left to the wrapper. The options -n, -m, -x, -f, -p, -i and -j are still
    accepted for command line compatibility but are not used here; the single
    run / parameter file work flow they belong to is in main_pree.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--network_file')  #, required=True)
    parser.add_argument('-s', '--nodes_from')  #, required=True)
    parser.add_argument('-t', '--nodes_to')  #, required=True)
    parser.add_argument('-d', '--disease_mode')  # , required=True)
    parser.add_argument('-o', '--out_file')  #, required=True)
    parser.add_argument('-n', '--n_random', type=int, default=1000)
    parser.add_argument('-m', '--min_bin_size', type=int, default=100)
    parser.add_argument('-x', '--n_seed', type=int, default=452456)
    parser.add_argument('-f', '--parameter_file', type=str, default=None)
    parser.add_argument('-p',
                        '--parameter_file_prefix',
                        type=str,
                        default=None)
    parser.add_argument('-i',
                        '--parameter_file_start_index',
                        type=int,
                        default=None)
    parser.add_argument('-j',
                        '--parameter_file_end_index',
                        type=int,
                        default=None)
    args = parser.parse_args()

    # Fail with a usage message instead of passing None to the network loader
    if args.network_file is None:
        parser.error("missing argument: -e/--network_file")

    network = network_util.create_network_from_sif_file(
        args.network_file,
        use_edge_data=False,
        delim=None,
        include_unconnected=True)
    wrappers.calculate_proximity_multiple(network,
                                          from_file=args.nodes_from,
                                          to_file=args.nodes_to,
                                          disease_mode=args.disease_mode,
                                          out_file=args.out_file)
    return