Example #1
def network_layout(gmt_fn, outfn=None):
	## make a Graph object and write it to GML so Gephi can do the layout

	d_gmt = read_gmt(gmt_fn)
	d_gmt_filt = {}
	for term, genes in d_gmt.items():
		if len(genes) >= 5:
			d_gmt_filt[term] = genes
	d_gmt = d_gmt_filt

	print 'number of terms:', len(d_gmt)
	umls_ids_kept = d_gmt.keys()
	adj_matrix = jaccard_matrix(d_gmt)

	m = adj_matrix > 0.2
	# degrees = adj_matrix.sum(axis=0)
	adj_matrix = adj_matrix * m.astype(int)
	
	G = nx.from_numpy_matrix(adj_matrix)

	print 'G: ', G.number_of_edges(), G.number_of_nodes()

	for i in range(adj_matrix.shape[0]):
		# G.node[i]['size'] = degrees[i]
		# G.node[i]['size'] = len(d_gmt[umls_ids_kept[i]])
		G.node[i]['size'] = G.degree(i)
		G.node[i]['id'] = umls_ids_kept[i]

	if outfn is not None:	
		nx.write_gml(G, outfn)
	return G
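
The helpers read_gmt and jaccard_matrix are not included in these snippets. Below is a minimal sketch of what they might look like, assuming the standard GMT format (term, description, then genes, all tab-separated) and a dense pairwise Jaccard similarity matrix whose row/column order follows the key order of d_gmt (the examples rely on that order matching d_gmt.keys()):

import numpy as np

def read_gmt(gmt_fn):
    # Hypothetical re-implementation: parse a GMT file
    # (term <tab> description <tab> gene1 <tab> gene2 ...) into {term: set(genes)}.
    d_gmt = {}
    with open(gmt_fn) as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            d_gmt[fields[0]] = set(fields[2:])
    return d_gmt

def jaccard_matrix(d_gmt):
    # Pairwise Jaccard similarity between gene sets, in the key order of d_gmt.
    terms = list(d_gmt.keys())
    n = len(terms)
    mat = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            a, b = d_gmt[terms[i]], d_gmt[terms[j]]
            jac = len(a & b) / float(len(a | b)) if (a | b) else 0.0
            mat[i, j] = mat[j, i] = jac
    return mat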
Example #2
def network_layout(gmt_fn, outfn=None):
    ## make a Graph object and write it to GML so Gephi can do the layout

    d_gmt = read_gmt(gmt_fn)
    d_gmt_filt = {}
    for term, genes in d_gmt.items():
        if len(genes) >= 5:
            d_gmt_filt[term] = genes
    d_gmt = d_gmt_filt

    print 'number of terms:', len(d_gmt)
    umls_ids_kept = d_gmt.keys()
    adj_matrix = jaccard_matrix(d_gmt)

    m = adj_matrix > 0.2
    # degrees = adj_matrix.sum(axis=0)
    adj_matrix = adj_matrix * m.astype(int)

    G = nx.from_numpy_matrix(adj_matrix)

    print 'G: ', G.number_of_edges(), G.number_of_nodes()

    for i in range(adj_matrix.shape[0]):
        # G.node[i]['size'] = degrees[i]
        # G.node[i]['size'] = len(d_gmt[umls_ids_kept[i]])
        G.node[i]['size'] = G.degree(i)
        G.node[i]['id'] = umls_ids_kept[i]

    if outfn is not None:
        nx.write_gml(G, outfn)
    return G
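
One detail to be aware of in both versions above: a Jaccard matrix presumably has 1.0 on its diagonal, the diagonal survives the > 0.2 threshold, and nx.from_numpy_matrix then adds a self-loop to every node, which inflates each degree-based 'size' by 2. A small self-contained sketch of the effect, plus a zero-diagonal adjustment (an addition, not part of the original code):

import numpy as np
import networkx as nx

# Tiny synthetic similarity matrix with 1.0 on the diagonal, as a Jaccard
# matrix would have. An undirected self-loop adds 2 to a node's degree.
adj = np.array([[1.0, 0.5], [0.5, 1.0]])
print nx.from_numpy_matrix(adj).degree(0)   # 3: one real edge plus the self-loop
np.fill_diagonal(adj, 0)                    # drop self-similarity before building the graph
print nx.from_numpy_matrix(adj).degree(0)   # 1: only the real edge remains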
Example #3
def make_directed_json_graph_soc(gmt_fn, d_id_name, d_id_category, d_category_color, outfn=None):
	# make a directed graph based on the SOC (System Organ Class) -> PT (Preferred Term) hierarchy
	d_gmt = read_gmt(gmt_fn)
	d_gmt_filt = {}
	for term, genes in d_gmt.items():
		if len(genes) >= 5:
			d_gmt_filt[term] = genes
	d_gmt = d_gmt_filt

	print 'number of terms:', len(d_gmt)
	umls_ids_kept = d_gmt.keys()
	adj_matrix = jaccard_matrix(d_gmt)
	m = adj_matrix > 0.2
	adj_matrix = adj_matrix * m.astype(int)
	Gu = nx.from_numpy_matrix(adj_matrix) # undirected Graph, to get size
	G = nx.DiGraph()
	for i in range(len(umls_ids_kept)):
		umls_id = umls_ids_kept[i]
		name = d_id_name[umls_id]
		category = d_id_category[umls_id]
		color = d_category_color[category]

		G.add_edge('root', category)
		G.add_edge(category, umls_id)

		G.node[umls_id]['size'] = Gu.degree(i)
		G.node[umls_id]['label'] = name
		G.node[umls_id]['color'] = color
	print G.number_of_nodes(), G.number_of_edges()
	graph_data = json_graph.tree_data(G, root='root')
	json.dump(graph_data, open(outfn, 'wb'))
	return
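
A hedged usage sketch for make_directed_json_graph_soc; the lookup dicts, UMLS ids, colors, and file names below are illustrative placeholders, not values taken from the original project:

# Hypothetical inputs: UMLS id -> preferred term, UMLS id -> SOC category,
# and one color per SOC category.
d_id_name = {'C0018681': 'Headache', 'C0027497': 'Nausea'}
d_id_category = {'C0018681': 'Nervous system disorders',
                 'C0027497': 'Gastrointestinal disorders'}
d_category_color = {'Nervous system disorders': '#1f77b4',
                    'Gastrointestinal disorders': '#ff7f0e'}

make_directed_json_graph_soc('side_effects.gmt', d_id_name, d_id_category,
                             d_category_color, outfn='se_tree.json')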
Example #4
def make_directed_json_graph(gmt_fn,
                             d_id_name,
                             d_id_category,
                             d_category_color,
                             outfn=None):
    # perform hierarchical clustering (HC), build a directed graph, and write it
    # to JSON for a pack visualization
    d_gmt = read_gmt(gmt_fn)
    d_gmt_filt = {}
    for term, genes in d_gmt.items():
        if len(genes) >= 5:
            d_gmt_filt[term] = genes
    d_gmt = d_gmt_filt

    print 'number of terms:', len(d_gmt)
    umls_ids_kept = d_gmt.keys()
    adj_matrix = jaccard_matrix(d_gmt)

    hc = AgglomerativeClustering(n_clusters=10)
    hc.fit(adj_matrix)

    m = adj_matrix > 0.2
    adj_matrix = adj_matrix * m.astype(int)
    Gu = nx.from_numpy_matrix(adj_matrix)  # undirected Graph, to get size

    G = nx.DiGraph()
    print adj_matrix.shape, len(umls_ids_kept)
    for i in range(adj_matrix.shape[0]):
        cluster_label = hc.labels_[i]
        umls_id = umls_ids_kept[i]
        name = d_id_name[umls_id]
        G.add_edge('root', cluster_label)
        G.add_edge(cluster_label, umls_id)
        G.node[umls_id]['size'] = Gu.degree(i)
        G.node[umls_id]['label'] = name

        category = d_id_category[umls_id]
        color = d_category_color[category]
        G.node[umls_id]['color'] = color
    print G.number_of_nodes(), G.number_of_edges()
    graph_data = json_graph.tree_data(G, root='root')
    json.dump(graph_data, open(outfn, 'wb'))
    return
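
Note that AgglomerativeClustering is fit directly on the Jaccard similarity matrix, so terms are clustered by their rows of similarities under scikit-learn's default Euclidean/Ward settings. A hedged alternative sketch (an assumption, not what the original code does) would cluster on Jaccard distance instead:

from sklearn.cluster import AgglomerativeClustering

# 1 - similarity as a precomputed distance matrix (adj_matrix as returned by
# jaccard_matrix, before the > 0.2 thresholding). 'ward' linkage does not
# accept precomputed distances, so use 'average'. In scikit-learn >= 1.2 the
# keyword 'affinity' is renamed to 'metric'.
dist_matrix = 1.0 - adj_matrix
hc = AgglomerativeClustering(n_clusters=10, affinity='precomputed',
                             linkage='average')
labels = hc.fit_predict(dist_matrix)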
Example #5
def make_directed_json_graph(gmt_fn, d_id_name, d_id_category, d_category_color, outfn=None):
	# perform hierarchical clustering (HC), build a directed graph, and write it
	# to JSON for a pack visualization
	d_gmt = read_gmt(gmt_fn)
	d_gmt_filt = {}
	for term, genes in d_gmt.items():
		if len(genes) >= 5:
			d_gmt_filt[term] = genes
	d_gmt = d_gmt_filt

	print 'number of terms:', len(d_gmt)
	umls_ids_kept = d_gmt.keys()
	adj_matrix = jaccard_matrix(d_gmt)

	hc = AgglomerativeClustering(n_clusters=10)
	hc.fit(adj_matrix)

	m = adj_matrix > 0.2
	adj_matrix = adj_matrix * m.astype(int)
	Gu = nx.from_numpy_matrix(adj_matrix) # undirected Graph, to get size

	G = nx.DiGraph()
	print adj_matrix.shape, len(umls_ids_kept)
	for i in range(adj_matrix.shape[0]):
		cluster_label = hc.labels_[i]
		umls_id = umls_ids_kept[i]
		name = d_id_name[umls_id]
		G.add_edge('root', cluster_label)
		G.add_edge(cluster_label, umls_id)
		G.node[umls_id]['size'] = Gu.degree(i)
		G.node[umls_id]['label'] = name

		category = d_id_category[umls_id]
		color = d_category_color[category]
		G.node[umls_id]['color'] = color
	print G.number_of_nodes(), G.number_of_edges()
	graph_data = json_graph.tree_data(G, root='root')
	json.dump(graph_data, open(outfn, 'wb'))
	return
Example #6
def make_directed_json_graph_soc(gmt_fn,
                                 d_id_name,
                                 d_id_category,
                                 d_category_color,
                                 outfn=None):
    # make a directed graph based on the SOC (System Organ Class) -> PT (Preferred Term) hierarchy
    d_gmt = read_gmt(gmt_fn)
    d_gmt_filt = {}
    for term, genes in d_gmt.items():
        if len(genes) >= 5:
            d_gmt_filt[term] = genes
    d_gmt = d_gmt_filt

    print 'number of terms:', len(d_gmt)
    umls_ids_kept = d_gmt.keys()
    adj_matrix = jaccard_matrix(d_gmt)
    m = adj_matrix > 0.2
    adj_matrix = adj_matrix * m.astype(int)
    Gu = nx.from_numpy_matrix(adj_matrix)  # undirected Graph, to get size
    G = nx.DiGraph()
    for i in range(len(umls_ids_kept)):
        umls_id = umls_ids_kept[i]
        name = d_id_name[umls_id]
        category = d_id_category[umls_id]
        color = d_category_color[category]

        G.add_edge('root', category)
        G.add_edge(category, umls_id)

        G.node[umls_id]['size'] = Gu.degree(i)
        G.node[umls_id]['label'] = name
        G.node[umls_id]['color'] = color
    print G.number_of_nodes(), G.number_of_edges()
    graph_data = json_graph.tree_data(G, root='root')
    json.dump(graph_data, open(outfn, 'wb'))
    return
Example #7
## for side effects
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only.gmt'
GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/ET100_GOtCS_AUC_0.76_proba_0.75.gmt' 
GML_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/side_effect_network.gml'
## for drugs
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only_flipped.gmt'
# GML_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/drug_network.gml'

CSV_FN = GML_FN.replace('.gml', '.csv')
JSON_FN = CSV_FN.replace('.csv', '.json')

## retrieve metadata about side effects (SE)
d_umls_pt = mysqlTable2dict('sep', 'side_effects', 1, 2)
d_pt_umls = mysqlTable2dict('sep', 'side_effects', 2, 1)
d_soc_pt = read_gmt(HOME+'/Documents/bitbucket/pertid2trainingset/Y_matrix_no_mfc/SOC_to_pt.gmt')
print 'number of SOCs:', len(d_soc_pt)
d_umls_soc = {}
for soc, pts in d_soc_pt.items():
	for pt in pts:
		umls = d_pt_umls[pt]
		if umls is not None:
			if umls not in d_umls_soc:
				d_umls_soc[umls] = soc
			# else:
			# 	d_umls_soc[umls].append(soc) # one PT may have multiple SOCs

# for pt in d_umls_soc:
# 	if len(d_umls_soc[pt]) != 1:
# 		print pt, d_umls_soc[pt]
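
A hedged sketch of how this setup block might feed the functions from the earlier examples; the color assignment and the argument pairing below are assumptions, since neither appears in these snippets:

# A small illustrative palette, cycled across SOC categories (the original
# color scheme is not shown in these examples).
palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
           '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
socs = sorted(set(d_umls_soc.values()))
d_soc_color = dict((soc, palette[i % len(palette)]) for i, soc in enumerate(socs))

# Gephi layout input, then the SOC-based tree JSON (argument pairing assumed).
network_layout(GMT_FN, outfn=GML_FN)
make_directed_json_graph_soc(GMT_FN, d_umls_pt, d_umls_soc, d_soc_color, outfn=JSON_FN)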
Example #8
## for side effects
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only.gmt'
GMT_FN = HOME + '/Documents/Zichen_Projects/drug_se_prediction/ET100_GOtCS_AUC_0.76_proba_0.75.gmt'
GML_FN = HOME + '/Documents/Zichen_Projects/drug_se_prediction/side_effect_network.gml'
## for drugs
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only_flipped.gmt'
# GML_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/drug_network.gml'

CSV_FN = GML_FN.replace('.gml', '.csv')
JSON_FN = CSV_FN.replace('.csv', '.json')

## retrieve metadata about side effects (SE)
d_umls_pt = mysqlTable2dict('sep', 'side_effects', 1, 2)
d_pt_umls = mysqlTable2dict('sep', 'side_effects', 2, 1)
d_soc_pt = read_gmt(
    HOME +
    '/Documents/bitbucket/pertid2trainingset/Y_matrix_no_mfc/SOC_to_pt.gmt')
print 'number of SOCs:', len(d_soc_pt)
d_umls_soc = {}
for soc, pts in d_soc_pt.items():
    for pt in pts:
        umls = d_pt_umls[pt]
        if umls is not None:
            if umls not in d_umls_soc:
                d_umls_soc[umls] = soc
            # else:
            # 	d_umls_soc[umls].append(soc) # one PT may have multiple SOCs

# for pt in d_umls_soc:
# 	if len(d_umls_soc[pt]) != 1:
# 		print pt, d_umls_soc[pt]