def network_layout(gmt_fn, outfn=None):
    """Build a thresholded Jaccard-similarity graph of the terms in a GMT file.

    Terms with fewer than 5 genes are dropped, pairwise similarities are
    computed with ``jaccard_matrix`` and every entry that is not strictly
    greater than 0.2 is zeroed before the matrix is handed to networkx.
    Each node ``i`` gets a ``size`` attribute (its degree in the thresholded
    graph) and an ``id`` attribute (the i-th term key of the filtered GMT).

    Args:
        gmt_fn: path of the GMT file, passed to ``read_gmt``.
        outfn: optional output path; when it is not None the graph is
            written there as GML (so that Gephi can do the layout).

    Returns:
        The networkx graph built from the thresholded matrix.
    """
    # Keep only the terms that have at least 5 genes.
    d_gmt = {term: genes
             for term, genes in read_gmt(gmt_fn).items()
             if len(genes) >= 5}
    # Single-argument print() produces the same output on Python 2 and 3.
    print('number of terms: %d' % len(d_gmt))

    # list(d) has the same order as d.keys() and is indexable on Python 3.
    # Assumes jaccard_matrix orders rows/columns like the dict's keys
    # -- TODO confirm against jaccard_matrix.
    umls_ids_kept = list(d_gmt)

    adj_matrix = jaccard_matrix(d_gmt)
    keep = adj_matrix > 0.2
    # Assumes adj_matrix is an ndarray so that `*` is element-wise
    # (it would be a matrix product for np.matrix) -- TODO confirm.
    adj_matrix = adj_matrix * keep.astype(int)

    G = nx.from_numpy_matrix(adj_matrix)
    print('G:  %s %s' % (G.number_of_edges(), G.number_of_nodes()))

    # NOTE(review): if jaccard_matrix leaves non-zero values on the diagonal,
    # every node carries a self-loop that is included in G.degree -- confirm
    # that this is the intended node size.
    for i, umls_id in enumerate(umls_ids_kept):
        G.node[i]['size'] = G.degree(i)
        G.node[i]['id'] = umls_id

    if outfn is not None:
        nx.write_gml(G, outfn)
    return G
def network_layout(gmt_fn, outfn=None):
    """Build a thresholded Jaccard-similarity graph of the terms in a GMT file.

    Terms with fewer than 5 genes are dropped, pairwise similarities are
    computed with ``jaccard_matrix`` and every entry that is not strictly
    greater than 0.2 is zeroed before the matrix is handed to networkx.
    Each node ``i`` gets a ``size`` attribute (its degree in the thresholded
    graph) and an ``id`` attribute (the i-th term key of the filtered GMT).

    Args:
        gmt_fn: path of the GMT file, passed to ``read_gmt``.
        outfn: optional output path; when it is not None the graph is
            written there as GML (so that Gephi can do the layout).

    Returns:
        The networkx graph built from the thresholded matrix.
    """
    # Keep only the terms that have at least 5 genes.
    d_gmt = {term: genes
             for term, genes in read_gmt(gmt_fn).items()
             if len(genes) >= 5}
    # Single-argument print() produces the same output on Python 2 and 3.
    print('number of terms: %d' % len(d_gmt))

    # list(d) has the same order as d.keys() and is indexable on Python 3.
    # Assumes jaccard_matrix orders rows/columns like the dict's keys
    # -- TODO confirm against jaccard_matrix.
    umls_ids_kept = list(d_gmt)

    adj_matrix = jaccard_matrix(d_gmt)
    keep = adj_matrix > 0.2
    # Assumes adj_matrix is an ndarray so that `*` is element-wise
    # (it would be a matrix product for np.matrix) -- TODO confirm.
    adj_matrix = adj_matrix * keep.astype(int)

    G = nx.from_numpy_matrix(adj_matrix)
    print('G:  %s %s' % (G.number_of_edges(), G.number_of_nodes()))

    # NOTE(review): if jaccard_matrix leaves non-zero values on the diagonal,
    # every node carries a self-loop that is included in G.degree -- confirm
    # that this is the intended node size.
    for i, umls_id in enumerate(umls_ids_kept):
        G.node[i]['size'] = G.degree(i)
        G.node[i]['id'] = umls_id

    if outfn is not None:
        nx.write_gml(G, outfn)
    return G
def make_directed_json_graph_soc(gmt_fn, d_id_name, d_id_category,
                                 d_category_color, outfn=None):
    """Build a ``root -> category -> term`` tree and serialise it as JSON.

    Terms with fewer than 5 genes are dropped. Every remaining term is hung
    under its category (the original note describes this as "SOC - PT"),
    and every category is hung under a single ``'root'`` node. Term nodes
    carry three attributes: ``size`` (the term's degree in the undirected
    graph of Jaccard similarities strictly greater than 0.2), ``label`` and
    ``color``.

    Args:
        gmt_fn: path of the GMT file, passed to ``read_gmt``.
        d_id_name: mapping of term id to the label stored on the node.
        d_id_category: mapping of term id to its category.
        d_category_color: mapping of category to the color stored on the
            node.
        outfn: optional output path. When None (the default) nothing is
            written; previously the default crashed inside ``open``.

    Returns:
        The nested structure produced by ``json_graph.tree_data`` (the
        function used to return None).

    Raises:
        KeyError: if a kept term is missing from ``d_id_name`` or
            ``d_id_category``, or its category from ``d_category_color``.
    """
    # Keep only the terms that have at least 5 genes.
    d_gmt = {term: genes
             for term, genes in read_gmt(gmt_fn).items()
             if len(genes) >= 5}
    # Single-argument print() produces the same output on Python 2 and 3.
    print('number of terms: %d' % len(d_gmt))

    # list(d) has the same order as d.keys() and is indexable on Python 3.
    # Assumes jaccard_matrix orders rows/columns like the dict's keys
    # -- TODO confirm against jaccard_matrix.
    umls_ids_kept = list(d_gmt)

    adj_matrix = jaccard_matrix(d_gmt)
    keep = adj_matrix > 0.2
    # Assumes adj_matrix is an ndarray so that `*` is element-wise -- TODO confirm.
    adj_matrix = adj_matrix * keep.astype(int)
    # Undirected graph, only used to obtain each term's degree (node size).
    Gu = nx.from_numpy_matrix(adj_matrix)

    G = nx.DiGraph()
    for i, umls_id in enumerate(umls_ids_kept):
        name = d_id_name[umls_id]
        category = d_id_category[umls_id]
        color = d_category_color[category]
        # Re-adding the same root -> category edge is a no-op.
        G.add_edge('root', category)
        G.add_edge(category, umls_id)
        node_attrs = G.node[umls_id]
        node_attrs['size'] = Gu.degree(i)
        node_attrs['label'] = name
        node_attrs['color'] = color
    print('%s %s' % (G.number_of_nodes(), G.number_of_edges()))

    graph_data = json_graph.tree_data(G, root='root')
    if outfn is not None:
        # Text mode: json.dump emits str, which a binary handle rejects on
        # Python 3. The default compact output contains no raw newlines, so
        # the bytes written on Python 2 are unchanged. `with` closes the
        # handle, which the original leaked.
        with open(outfn, 'w') as fh:
            json.dump(graph_data, fh)
    return graph_data
def make_directed_json_graph(gmt_fn, d_id_name, d_id_category,
                             d_category_color, outfn=None):
    """Cluster the terms and serialise a ``root -> cluster -> term`` tree.

    Terms with fewer than 5 genes are dropped. The rows of the full
    (un-thresholded) Jaccard matrix are clustered into 10 groups with
    ``AgglomerativeClustering``; every term is hung under its cluster label
    and every cluster under a single ``'root'`` node. The result is meant
    for a pack visualization. Term nodes carry ``size`` (the term's degree
    in the undirected graph of Jaccard similarities strictly greater than
    0.2), ``label`` and ``color``.

    Args:
        gmt_fn: path of the GMT file, passed to ``read_gmt``.
        d_id_name: mapping of term id to the label stored on the node.
        d_id_category: mapping of term id to its category (only used to
            pick the color).
        d_category_color: mapping of category to the color stored on the
            node.
        outfn: optional output path. When None (the default) nothing is
            written; previously the default crashed inside ``open``.

    Returns:
        The nested structure produced by ``json_graph.tree_data`` (the
        function used to return None).

    Raises:
        KeyError: if a kept term is missing from ``d_id_name`` or
            ``d_id_category``, or its category from ``d_category_color``.
    """
    # Keep only the terms that have at least 5 genes.
    d_gmt = {term: genes
             for term, genes in read_gmt(gmt_fn).items()
             if len(genes) >= 5}
    # Single-argument print() produces the same output on Python 2 and 3.
    print('number of terms: %d' % len(d_gmt))

    # list(d) has the same order as d.keys() and is indexable on Python 3.
    # Assumes jaccard_matrix orders rows/columns like the dict's keys
    # -- TODO confirm against jaccard_matrix.
    umls_ids_kept = list(d_gmt)

    adj_matrix = jaccard_matrix(d_gmt)
    # Cluster before thresholding so the clustering sees every similarity.
    hc = AgglomerativeClustering(n_clusters=10)
    hc.fit(adj_matrix)

    keep = adj_matrix > 0.2
    # Assumes adj_matrix is an ndarray so that `*` is element-wise -- TODO confirm.
    adj_matrix = adj_matrix * keep.astype(int)
    # Undirected graph, only used to obtain each term's degree (node size).
    Gu = nx.from_numpy_matrix(adj_matrix)

    G = nx.DiGraph()
    print('%s %s' % (adj_matrix.shape, len(umls_ids_kept)))
    for i, umls_id in enumerate(umls_ids_kept):
        # Plain int rather than a numpy integer, so that the cluster node
        # id is always JSON-serialisable.
        cluster_label = int(hc.labels_[i])
        name = d_id_name[umls_id]
        category = d_id_category[umls_id]
        color = d_category_color[category]
        # Re-adding the same root -> cluster edge is a no-op.
        G.add_edge('root', cluster_label)
        G.add_edge(cluster_label, umls_id)
        node_attrs = G.node[umls_id]
        node_attrs['size'] = Gu.degree(i)
        node_attrs['label'] = name
        node_attrs['color'] = color
    print('%s %s' % (G.number_of_nodes(), G.number_of_edges()))

    graph_data = json_graph.tree_data(G, root='root')
    if outfn is not None:
        # Text mode: json.dump emits str, which a binary handle rejects on
        # Python 3. The default compact output contains no raw newlines, so
        # the bytes written on Python 2 are unchanged. `with` closes the
        # handle, which the original leaked.
        with open(outfn, 'w') as fh:
            json.dump(graph_data, fh)
    return graph_data
def make_directed_json_graph(gmt_fn, d_id_name, d_id_category,
                             d_category_color, outfn=None):
    """Cluster the terms and serialise a ``root -> cluster -> term`` tree.

    Terms with fewer than 5 genes are dropped. The rows of the full
    (un-thresholded) Jaccard matrix are clustered into 10 groups with
    ``AgglomerativeClustering``; every term is hung under its cluster label
    and every cluster under a single ``'root'`` node. The result is meant
    for a pack visualization. Term nodes carry ``size`` (the term's degree
    in the undirected graph of Jaccard similarities strictly greater than
    0.2), ``label`` and ``color``.

    Args:
        gmt_fn: path of the GMT file, passed to ``read_gmt``.
        d_id_name: mapping of term id to the label stored on the node.
        d_id_category: mapping of term id to its category (only used to
            pick the color).
        d_category_color: mapping of category to the color stored on the
            node.
        outfn: optional output path. When None (the default) nothing is
            written; previously the default crashed inside ``open``.

    Returns:
        The nested structure produced by ``json_graph.tree_data`` (the
        function used to return None).

    Raises:
        KeyError: if a kept term is missing from ``d_id_name`` or
            ``d_id_category``, or its category from ``d_category_color``.
    """
    # Keep only the terms that have at least 5 genes.
    d_gmt = {term: genes
             for term, genes in read_gmt(gmt_fn).items()
             if len(genes) >= 5}
    # Single-argument print() produces the same output on Python 2 and 3.
    print('number of terms: %d' % len(d_gmt))

    # list(d) has the same order as d.keys() and is indexable on Python 3.
    # Assumes jaccard_matrix orders rows/columns like the dict's keys
    # -- TODO confirm against jaccard_matrix.
    umls_ids_kept = list(d_gmt)

    adj_matrix = jaccard_matrix(d_gmt)
    # Cluster before thresholding so the clustering sees every similarity.
    hc = AgglomerativeClustering(n_clusters=10)
    hc.fit(adj_matrix)

    keep = adj_matrix > 0.2
    # Assumes adj_matrix is an ndarray so that `*` is element-wise -- TODO confirm.
    adj_matrix = adj_matrix * keep.astype(int)
    # Undirected graph, only used to obtain each term's degree (node size).
    Gu = nx.from_numpy_matrix(adj_matrix)

    G = nx.DiGraph()
    print('%s %s' % (adj_matrix.shape, len(umls_ids_kept)))
    for i, umls_id in enumerate(umls_ids_kept):
        # Plain int rather than a numpy integer, so that the cluster node
        # id is always JSON-serialisable.
        cluster_label = int(hc.labels_[i])
        name = d_id_name[umls_id]
        category = d_id_category[umls_id]
        color = d_category_color[category]
        # Re-adding the same root -> cluster edge is a no-op.
        G.add_edge('root', cluster_label)
        G.add_edge(cluster_label, umls_id)
        node_attrs = G.node[umls_id]
        node_attrs['size'] = Gu.degree(i)
        node_attrs['label'] = name
        node_attrs['color'] = color
    print('%s %s' % (G.number_of_nodes(), G.number_of_edges()))

    graph_data = json_graph.tree_data(G, root='root')
    if outfn is not None:
        # Text mode: json.dump emits str, which a binary handle rejects on
        # Python 3. The default compact output contains no raw newlines, so
        # the bytes written on Python 2 are unchanged. `with` closes the
        # handle, which the original leaked.
        with open(outfn, 'w') as fh:
            json.dump(graph_data, fh)
    return graph_data
def make_directed_json_graph_soc(gmt_fn, d_id_name, d_id_category,
                                 d_category_color, outfn=None):
    """Build a ``root -> category -> term`` tree and serialise it as JSON.

    Terms with fewer than 5 genes are dropped. Every remaining term is hung
    under its category (the original note describes this as "SOC - PT"),
    and every category is hung under a single ``'root'`` node. Term nodes
    carry three attributes: ``size`` (the term's degree in the undirected
    graph of Jaccard similarities strictly greater than 0.2), ``label`` and
    ``color``.

    Args:
        gmt_fn: path of the GMT file, passed to ``read_gmt``.
        d_id_name: mapping of term id to the label stored on the node.
        d_id_category: mapping of term id to its category.
        d_category_color: mapping of category to the color stored on the
            node.
        outfn: optional output path. When None (the default) nothing is
            written; previously the default crashed inside ``open``.

    Returns:
        The nested structure produced by ``json_graph.tree_data`` (the
        function used to return None).

    Raises:
        KeyError: if a kept term is missing from ``d_id_name`` or
            ``d_id_category``, or its category from ``d_category_color``.
    """
    # Keep only the terms that have at least 5 genes.
    d_gmt = {term: genes
             for term, genes in read_gmt(gmt_fn).items()
             if len(genes) >= 5}
    # Single-argument print() produces the same output on Python 2 and 3.
    print('number of terms: %d' % len(d_gmt))

    # list(d) has the same order as d.keys() and is indexable on Python 3.
    # Assumes jaccard_matrix orders rows/columns like the dict's keys
    # -- TODO confirm against jaccard_matrix.
    umls_ids_kept = list(d_gmt)

    adj_matrix = jaccard_matrix(d_gmt)
    keep = adj_matrix > 0.2
    # Assumes adj_matrix is an ndarray so that `*` is element-wise -- TODO confirm.
    adj_matrix = adj_matrix * keep.astype(int)
    # Undirected graph, only used to obtain each term's degree (node size).
    Gu = nx.from_numpy_matrix(adj_matrix)

    G = nx.DiGraph()
    for i, umls_id in enumerate(umls_ids_kept):
        name = d_id_name[umls_id]
        category = d_id_category[umls_id]
        color = d_category_color[category]
        # Re-adding the same root -> category edge is a no-op.
        G.add_edge('root', category)
        G.add_edge(category, umls_id)
        node_attrs = G.node[umls_id]
        node_attrs['size'] = Gu.degree(i)
        node_attrs['label'] = name
        node_attrs['color'] = color
    print('%s %s' % (G.number_of_nodes(), G.number_of_edges()))

    graph_data = json_graph.tree_data(G, root='root')
    if outfn is not None:
        # Text mode: json.dump emits str, which a binary handle rejects on
        # Python 3. The default compact output contains no raw newlines, so
        # the bytes written on Python 2 are unchanged. `with` closes the
        # handle, which the original leaked.
        with open(outfn, 'w') as fh:
            json.dump(graph_data, fh)
    return graph_data
## Input / output locations -- side-effect network (active configuration)
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only.gmt'
GMT_FN = HOME + '/Documents/Zichen_Projects/drug_se_prediction/ET100_GOtCS_AUC_0.76_proba_0.75.gmt'
GML_FN = HOME + '/Documents/Zichen_Projects/drug_se_prediction/side_effect_network.gml'

## Alternative configuration -- drug network (uncomment both lines to use)
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only_flipped.gmt'
# GML_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/drug_network.gml'

# The CSV and JSON outputs sit next to the GML file and share its base name.
CSV_FN = GML_FN.replace('.gml', '.csv')
JSON_FN = CSV_FN.replace('.csv', '.json')

## Retrieve metadata about the side effects (SE)
# Presumably column 1 is the UMLS id and column 2 the preferred term (PT) of
# table `side_effects` in database `sep` -- verify against mysqlTable2dict.
d_umls_pt = mysqlTable2dict('sep', 'side_effects', 1, 2)
d_pt_umls = mysqlTable2dict('sep', 'side_effects', 2, 1)

d_soc_pt = read_gmt(
    HOME + '/Documents/bitbucket/pertid2trainingset/Y_matrix_no_mfc/SOC_to_pt.gmt')
print(len(d_soc_pt))

# Invert SOC -> [PT, ...] into UMLS id -> SOC.
# One PT may belong to several SOCs; only the first SOC encountered while
# iterating d_soc_pt is kept for a given UMLS id.
d_umls_soc = {}
for soc, pts in d_soc_pt.items():
    for pt in pts:
        umls = d_pt_umls[pt]
        if umls is None:
            # PT without a UMLS id: nothing to record.
            continue
        d_umls_soc.setdefault(umls, soc)
## Input / output locations -- side-effect network (active configuration)
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only.gmt'
GMT_FN = HOME + '/Documents/Zichen_Projects/drug_se_prediction/ET100_GOtCS_AUC_0.76_proba_0.75.gmt'
GML_FN = HOME + '/Documents/Zichen_Projects/drug_se_prediction/side_effect_network.gml'

## Alternative configuration -- drug network (uncomment both lines to use)
# GMT_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/RF1000_GOtCS_AUC_0.7_proba_0.6_prediction_only_flipped.gmt'
# GML_FN = HOME+'/Documents/Zichen_Projects/drug_se_prediction/drug_network.gml'

# The CSV and JSON outputs sit next to the GML file and share its base name.
CSV_FN = GML_FN.replace('.gml', '.csv')
JSON_FN = CSV_FN.replace('.csv', '.json')

## Retrieve metadata about the side effects (SE)
# Presumably column 1 is the UMLS id and column 2 the preferred term (PT) of
# table `side_effects` in database `sep` -- verify against mysqlTable2dict.
d_umls_pt = mysqlTable2dict('sep', 'side_effects', 1, 2)
d_pt_umls = mysqlTable2dict('sep', 'side_effects', 2, 1)

d_soc_pt = read_gmt(
    HOME + '/Documents/bitbucket/pertid2trainingset/Y_matrix_no_mfc/SOC_to_pt.gmt')
print(len(d_soc_pt))

# Invert SOC -> [PT, ...] into UMLS id -> SOC.
# One PT may belong to several SOCs; only the first SOC encountered while
# iterating d_soc_pt is kept for a given UMLS id.
d_umls_soc = {}
for soc, pts in d_soc_pt.items():
    for pt in pts:
        umls = d_pt_umls[pt]
        if umls is None:
            # PT without a UMLS id: nothing to record.
            continue
        d_umls_soc.setdefault(umls, soc)