def calc_betweenness_centrality(G, cmap):
    """Calculate the betweenness centrality of each node and return the
    corresponding RGB color fractions from the supplied colormap."""
    local_BC = nx.betweenness_centrality(G)
    # scale the centrality values to the 0-255 colormap range, in node order
    local_BC_scale = [round(local_BC[n] * 255.0) for n in G.nodes()]
    local_BC_scale = pd_Series(local_BC_scale, index=G.nodes())
    rfrac = [cmap(int(x))[0] for x in local_BC_scale]
    gfrac = [cmap(int(x))[1] for x in local_BC_scale]
    bfrac = [cmap(int(x))[2] for x in local_BC_scale]
    rfrac = pd_Series(rfrac, index=G.nodes())
    gfrac = pd_Series(gfrac, index=G.nodes())
    bfrac = pd_Series(bfrac, index=G.nodes())
    return rfrac, gfrac, bfrac
def calc_clustering_coefficient(G, cmap):
    """Calculate the clustering coefficient of each node and return the
    corresponding RGB color fractions from the supplied colormap."""
    local_CC = nx.clustering(G, G.nodes())
    # scale the coefficients to the 0-255 colormap range, in node order
    local_CC_scale = [round(local_CC[n] * 255.0) for n in G.nodes()]
    local_CC_scale = pd_Series(local_CC_scale, index=G.nodes())
    rfrac = [cmap(int(x))[0] for x in local_CC_scale]
    gfrac = [cmap(int(x))[1] for x in local_CC_scale]
    bfrac = [cmap(int(x))[2] for x in local_CC_scale]
    rfrac = pd_Series(rfrac, index=G.nodes())
    gfrac = pd_Series(gfrac, index=G.nodes())
    bfrac = pd_Series(bfrac, index=G.nodes())
    return rfrac, gfrac, bfrac
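# --- Usage sketch for the two colormap helpers above (a minimal example, not
# part of the original module). It assumes networkx is imported as nx,
# matplotlib.pyplot as plt, and pd_Series is pandas.Series, matching the
# aliases used elsewhere in this file; the demo graph and function name
# _demo_node_colors are hypothetical. ---
def _demo_node_colors():
    G = nx.karate_club_graph()           # any small test graph
    cmap = plt.get_cmap('OrRd')          # same default colormap as save_gene_gene_json
    rfrac, gfrac, bfrac = calc_clustering_coefficient(G, cmap)
    # the three Series are indexed by node, so they zip straight into colors
    node_colors = [(rfrac[n], gfrac[n], bfrac[n]) for n in G.nodes()]
    nx.draw(G, node_color=node_colors)
    return node_colors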
def updateScore(csvfile, score):
    """ Add or update the Score column and reorder the rows by it. """
    import string
    head, rows = read_csv(csvfile)
    data = pd_read_csv(csvfile)
    data.index = data.index + 1
    cols = data.columns.tolist()
    sco = pd_Series(np_zeros(len(data[cols[0]])), index=data.index)
    if 'Score' not in cols:
        data['Score'] = sco
        cols = ['Score'] + cols
        data = data[cols]
    colk = list(string.ascii_uppercase)
    for sc in score:
        try:
            coln = colk.index(sc[0])  # spreadsheet-style column letter -> position
            val = sc[2]               # weight for this column
            checked = sc[3]           # only include columns that are checked
            if checked:
                sco += val * data.iloc[:, coln]
        except Exception:
            # skip malformed or non-numeric score entries
            continue
    data['Score'] = sco
    data = data.sort_values('Score', ascending=False)
    updateMSA(os_path.dirname(csvfile), [[v] for v in data['Seq. ID']])
    data = data.reset_index(drop=True)
    data.index = data.index + 1
    data = data.rename_axis('Select', axis="columns")
    data.to_csv(csvfile, quoting=csv_QUOTE_ALL, index=False)
    return data
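# --- Usage sketch for updateScore (not part of the original module). The file
# path, the labels 'Identity'/'Coverage' and the helper name _demo_update_score
# are hypothetical; based on how the tuples are read above, element 0 is a
# column letter, element 2 a weight, element 3 a checked flag, and element 1 is
# not used by updateScore. The CSV is expected to contain a 'Seq. ID' column. ---
def _demo_update_score():
    score_spec = [('B', 'Identity', 2.0, True),   # weight column B by 2.0
                  ('C', 'Coverage', 1.0, False)]  # unchecked: ignored
    return updateScore('results/scores.csv', score_spec)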
def translate_episode_data(episode_data):
    """
    Convert episode data into data that can be used in a graph.

    Given data from multiple episodes, make it such that it can be plotted by
    tsplot, i.e. the mean plus the confidence bounds.
    """
    times, units, values = [], [], []
    for index, (ep_len, ep_rew) in enumerate(episode_data):
        # Smooth out the data with an exponentially weighted moving average
        ep_rew = pd_Series(ep_rew).ewm(span=1000).mean()
        # Sample for faster plotting
        x, y = sample(bins=np_linspace(0, MAX_TSTEPS, NSAMPLES + 1),
                      time=ep_len,
                      value=ep_rew)
        # Convert to tsplot format
        times.extend(x)
        values.extend(y)
        units.extend([index] * len(x))
    return pd_DataFrame({
        'Frame': times,
        'run_id': units,
        'Average Episode Reward': values
    })
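# --- Usage sketch for translate_episode_data (not part of the original
# module). `sample`, MAX_TSTEPS and NSAMPLES are module-level helpers/constants
# assumed to be defined elsewhere in this file; the synthetic episode data and
# the name _demo_translate_episode_data are purely illustrative. ---
def _demo_translate_episode_data():
    # two fake runs: (cumulative timestep per episode, reward per episode)
    run_a = (list(range(0, 100000, 1000)), list(np_linspace(0.0, 1.0, 100)))
    run_b = (list(range(0, 100000, 1000)), list(np_linspace(0.0, 0.8, 100)))
    df = translate_episode_data([run_a, run_b])
    # df has columns 'Frame', 'run_id' and 'Average Episode Reward', ready for
    # seaborn-style aggregation (mean and confidence bounds) across run_id
    return df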
def calc_community_fraction(G, to_nodes, from_nodes, from_nodes_partition,
                            color_list):
    # color each to-node by its most populous neighboring community, darkened
    # by the fraction of neighbors that belong to other communities
    rfrac = pd_Series(index=G.nodes(), dtype=float)
    gfrac = pd_Series(index=G.nodes(), dtype=float)
    bfrac = pd_Series(index=G.nodes(), dtype=float)
    for t in to_nodes:
        t_neighbors = G.neighbors(t)
        t_comms = pd_Series([from_nodes_partition[i] for i in t_neighbors])
        unique_comms = t_comms.unique()
        num_n = pd_Series(index=unique_comms, dtype=float)
        for n in unique_comms:
            num_n[n] = sum(t_comms == n)
        # find the most populous neighboring community
        color_max = color_list[num_n.idxmax()][0:3]
        # how much is shared by the other communities?
        frac_shared = 1 - np.max(num_n) / np.sum(num_n)
        # darken the color by this amount
        #color_dark = shade_color(color_max, -frac_shared * 100)
        color_dark = (color_max[0] * (1 - frac_shared),
                      color_max[1] * (1 - frac_shared),
                      color_max[2] * (1 - frac_shared))
        rfrac[t] = color_dark[0]
        gfrac[t] = color_dark[1]
        bfrac[t] = color_dark[2]
    # fill in the from_nodes colors directly from their community color
    for f in from_nodes:
        f_group = from_nodes_partition[f]
        rfrac[f] = color_list[f_group][0]
        gfrac[f] = color_list[f_group][1]
        bfrac[f] = color_list[f_group][2]
    return rfrac, gfrac, bfrac
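# --- Usage sketch for calc_community_fraction (not part of the original
# module), mirroring how it is called in save_gene_gene_json below: every node
# is both a to-node and a from-node, the partition comes from python-louvain's
# community.best_partition, and the community colors from gist_rainbow. The
# demo graph and the name _demo_community_fraction are illustrative. ---
def _demo_community_fraction():
    G = nx.karate_club_graph()
    partition = pd_Series(community.best_partition(G))
    n_comms = len(np.unique(partition.values))
    color_list = plt.cm.gist_rainbow(np.linspace(0, 1, n_comms))
    rfrac, gfrac, bfrac = calc_community_fraction(G, G.nodes(), G.nodes(),
                                                  partition, color_list)
    return rfrac, gfrac, bfrac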
def save_gene_gene_json(input_file_name,
                        out_file_start,
                        cluster_id,
                        color_type='clustering_coefficient',
                        colormap='OrRd'):
    '''
    This function takes a processed cluster file ('input_file_name': the output
    of processing clustering results in cluster_analysis_module) and saves a
    json document for every community in the file, starting with
    'out_file_start'. 'input_file_name' and 'out_file_start' should be
    prepended with location.
    '''
    # first load in a network (edge list)
    edge_list_df = pd_read_csv(input_file_name, sep='\t')
    group_ids = np.unique(edge_list_df['group_id'])
    for focal_group in group_ids:
        print(focal_group)
        save_file_name = out_file_start + '_' + str(int(focal_group)) + '.json'
        idx_group = (edge_list_df['group_id'] == focal_group)
        idx_group = list(edge_list_df['group_id'][idx_group].index)

        # make a network out of it, dropping zero-correlation edges
        edge_list = [(edge_list_df['var1'][i], edge_list_df['var2'][i],
                      edge_list_df['corr'][i]) for i in idx_group
                     if edge_list_df['corr'][i] != 0]
        Gsmall = nx.Graph()
        Gsmall.add_weighted_edges_from(edge_list)

        nodes = list(Gsmall.nodes())
        numnodes = len(nodes)
        edges = list(Gsmall.edges(data=True))
        numedges = len(edges)

        if color_type == 'community':
            partition = pd_Series(community.best_partition(Gsmall))
            col_temp = partition[nodes]
            # how many communities are there, and what should the colors be?
            num_communities = len(np.unique(col_temp))
            color_list = plt.cm.gist_rainbow(
                np.linspace(0, 1, num_communities))
            # blend the community colors (so that to-nodes are a mixture of
            # all the communities they belong to)
            rfrac, gfrac, bfrac = calc_community_fraction(
                Gsmall, nodes, nodes, partition, color_list)
            nodes_dict = [{
                "id": n,
                "com": col_temp[n],
                "degree": Gsmall.degree(n),
                "rfrac": rfrac[n] * 255,
                "gfrac": gfrac[n] * 255,
                "bfrac": bfrac[n] * 255
            } for n in nodes]
        elif color_type == 'clustering_coefficient':
            cmap = plt.get_cmap(colormap)
            rfrac, gfrac, bfrac = calc_clustering_coefficient(Gsmall, cmap)
            nodes_dict = [{
                "id": n,
                "com": 0,
                "degree": Gsmall.degree(n),
                "rfrac": rfrac[n] * 255,
                "gfrac": gfrac[n] * 255,
                "bfrac": bfrac[n] * 255
            } for n in nodes]
        elif color_type == 'betweenness_centrality':
            cmap = plt.get_cmap(colormap)
            rfrac, gfrac, bfrac = calc_betweenness_centrality(Gsmall, cmap)
            nodes_dict = [{
                "id": n,
                "com": 0,
                "degree": Gsmall.degree(n),
                "rfrac": rfrac[n] * 255,
                "gfrac": gfrac[n] * 255,
                "bfrac": bfrac[n] * 255
            } for n in nodes]

        # save the network in json format
        node_map = dict(zip(nodes, range(numnodes)))  # map to indices for source/target in edges
        edges_dict = [{
            "source": node_map[edges[i][0]],
            "target": node_map[edges[i][1]],
            "weight": edges[i][2]['weight']
        } for i in range(numedges)]

        import json
        json_graph = {
            "directed": False,
            "nodes": nodes_dict,
            "links": edges_dict
        }

        # cache the serialized graph in MongoDB
        client = pymongo.MongoClient()
        db = client.cache
        heat_maps = db.heat_map_graph
        a = {
            'clusterId': 'cluster' + str(int(focal_group)),
            'heat_map': json.dumps(json_graph)  # heat_map_ordered_transposed
        }
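# --- Usage sketch for save_gene_gene_json (not part of the original module).
# The file paths and the name _demo_save_gene_gene_json are hypothetical; the
# input is a tab-separated edge list with 'group_id', 'var1', 'var2' and
# 'corr' columns, as read above, and a local MongoDB instance is assumed. ---
def _demo_save_gene_gene_json():
    save_gene_gene_json('clusters/processed_clusters.tsv',
                        'clusters/gene_gene',
                        cluster_id=0,
                        color_type='community',
                        colormap='OrRd')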
def analyze_AG_bipartite_network(genes,
                                 authors_GB_genes,
                                 pub_thresh=1,
                                 save_file_name="author_gene_bp.json",
                                 plot_flag=False):
    gene_list = genes.split(',')
    t0 = time.time()

    # unpickle groupby object (the pre-loaded copy on `app` is used instead)
    #authors_GB_genes = pd.read_pickle(author_gene_GB_fname)
    authors_GB_genes = app.authors_GB_genes_loaded

    # get rid of invalid genes in gene_list
    gene_list = [gene for gene in gene_list if gene in authors_GB_genes]

    # create a list of all authors/weights who have published on at least one
    # gene in gene_list
    AW_list_total = []
    for gene in gene_list:
        AW_list_total.extend(list(authors_GB_genes[gene].index))
    AW_list_total = list(zip(*AW_list_total))
    author_list_total = AW_list_total[0]
    weight_list_total = AW_list_total[1]
    print(time.time() - t0)

    author_list_total = pd_Series(author_list_total)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # take the mean of duplicate entries
    df_temp = pd_DataFrame(
        {
            'weight': list(weight_list_total),
            'author': list(author_list_total)
        },
        index=range(len(author_list_total)))
    AW_gb_temp = df_temp.weight.groupby(df_temp['author']).mean()
    author_list_total = list(AW_gb_temp.index)
    weight_list_total = list(AW_gb_temp.values)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # make a dataframe, indexed by authors in author_list_total, with
    # columns = entries in gene_list
    author_gene_df = pd_DataFrame(np.zeros(
        [len(author_list_total), len(gene_list)]),
                                  index=author_list_total,
                                  columns=gene_list)
    print(time.time() - t0)

    # fill in the dataframe
    for gene in gene_list:
        temp = list(zip(*list(authors_GB_genes[gene].index)))
        authors_temp = list(np.unique(temp[0]))
        author_gene_df[gene][authors_temp] = weight_list_total[authors_temp]
    print(time.time() - t0)

    # add a column for total weight
    author_gene_df['total_weight'] = np.sum(np.array(author_gene_df), 1)
    author_gene_df.sort_values('total_weight', inplace=True, ascending=False)

    # next, convert this dataframe into a bipartite network
    author_gene_bp = nx.Graph()

    # pick out authors whose total weight exceeds pub_thresh
    index_temp = list(author_gene_df['total_weight'][
        author_gene_df['total_weight'] > pub_thresh].index)
    # only allow 200 authors max
    author_nodes = index_temp[0:200] if len(index_temp) > 200 else index_temp
    print(time.time() - t0)

    for gene in gene_list:
        for author in author_nodes:
            # only add a link if a connection exists
            if author_gene_df[gene][author] > 0:
                author_gene_bp.add_edge(gene, author)

    # add all genes in gene_list in case none of them come up
    author_gene_bp.add_nodes_from(gene_list)

    # now apply the community-detection algorithm to the bipartite graph
    partition = pd_Series(community.best_partition(author_gene_bp))
    col_temp_authors = partition[author_nodes]
    col_temp_genes = partition[gene_list]
    col_temp = partition[list(author_gene_bp.nodes())]

    if plot_flag:
        # plot the graph if plot_flag = True
        plt.figure(figsize=[15, 15])
        pos = nx.spring_layout(author_gene_bp, k=.3)
        gene_list = list(gene_list)
        nx.draw_networkx_nodes(author_gene_bp,
                               nodelist=author_nodes,
                               node_color=col_temp_authors,
                               cmap='Paired',
                               pos=pos,
                               alpha=.5,
                               node_size=100)
        nx.draw_networkx_nodes(author_gene_bp,
                               nodelist=gene_list,
                               node_color=col_temp_genes,
                               cmap='Paired',
                               pos=pos,
                               alpha=.5,
                               node_size=200,
                               node_shape='s')
        nx.draw_networkx_edges(author_gene_bp, pos=pos, alpha=.1)
        # label the top 20 authors and all genes
        node_subset_dict = dict(zip(index_temp[0:20], index_temp[0:20]))
        gene_subset_dict = dict(zip(gene_list, gene_list))
        node_subset_dict.update(gene_subset_dict)
        nx.draw_networkx_labels(author_gene_bp,
                                pos=pos,
                                labels=node_subset_dict)

    # Set up json for saving: how many communities, and what colors?
    num_communities = len(np.unique(col_temp))
    color_list = plt.cm.gist_rainbow(np.linspace(0, 1, num_communities))
    # blend the community colors (so that to-nodes are a mixture of all the
    # communities they belong to)
    rfrac, gfrac, bfrac = calc_community_fraction(author_gene_bp, author_nodes,
                                                  gene_list, partition,
                                                  color_list)

    # save the network in json format
    nodes = list(author_gene_bp.nodes())
    numnodes = len(nodes)
    edges = list(author_gene_bp.edges())
    numedges = len(edges)
    nodes_dict = [{
        "id": n,
        "com": col_temp[n],
        "degree": author_gene_bp.degree(n),
        "rfrac": rfrac[n] * 255,
        "gfrac": gfrac[n] * 255,
        "bfrac": bfrac[n] * 255
    } for n in nodes]
    node_map = dict(zip(nodes, range(numnodes)))  # map to indices for source/target in edges
    edges_dict = [{
        "source": node_map[edges[i][0]],
        "target": node_map[edges[i][1]]
    } for i in range(numedges)]

    json_graph = {"directed": False, "nodes": nodes_dict, "links": edges_dict}
    #json.dump(json_graph, open(save_file_name, 'w'))
    print(time.time() - t0)
    return json_graph
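# --- Usage sketch for analyze_AG_bipartite_network (not part of the original
# module). The gene symbols and the name _demo_author_gene_network are
# illustrative; the second argument is overridden inside the function by the
# pre-loaded app.authors_GB_genes_loaded, so None is passed here. ---
def _demo_author_gene_network():
    json_graph = analyze_AG_bipartite_network('BRCA1,TP53,EGFR',
                                              None,
                                              pub_thresh=1,
                                              plot_flag=False)
    # json_graph['nodes'] carries per-node community ids and RGB colors, and
    # json_graph['links'] the author-gene edges, ready for a client-side
    # force-directed layout
    return json_graph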
def get_most_similar_coordinates(word, count=10, pos='any', remove_stop=True):
    ## word = word.lower()
    word_vectors = book2vec.wv
    try:
        word_seq = word_vectors.most_similar(word,
                                             topn=len(word_vectors.vocab))
        sim_words = [x[0] for x in word_seq]
        if remove_stop:
            # hard-coded copy of NLTK's English stopword list, so that
            # nltk.download("stopwords") is not needed at runtime
            #from nltk.corpus import stopwords
            #stop_words = set(stopwords.words('english'))
            stop_words = {
                'a', 'about', 'above', 'after', 'again', 'against', 'ain',
                'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
                'as', 'at', 'be', 'because', 'been', 'before', 'being',
                'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
                "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does',
                'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during',
                'each', 'few', 'for', 'from', 'further', 'had', 'hadn',
                "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't",
                'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn',
                "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm',
                'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
                "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor',
                'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
                'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
                're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should',
                "should've", 'shouldn', "shouldn't", 'so', 'some', 'such',
                't', 'than', 'that', "that'll", 'the', 'their', 'theirs',
                'them', 'themselves', 'then', 'there', 'these', 'they',
                'this', 'those', 'through', 'to', 'too', 'under', 'until',
                'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were',
                'weren', "weren't", 'what', 'when', 'where', 'which', 'while',
                'who', 'whom', 'why', 'will', 'with', 'won', "won't",
                'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're",
                "you've", 'your', 'yours', 'yourself', 'yourselves'
            }
            sim_words = [x for x in sim_words if x not in stop_words]

        # optionally restrict to words with the requested part of speech
        slice2 = points
        if pos != 'any':
            slice2 = points.loc[points['attr'].isin([pos])]
        attr_words = slice2['word'].tolist()

        res_words = [x for x in sim_words if x in attr_words]
        res_words = res_words[:count]

        # collect coordinates for the similar words plus the query word itself
        slice = points.loc[points['word'].isin(res_words)]
        slice = slice.append(points.loc[points['word'] == word])
        slice['sim_score'] = pd_Series(1.0, index=slice.index)
        slice['word_count'] = pd_Series(0, index=slice.index)
        for i, point in slice.iterrows():
            slice.at[i, 'word_count'] = get_word_count(point.word,
                                                       word_vectors)
            for (w, s) in word_seq:
                if w == point.word:
                    slice.at[i, 'sim_score'] = round(s, 3)
                    break
        retVal = slice
    except Exception:
        # e.g. the query word is not in the model's vocabulary
        retVal = None
    return retVal
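# --- Usage sketch for get_most_similar_coordinates (not part of the original
# module). `book2vec` (a trained gensim Word2Vec model), `points` (a DataFrame
# of word coordinates with 'word' and 'attr' columns) and get_word_count are
# module-level objects assumed to be defined elsewhere in this file; the query
# word and the name _demo_similar_words are illustrative. ---
def _demo_similar_words():
    # the ten most similar non-stopwords to "whale", with their coordinates,
    # similarity scores and corpus counts, or None if out of vocabulary
    return get_most_similar_coordinates('whale', count=10, pos='any',
                                        remove_stop=True)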