def answer_four():
    import pandas as pd
    G1 = answer_three()
    # Employee_Relationships.txt is tab-separated (see the read_csv calls below)
    G2 = nx.read_edgelist('Employee_Relationships.txt', delimiter='\t',
                          data=[('Score', int)])
    df1 = nx.to_pandas_dataframe(G1)
    df2 = nx.to_pandas_dataframe(G2, weight='Score')
    data = pd.DataFrame(columns=['E1', 'E2', 'Relationship_Score',
                                 'Nb_Common_Movies']).set_index(['E1', 'E2'])
    employees_list = list(employees)
    for i in range(len(employees_list)):
        for j in range(i + 1, len(employees_list)):
            e1, e2 = employees_list[i], employees_list[j]
            data.loc[(e1, e2), 'Nb_Common_Movies'] = df1.loc[e1, e2]
            data.loc[(e1, e2), 'Relationship_Score'] = df2.loc[e1, e2]
    data['Nb_Common_Movies'] = data['Nb_Common_Movies'].astype(float)
    data['Relationship_Score'] = data['Relationship_Score'].astype(float)
    return data['Relationship_Score'].corr(data['Nb_Common_Movies'],
                                           method='pearson')
def prune_homology_graph(df, chim_dir):
    to_remove = []
    df['brk_left_cut'] = df['name'].str.split(":").str[0:3].str.join(":")
    df['brk_right_cut'] = df['name'].str.split(":").str[3:6].str.join(":")
    left_nodes = set(df[df['brk_left_cut'].duplicated()]['brk_left_cut'])
    right_nodes = set(df[df['brk_right_cut'].duplicated()]['brk_right_cut'])
    all_nodes = (list(zip(left_nodes, itertools.repeat("left"))) +
                 list(zip(right_nodes, itertools.repeat("right"))))
    for node, hom_side in all_nodes:
        node_members = df[df['brk_' + hom_side + '_cut'] == node]['name']
        node_graph = nx.Graph()
        node_graph.add_nodes_from(node_members, exprs=10)
        for jxn1, jxn2 in itertools.combinations(node_members, 2):
            pair_score = get_pairwise_hom(jxn1, jxn2, chim_dir, hom_side)
            if pair_score != 0:
                node_graph.add_edge(jxn1, jxn2, weight=pair_score)
        # nx.draw_networkx(node_graph, pos=nx.shell_layout(node_graph), node_size=100)
        # plt.show()
        adj_mat = nx.to_pandas_dataframe(node_graph)
        node_compare = adj_mat[adj_mat.sum() > 0].index.tolist()
        if len(node_compare) > 0:
            node_homdf = df[df['name'].isin(node_compare)][
                ['name', 'TPM_Fusion', 'TPM_Left', 'TPM_Right']].set_index('name')
            node_homdf['max_pairs'] = node_homdf[['TPM_Left', 'TPM_Right']].max(axis=1)
            node_homdf = node_homdf.sort_values(['TPM_Fusion', 'max_pairs'],
                                                ascending=False)
            # keep the best-supported junction, mark the rest for removal
            node_remove = node_homdf.iloc[1:].index.tolist()
            to_remove.extend(node_remove)
    # use list of to_remove to mark homologous fusions
    return to_remove
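# A hedged usage sketch (the calling code is not shown above): per the
# "mark homologous fusions" comment, the returned list is typically used to
# filter the junction table, keeping only the best-supported fusion per
# homologous group. drop_homologous is a name invented here for illustration.
def drop_homologous(df, chim_dir):
    to_remove = prune_homology_graph(df, chim_dir)
    return df[~df['name'].isin(to_remove)].copy()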
def answer_four():
    # load graph
    G = answer_three()
    # extract employee edge weights from the graph into a dataframe
    employee_weights = nx.to_pandas_dataframe(G)
    # load the text file into a dataframe
    df = pd.read_csv('Employee_Relationships.txt', sep='\t', header=None)
    # add column names to the dataframe
    df.columns = ['Employee_1', 'Employee_2', 'Friendship_score']
    # add employee weights (movies in common) to the friendship score dataframe
    df['Movie_score'] = employee_weights.lookup(df['Employee_1'], df['Employee_2'])
    # calculate the Pearson correlation
    pearson_corr = df['Friendship_score'].corr(df['Movie_score'])
    return pearson_corr
def maximum_spanning_tree(table, undirected=False):
    sys.stderr.write("Calculating MST score...\n")
    table = table.copy()
    # a maximum spanning tree over nij is a minimum spanning tree over 1/nij
    table["distance"] = 1.0 / table["nij"]
    G = nx.from_pandas_dataframe(table, source="src", target="trg",
                                 edge_attr=["distance", "nij"])
    T = nx.minimum_spanning_tree(G, weight="distance")
    table2 = pd.melt(nx.to_pandas_dataframe(T, weight="nij").reset_index(),
                     id_vars="index")
    table2 = table2[table2["value"] > 0]
    table2.rename(columns={"index": "src", "variable": "trg", "value": "cij"},
                  inplace=True)
    table2["score"] = table2["cij"]
    table = table.merge(table2, on=["src", "trg"])
    if undirected:
        table["edge"] = table.apply(
            lambda x: "%s-%s" % (min(x["src"], x["trg"]), max(x["src"], x["trg"])),
            axis=1)
        table = table.drop_duplicates(subset=["edge"])
        table = table.drop(columns="edge")
    return table[["src", "trg", "nij", "score"]]
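# A minimal, hypothetical call of maximum_spanning_tree; the src/trg/nij
# column names match what the function expects, but the data is invented.
# Assumes the same pre-2.0 networkx as the function itself.
import pandas as pd
toy = pd.DataFrame({"src": ["a", "a", "b"],
                    "trg": ["b", "c", "c"],
                    "nij": [3.0, 1.0, 2.0]})
# Edges kept by the maximum spanning tree (here a-b and b-c) come back with
# score == nij; the weakest edge a-c is dropped by the merge.
backbone = maximum_spanning_tree(toy, undirected=True)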
def graph_to_pandas_time_series(graph):
    """
    Transform a graph into a pandas time series dataframe.
    """
    time_dataframe = nx.to_pandas_dataframe(graph)
    return time_dataframe
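# Quick smoke test for the fixed function above (it previously returned None);
# the toy graph is invented for illustration.
g = nx.Graph()
g.add_edge('a', 'b', weight=2.0)
adj = graph_to_pandas_time_series(g)
assert adj.loc['a', 'b'] == 2.0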
def CalcularAdyacencia(Grafo, fileName):
    # "CalcularAdyacencia" = "compute adjacency": dump the graph's adjacency
    # matrix to a tab-separated file in the current working directory.
    pathFile = os.path.join(os.getcwd(), fileName)
    if os.path.exists(pathFile):
        os.remove(pathFile)
    df = nx.to_pandas_dataframe(Grafo)
    df.to_csv(pathFile, header=None, index=None, mode='w', sep='\t')
def test_from_adjacency(self):
    nodelist = [1, 2]
    dftrue = pd.DataFrame([[1, 1], [1, 0]], dtype=int,
                          index=nodelist, columns=nodelist)
    G = nx.Graph([(1, 1), (1, 2)])
    df = nx.to_pandas_adjacency(G, dtype=int)
    pd.testing.assert_frame_equal(df, dftrue)
    # deprecated
    df = nx.to_pandas_dataframe(G, dtype=int)
    pd.testing.assert_frame_equal(df, dftrue)
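# Context for the test above: to_pandas_dataframe was renamed
# to_pandas_adjacency in networkx 2.0, and the old name was later removed.
# A small compatibility shim (not part of the original test) keeps the
# deprecated call sites in this file working on newer networkx:
if not hasattr(nx, "to_pandas_dataframe"):
    nx.to_pandas_dataframe = nx.to_pandas_adjacency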
def A(self, t, dtype="numpy"):
    nodelist = self.get_living(t, indices_only=True)
    if dtype == "sparse":
        return nx.adjacency_matrix(self, nodelist)
    elif dtype == "numpy":
        return np.array(nx.to_numpy_matrix(self, nodelist))
    elif dtype == "pandas":
        return nx.to_pandas_dataframe(self, nodelist)
    else:
        raise ValueError("Unknown dtype")
def answer_four():
    # adjacency frame of the weighted projection from answer_three();
    # rdata is the Employee_Relationships table loaded elsewhere
    df = nx.to_pandas_dataframe(answer_three())
    dfl = []
    for i in range(len(rdata)):
        dfl.append(df.loc[rdata.iloc[i, 0], rdata.iloc[i, 1]])
    rdata[3] = dfl
    # correlation between relationship score (col 2) and shared movies (col 3)
    return rdata.corr().iloc[0][3]
def answer_four():
    df = pd.read_table("Employee_Relationships.txt", header=None)
    df.columns = ["one", "two", "relation"]
    g = answer_two()
    P = answer_three()
    df1 = nx.to_pandas_dataframe(P)
    df["common"] = df.apply(lambda x: df1.loc[x["one"], x["two"]], axis=1)
    correlation = df.corr(method='pearson')
    return correlation.iloc[0, 1]
def add_edge_table(self, weight="weight"):
    adj_df = nx.to_pandas_dataframe(self.network, weight=weight, nonedge=np.nan)
    edge_table = melt_upper_triu(adj_df)
    edge_table = edge_table.loc[pd.notnull(edge_table)].reset_index()
    edge_table.columns = ['Gene1', 'Gene2', weight]
    edge_table[['Gene1', 'Gene2']] = (
        edge_table[['Gene1', 'Gene2']].applymap(lambda x: self.node_2_name.get(x, x)))
    return edge_table
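# melt_upper_triu is not defined above; a minimal sketch of what it plausibly
# does, assuming it flattens the strict upper triangle of a square adjacency
# frame into a (Gene1, Gene2) -> weight Series (NaNs kept, since the caller
# filters them with pd.notnull):
import numpy as np
import pandas as pd

def melt_upper_triu(df):
    # mask everything on or below the diagonal, then stack the remainder
    keep = np.triu(np.ones(df.shape, dtype=bool), k=1)
    return df.where(keep).stack(dropna=False)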
def doubly_stochastic(table, undirected=False, return_self_loops=False):
    sys.stderr.write("Calculating DST score...\n")
    table = table.copy()
    table2 = table.copy()
    original_nodes = len(set(table["src"]) | set(table["trg"]))
    table = pd.pivot_table(table, values="nij", index="src", columns="trg",
                           aggfunc="sum", fill_value=0)
    # Sinkhorn-Knopp: alternately normalise rows and columns until the
    # matrix is doubly stochastic
    row_sums = table.sum(axis=1)
    attempts = 0
    while np.std(row_sums) > 1e-12:
        table = table.div(row_sums, axis=0)
        col_sums = table.sum(axis=0)
        table = table.div(col_sums, axis=1)
        row_sums = table.sum(axis=1)
        attempts += 1
        if attempts > 1000:
            warnings.warn(
                "Matrix could not be reduced to doubly stochastic. "
                "See Sec. 3 of Sinkhorn 1964", RuntimeWarning)
            return pd.DataFrame()
    table = pd.melt(table.reset_index(), id_vars="src")
    table = table[table["src"] < table["trg"]]
    table = table[table["value"] > 0].sort_values(by="value", ascending=False)
    # greedily add the highest-value edges until the graph is connected and
    # covers all original nodes
    i = 0
    G = nx.Graph()
    while (nx.number_connected_components(G) != 1
           or nx.number_of_nodes(G) < original_nodes):
        edge = table.iloc[i]
        G.add_edge(edge["src"], edge["trg"], weight=edge["value"])
        i += 1
    table = pd.melt(nx.to_pandas_dataframe(G).reset_index(), id_vars="index")
    table = table[table["value"] > 0]
    table.rename(columns={"index": "src", "variable": "trg", "value": "cij"},
                 inplace=True)
    table["score"] = table["cij"]
    table = table.merge(table2[["src", "trg", "nij"]], on=["src", "trg"])
    if not return_self_loops:
        table = table[table["src"] != table["trg"]]
    if undirected:
        table = table[table["src"] <= table["trg"]]
    return table[["src", "trg", "nij", "score"]]
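# Standalone illustration of the Sinkhorn-Knopp loop used above, on a
# made-up positive matrix; after enough alternating row/column
# normalisations every row and column sums to 1:
import numpy as np
m = np.array([[1.0, 2.0], [3.0, 4.0]])
for _ in range(1000):
    m /= m.sum(axis=1, keepdims=True)  # normalise rows
    m /= m.sum(axis=0, keepdims=True)  # normalise columns
# np.allclose(m.sum(axis=0), 1) and np.allclose(m.sum(axis=1), 1) both hold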
def airport_log_flow(data):
    import networkx as nx
    import seaborn as sb
    import matplotlib.pyplot as plt
    matrix = data[["log_PAX", "Departure", "Arrival"]]
    group = matrix.groupby(['Departure', 'Arrival'], as_index=False).mean()
    G = nx.Graph()
    for i in range(len(group)):  # was hard-coded to 126 route pairs
        G.add_edge(group["Departure"][i], group["Arrival"][i],
                   weight=group["log_PAX"][i])
    adjacency_matrix = nx.to_pandas_dataframe(G)
    plt.figure(figsize=(15, 15))
    sb.heatmap(adjacency_matrix, cmap="OrRd")
def answer_four():
    G = pd.read_csv('Employee_Movie_Choices.txt', sep='\t', skiprows=1,
                    names=['Employee', 'Movie'])
    G1 = nx.from_pandas_dataframe(G, 'Employee', 'Movie')
    G1.add_nodes_from(employees, type='employee', bipartite=0)
    G1.add_nodes_from(movies, type='movie', bipartite=1)
    X = set(employees)
    P = bipartite.weighted_projected_graph(G1, X)
    G2 = nx.to_pandas_dataframe(P)
    H = pd.read_csv('Employee_Relationships.txt', sep='\t', header=None,
                    names=['Employee', 'Workmate', 'Relation_rating'])
    H['Shared_movies'] = G2.lookup(H['Employee'], H['Workmate'])
    return H['Relation_rating'].corr(H['Shared_movies'])
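# Portability note for the lookup call above: DataFrame.lookup was deprecated
# in pandas 1.2 and removed in 2.0. On newer pandas the same row/column
# gather can be written with integer indexers (a sketch, reusing G2 and H
# from answer_four above):
rows = G2.index.get_indexer(H['Employee'])
cols = G2.columns.get_indexer(H['Workmate'])
H['Shared_movies'] = G2.to_numpy()[rows, cols]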
def q4(P: nx.Graph):
    '''
    P: weighted projection graph which tells us how many movies
    different pairs of employees have in common.
    '''
    rel_df = pd.read_csv('Employee_Relationships.txt', delim_whitespace=True,
                         header=None, names=['n1', 'n2', 'weight'])
    emp_df = (nx.to_pandas_dataframe(P).unstack().reset_index()
              .query('level_0 != level_1').reset_index(drop=True))
    df = pd.merge(rel_df, emp_df, how='left',
                  right_on=['level_0', 'level_1'],
                  left_on=['n1', 'n2']).loc[:, ['weight', 0]]
    df.columns = ['relationship_score', 'num_movies_common']
    return df.corr().iloc[0, 1]
def blockmodel_output(G, t=1.15):
    """Creates a hierarchical cluster of graph G from its distance matrix."""
    # Makes life easier to have consecutively labeled integer nodes
    H = nx.convert_node_labels_to_integers(G, label_attribute='label')
    # Create distance matrix
    path_length = dict(nx.all_pairs_shortest_path_length(H))
    distances = np.zeros((len(H), len(H)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partitions = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partitions[p].append(n)
    # Build blockmodel graph
    # BM = nx.blockmodel(H, partitions)  # changed in nx 2.0
    p_values = list(partitions.values())
    BM = nx.quotient_graph(H, p_values, relabel=True)
    label_dict = dict((n, H.nodes[n]['label']) for n in H)
    order = [label_dict[item] for sublist in p_values for item in sublist]
    nm = nx.to_pandas_dataframe(G)
    nm = nm.reindex(index=order)
    nm.columns = nm.index
    ho = homophily(G, 'type')
    output = {'G': G, 'H': H, 'partitions': partitions, 'BM': BM, 'nm': nm,
              'label_dict': label_dict, 'order': order, 'distances': distances}
    output.update(ho)
    return output
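# A hypothetical invocation of blockmodel_output; the graph, the 'type'
# attribute, and the availability of the homophily() helper it calls are
# all assumptions made for illustration.
G = nx.karate_club_graph()
nx.set_node_attributes(G, {n: 'member' for n in G}, 'type')
result = blockmodel_output(G, t=1.15)
result['BM'].number_of_nodes()  # number of blocks found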
def answer_four():
    df = pd.read_csv('Employee_Relationships.txt', delimiter="\t", header=None)
    df.rename(columns={0: 'employee1', 1: 'employee2', 2: 'score'}, inplace=True)
    # adjacency frame of the weighted projection from answer_three()
    df_1 = nx.to_pandas_dataframe(answer_three())
    df['shared_movies'] = df_1.lookup(df['employee1'], df['employee2'])
    corr_P = df['score'].corr(df['shared_movies'], method='pearson')
    return corr_P
def network_to_pandas(self):
    """Return the network as a pandas adjacency DataFrame."""
    return nx.to_pandas_dataframe(self.g)
import networkx as nx
import pandas as pd
from networkx.algorithms import bipartite

# df = pd.read_excel("FKT Data/fkt_cooccurrence.xlsx")
# G = nx.from_pandas_dataframe(df, 'Booking_Service_Id', 'Name_Product')
# W = bipartite.weighted_projected_graph(G, df['Name_Product'].unique())
# X = nx.to_pandas_dataframe(W)

df = pd.read_excel("co-occurrence.xlsx")
df = df[~df.CATEGORY_PRODUCT.isnull()]
G = nx.from_pandas_dataframe(df, 'BOOKING_ID', 'CATEGORY_PRODUCT')
W = bipartite.weighted_projected_graph(G, df['CATEGORY_PRODUCT'].unique())
X = nx.to_pandas_dataframe(W)
X.to_excel("product_pairing.xlsx")
def graph_centrality(df, cent_type='betweenness', keep_thresh=0.5,
                     cond_type='add_one', corr_type='spearman',
                     weighted=False, corr_dir='none'):
    '''
    :param df: @type pandas DataFrame
    :param cent_type: @type string - valid values: betweenness, degree, closeness, eigenvector
    :param keep_thresh: @type float - default 0.5
    :param cond_type: @type string - valid values: add_one, hellinger
    :param corr_type: @type string - valid values: spearman, kendall, pearson, MIC
    :param weighted: @type boolean - True to produce a graph with weighted edges, False otherwise
    :param corr_dir: @type string - valid values: none, positive, negative
    :return: DataFrame of nodes with a positive centrality metric, sorted descending
    '''
    data = df.copy()
    conditioned_df = condition(data, cond_type)  # condition data
    w_corr_df = find_correlation(conditioned_df, corr_type)
    if corr_dir == 'positive':
        # only keep strong positive correlations (small positive numbers)
        w_corr_df_b = 1 - w_corr_df.copy()
    elif corr_dir == 'negative':
        # only keep strong negative correlations (small negative numbers)
        w_corr_df_b = 1 + w_corr_df.copy()
    else:
        # keep both strong positive and negative correlations
        w_corr_df_b = 1 - abs(w_corr_df.copy())
    # set anything greater than the threshold value to 1 so we can remove it
    w_corr_df_b[w_corr_df_b >= 1 - keep_thresh] = 1
    labels = list(w_corr_df_b.index)
    temp = abs(w_corr_df_b.copy())
    temp.insert(0, 'var1', labels)
    attr = 'weight' if weighted else 'edge'
    df_b = pd.melt(temp, 'var1', var_name='var2', value_name=attr)
    # take only those edge pairs that made the cut
    df_b = df_b.loc[(df_b[attr] <= 1 - keep_thresh) & (df_b[attr] > 0.0), :]
    df_g = networkx.from_pandas_dataframe(df_b, 'var1', 'var2', attr)  # list of valid edges
    networkx.write_graphml(df_g, 'graph.graphml')
    am = networkx.to_pandas_dataframe(df_g)
    am.to_csv('adj_matrix.csv')
    if cent_type == 'betweenness':
        centrality = networkx.betweenness_centrality(df_g)
    elif cent_type == 'degree':
        centrality = networkx.degree_centrality(df_g)
    elif cent_type == 'closeness':
        centrality = networkx.closeness_centrality(df_g)
    elif cent_type == 'eigenvector':
        centrality = networkx.eigenvector_centrality(df_g)
    else:
        print('error, unknown centrality')
        return -1
    centrality_df = pd.DataFrame.from_dict(centrality, orient='index')
    centrality_df.columns = ['metric']
    if not centrality_df.empty:
        centrality_df = centrality_df[centrality_df.iloc[:, 0] > 0]
    if not centrality_df.empty:
        centrality_df.sort_values('metric', axis=0, ascending=False, inplace=True)
    return centrality_df
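# A hypothetical call of graph_centrality on a random count table; it assumes
# condition() and find_correlation() from the same module are importable, and
# all names below are invented for illustration.
import numpy as np
import pandas as pd
rng = np.random.default_rng(0)
counts = pd.DataFrame(rng.poisson(5, size=(20, 6)),
                      columns=['taxon%d' % i for i in range(6)])
central = graph_centrality(counts, cent_type='degree', keep_thresh=0.3)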
        elif edgeMat.columns[i] in G4:
            groundTruth[i] = 3
        elif edgeMat.columns[i] in G5:
            groundTruth[i] = 4
        elif edgeMat.columns[i] in G6:
            groundTruth[i] = 5
        else:
            groundTruth[i] = 6

    for kClusters in range(8):
        clusteringAlgorithms(kClusters + 4, groundTruth, edgeMat, pos, FG)
'''  # HERE ending comments

# Clustering for the cleared FG
kClusters = 0
edgeMat = nx.to_pandas_dataframe(FGcleared)
groundTruth = edgeMat.values.tolist()
pos = nx.spring_layout(FGcleared)
for i in range(len(groundTruth)):
    if edgeMat.columns[i] in G1:
        groundTruth[i] = 0
    elif edgeMat.columns[i] in G2:
        groundTruth[i] = 1
    elif edgeMat.columns[i] in G3:
        groundTruth[i] = 2
    elif edgeMat.columns[i] in G4:
        groundTruth[i] = 3
    elif edgeMat.columns[i] in G5:
        groundTruth[i] = 4
    else:
        groundTruth[i] = 5
    np.unique(ix_a).shape  # Num groups: 2654509
    return ix_a, _a

"""c"""
a = np.column_stack((i_a0, i_a1))
G = nx.Graph()
G.add_edges_from(a)
G.add_nodes_from(i_ex0)
a = nx.to_pandas_dataframe(G)
# Number of nodes: 5654509
nx.number_connected_components(G)  # 2654509

# Fast:
l_out0 = []
iCount = 100000
for x in nx.connected_components(G):
    l_out0.append(x)
    iCount = iCount - 1
    if iCount == 0:
        break  # stop after the first 100000 components
                 node_color=values, node_size=500)
nx.draw_networkx_labels(G, pos)
options = {
    'node_color': 'blue',
    'node_size': 700,
    'width': 2,
    'arrowstyle': '-|>',
    'arrowsize': 7,
}
nx.draw_networkx_edges(G, pos, edgelist=red_edges, edge_color='r', arrows=True)
nx.draw_networkx_edges(G, pos, edgelist=black_edges, arrows=True, **options)
nx.draw_networkx(G, pos, edgelist=black_edges, arrows=True, **options)
plt.show()

print(len(G.adjacency_list()))
print(len(G.nodes()))
nx.to_edgelist(G)
nx.to_dict_of_dicts(G)
print(nx.to_dict_of_lists(G))
adjacentMat = nx.to_pandas_dataframe(G)
# shows the existing directions
print(adjacentMat)
col = adjacentMat['F']
row = adjacentMat.loc['F']
print(col)
print(row)
totalNeighbours = col + row
G.neighbors('F')
G.predecessors('F')
nx.predecessor(G, 'F')
G.successors('F')
            (pos + 1):], np.concatenate(
                (t['weight'][:pos], t['weight'][(pos + 1):]))
        seq.insert(0, current)
    else:
        pos = t['target'].index(end)
        end = current
        end_t, end_d = t['target'][:pos] + t['target'][(pos + 1):], np.concatenate(
            (t['weight'][:pos], t['weight'][(pos + 1):]))
        seq.append(current)
    targets = start_t + end_t
    degree = np.concatenate((start_d, end_d))
    return seq

g_ppi2 = nx.read_gpickle('D:/PPI-Topic/Processed_data/g_ppi_newdeg.csv')
di_ppi2 = nx.to_dict_of_lists(g_ppi2)
c_ppi2 = nx.to_pandas_dataframe(g_ppi2)
seq = []
net_ppi2 = transform(di_ppi2, c_ppi2)
for node in g_ppi2.nodes():
    for i in range(WALK_TIME):
        seq.append(random_walk(node, net_ppi2) + ['#'])
    # print('node', node, 'complete')

# index nodes lexicographically by degree, highest first
lexico = sorted(g_ppi2.nodes(), key=g_ppi2.degree, reverse=True)
lexico1 = {e: lexico.index(e) for e in lexico}
lexico1['#'] = -1
seq_concat = list(itertools.chain.from_iterable(seq))
seq_int = [lexico1[e] for e in seq_concat]