def test_from_edgelist_multidigraph_and_edge_attr(self):
    # example from issue #2374
    Gtrue = nx.MultiDiGraph([('X1', 'X4', {'Co': 'zA', 'Mi': 0, 'St': 'X1'}),
                             ('X1', 'X4', {'Co': 'zB', 'Mi': 54, 'St': 'X2'}),
                             ('X1', 'X4', {'Co': 'zB', 'Mi': 49, 'St': 'X3'}),
                             ('X1', 'X4', {'Co': 'zB', 'Mi': 44, 'St': 'X4'}),
                             ('Y1', 'Y3', {'Co': 'zC', 'Mi': 0, 'St': 'Y1'}),
                             ('Y1', 'Y3', {'Co': 'zC', 'Mi': 34, 'St': 'Y2'}),
                             ('Y1', 'Y3', {'Co': 'zC', 'Mi': 29, 'St': 'X2'}),
                             ('Y1', 'Y3', {'Co': 'zC', 'Mi': 24, 'St': 'Y3'}),
                             ('Z1', 'Z3', {'Co': 'zD', 'Mi': 0, 'St': 'Z1'}),
                             ('Z1', 'Z3', {'Co': 'zD', 'Mi': 14, 'St': 'X3'}),
                             ('Z1', 'Z3', {'Co': 'zE', 'Mi': 9, 'St': 'Z2'}),
                             ('Z1', 'Z3', {'Co': 'zE', 'Mi': 4, 'St': 'Z3'})])
    # pd.DataFrame.from_items was deprecated and later removed from pandas;
    # a plain dict preserves column insertion order on Python 3.7+.
    df = pd.DataFrame({
        'O': ['X1', 'X1', 'X1', 'X1', 'Y1', 'Y1', 'Y1', 'Y1', 'Z1', 'Z1', 'Z1', 'Z1'],
        'D': ['X4', 'X4', 'X4', 'X4', 'Y3', 'Y3', 'Y3', 'Y3', 'Z3', 'Z3', 'Z3', 'Z3'],
        'St': ['X1', 'X2', 'X3', 'X4', 'Y1', 'Y2', 'X2', 'Y3', 'Z1', 'X3', 'Z2', 'Z3'],
        'Co': ['zA', 'zB', 'zB', 'zB', 'zC', 'zC', 'zC', 'zC', 'zD', 'zD', 'zE', 'zE'],
        'Mi': [0, 54, 49, 44, 0, 34, 29, 24, 0, 14, 9, 4]})
    G1 = nx.from_pandas_edgelist(df, source='O', target='D',
                                 edge_attr=True,
                                 create_using=nx.MultiDiGraph())
    G2 = nx.from_pandas_edgelist(df, source='O', target='D',
                                 edge_attr=['St', 'Co', 'Mi'],
                                 create_using=nx.MultiDiGraph())
    assert_graphs_equal(G1, Gtrue)
    assert_graphs_equal(G2, Gtrue)
def test_from_edgelist_all_attr(self):
    Gtrue = nx.Graph([('E', 'C', {'cost': 9, 'weight': 10}),
                      ('B', 'A', {'cost': 1, 'weight': 7}),
                      ('A', 'D', {'cost': 7, 'weight': 4})])
    G = nx.from_pandas_edgelist(self.df, 0, 'b', True)
    assert_graphs_equal(G, Gtrue)
    # MultiGraph
    MGtrue = nx.MultiGraph(Gtrue)
    MGtrue.add_edge('A', 'D', cost=16, weight=4)
    MG = nx.from_pandas_edgelist(self.mdf, 0, 'b', True, nx.MultiGraph())
    assert_graphs_equal(MG, MGtrue)
def tnet_to_nx(df, t=None):
    """Creates undirected networkx object."""
    if t is not None:
        df = get_network_when(df, t=t)
    if 'weight' in df.columns:
        nxobj = nx.from_pandas_edgelist(
            df, source='i', target='j', edge_attr='weight')
    else:
        nxobj = nx.from_pandas_edgelist(df, source='i', target='j')
    return nxobj
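# Hedged usage sketch (not part of the original source): a minimal temporal
# edgelist with the 'i'/'j'/'t'/'weight' columns that tnet_to_nx appears to
# expect. The column names and the behaviour of the get_network_when helper
# are assumptions, so the time-filtered call is left commented out.
import pandas as pd
import networkx as nx

toy_edges = pd.DataFrame({'i': [0, 0, 1],
                          'j': [1, 2, 2],
                          't': [0, 0, 1],
                          'weight': [1.0, 0.5, 2.0]})
G_all = tnet_to_nx(toy_edges)          # whole edgelist, weights kept as edge attr
# G_t0 = tnet_to_nx(toy_edges, t=0)    # would filter to t == 0 via get_network_when
print(G_all.edges(data=True))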
def test_from_edgelist_int_attr_name(self):
    # note: this also tests that edge_attr can be `source`
    Gtrue = nx.Graph([('E', 'C', {0: 'C'}),
                      ('B', 'A', {0: 'B'}),
                      ('A', 'D', {0: 'A'})])
    G = nx.from_pandas_edgelist(self.df, 0, 'b', 0)
    assert_graphs_equal(G, Gtrue)
def test_roundtrip(self):
    # edgelist
    Gtrue = nx.Graph([(1, 1), (1, 2)])
    df = nx.to_pandas_edgelist(Gtrue)
    G = nx.from_pandas_edgelist(df)
    assert_graphs_equal(Gtrue, G)
    # adjacency
    Gtrue = nx.Graph({1: {1: {'weight': 1}, 2: {'weight': 1}},
                      2: {1: {'weight': 1}}})
    df = nx.to_pandas_adjacency(Gtrue, dtype=int)
    G = nx.from_pandas_adjacency(df)
    assert_graphs_equal(Gtrue, G)
def download_reactome_fi():
    """
    Downloads reactome functional interaction network

    Returns
    -------

    """
    url = 'http://reactomews.oicr.on.ca:8080/caBigR3WebApp2017/' \
          'FIsInGene_071718_with_annotations.txt.zip'
    table = pd.read_csv(io.BytesIO(urlopen(url).read()),
                        compression='zip', delimiter='\t',
                        error_bad_lines=False, encoding='utf-8')

    table = table[table['Direction'] != '-']
    table = table[~table['Annotation'].str.contains('indirect effect')]
    table = table[~table['Annotation'].str.contains('predicted')]
    table = table[~table['Annotation'].str.contains('compound')]

    genes = set(table['Gene1'])
    genes.update(set(table['Gene2']))

    from magine.mappings.gene_mapper import GeneMapper
    gm = GeneMapper()

    missing_uniprot = set(i for i in genes
                          if i not in gm.gene_name_to_uniprot)

    table = table[~table['Gene1'].isin(missing_uniprot)]
    table = table[~table['Gene2'].isin(missing_uniprot)]

    table['source'] = table['Gene1']
    table['target'] = table['Gene2']
    table['databaseSource'] = 'ReactomeFI'

    rev_cols = table['Direction'].isin(_reverse)
    table.loc[rev_cols, ['source', 'target']] = \
        table.loc[rev_cols, ['target', 'source']].values

    table['interactionType'] = table.apply(standardize_edge_types, axis=1)

    protein_graph = nx.from_pandas_edgelist(
        table, 'source', 'target',
        edge_attr=['interactionType', 'databaseSource'],
        create_using=nx.DiGraph()
    )
    species = set(table['source'].unique()).union(set(table['target'].unique()))

    # add names to graph
    for node in species:
        protein_graph.add_node(node, databaseSource='ReactomeFI',
                               speciesType='gene')

    print("Reactome network has {} nodes and {} edges"
          "".format(len(protein_graph.nodes()), len(protein_graph.edges())))
    nx.write_gpickle(protein_graph, _p_name)
def PageRank(data):
    print('graph generating...')
    G_ui = nx.from_pandas_edgelist(df=data, source='user_id',
                                   target='item_id', edge_attr=False)
    pagerank = pd.DataFrame(list(nx.pagerank(G_ui).items()),
                            columns=['node', 'pagerank'])
    print('merging...')
    data = data.merge(pagerank, left_on='user_id', right_on='node',
                      how='left').merge(pagerank, left_on='item_id',
                                        right_on='node', how='left')
    pagerank_data = pd.DataFrame(columns=['instance_id', 'user_pagerank',
                                          'item_pagerank'])
    pagerank_data['instance_id'] = data['instance_id']
    pagerank_data['user_pagerank'] = data['pagerank_x']
    pagerank_data['item_pagerank'] = data['pagerank_y']
    return pagerank_data
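# Hedged usage sketch (not part of the original source): a toy interaction
# table with the 'instance_id'/'user_id'/'item_id' columns that the PageRank
# feature helper above expects. All values are made up for illustration.
import pandas as pd

toy = pd.DataFrame({'instance_id': [1, 2, 3, 4],
                    'user_id': ['u1', 'u1', 'u2', 'u3'],
                    'item_id': ['i1', 'i2', 'i2', 'i1']})
features = PageRank(toy)
print(features)  # one row per instance with user_pagerank / item_pagerank columns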
def test_from_edgelist(self):
    # Pandas DataFrame
    g = nx.cycle_graph(10)
    G = nx.Graph()
    G.add_nodes_from(g)
    G.add_weighted_edges_from((u, v, u) for u, v in g.edges())
    edgelist = nx.to_edgelist(G)
    source = [s for s, t, d in edgelist]
    target = [t for s, t, d in edgelist]
    weight = [d['weight'] for s, t, d in edgelist]
    edges = pd.DataFrame({'source': source, 'target': target,
                          'weight': weight})
    GG = nx.from_pandas_edgelist(edges, edge_attr='weight')
    assert_nodes_equal(G.nodes(), GG.nodes())
    assert_edges_equal(G.edges(), GG.edges())
    GW = nx.to_networkx_graph(edges, create_using=nx.Graph())
    assert_nodes_equal(G.nodes(), GW.nodes())
    assert_edges_equal(G.edges(), GW.edges())
def download_trrust():
    table = pd.read_csv(url,
                        names=['source', 'target', 'interactionType', 'pmid'],
                        delimiter='\t', index_col=None,
                        error_bad_lines=False, encoding='utf-8')
    print(table.head(10))

    # filter out interactions with an unknown type
    table = table[~(table['interactionType'] == 'Unknown')].copy()

    table.loc[table['interactionType'] == 'Activation',
              'interactionType'] = 'activate|expression'
    table.loc[table['interactionType'] == 'Repression',
              'interactionType'] = 'inhibit|repression'
    table = table[~(table['interactionType'] == 'Unknown')].copy()

    table['databaseSource'] = 'TRRUST'

    protein_graph = nx.from_pandas_edgelist(
        table, 'source', 'target',
        edge_attr=['interactionType', 'pmid'],
        create_using=nx.DiGraph()
    )
    table = table[['source', 'target']].values
    added_genes = set()

    def _add_node(node):
        if node not in added_genes:
            protein_graph.add_node(node, databaseSource='TRRUST',
                                   speciesType='gene')
            added_genes.add(node)

    # add names to graph
    for r in table:
        _add_node(r[0])
        _add_node(r[1])

    nx.write_gpickle(protein_graph, _p_name)
def __init__(self, ctfile, data=None, name='', file=None, **attr):
    # Read in the CT file as a pandas DataFrame.
    self.ctfile = ctfile
    edge_df = pd.read_fwf(ctfile, skiprows=1, header=None)
    edge_df.columns = ['position', 'letter', 'pos-1', 'pos+1',
                       'pair_pos', 'position_repeated']
    edge_df['letter'] = \
        edge_df['letter'].apply(lambda x: x.upper().replace("T", "U"))
    edge_df['kind'] = 'basepair'
    del edge_df['position_repeated']

    # Construct graph
    self.graph = nx.from_pandas_edgelist(edge_df, source='position',
                                         target='pair_pos',
                                         edge_attr='kind')

    # Annotate pairing partners on nodes.
    # (Graph.node was deprecated in networkx 2.x; Graph.nodes is the
    # supported accessor.)
    for n1, n2, d in self.graph.edges(data=True):
        self.graph.nodes[n1]['pairing_partner'] = n2
        self.graph.nodes[n2]['pairing_partner'] = n1

    # Add in backbone edges
    for n1, n2 in zip(sorted(self.graph.nodes()),
                      sorted(self.graph.nodes())[1:]):
        self.graph.add_edge(n1, n2, kind='backbone')

    # Add in node metadata
    for r, d in edge_df.iterrows():
        self.graph.nodes[d['position']]['letter'] = d['letter']

    # Remove the unnecessary node zero.
    self.graph.remove_node(0)

    # Annotate the graph with vectorized features.
    self.annotate()
    self._edge_df = edge_df

    # miRNA name
    self.mirna_name = ctfile.split('/')[-1].split('_')[0]
def test_from_edgelist_multi_attr(self):
    Gtrue = nx.Graph([('E', 'C', {'cost': 9, 'weight': 10}),
                      ('B', 'A', {'cost': 1, 'weight': 7}),
                      ('A', 'D', {'cost': 7, 'weight': 4})])
    G = nx.from_pandas_edgelist(self.df, 0, 'b', ['weight', 'cost'])
    assert_graphs_equal(G, Gtrue)
def test_from_edgelist_multi_attr_incl_target(self):
    Gtrue = nx.Graph([('E', 'C', {0: 'C', 'b': 'E', 'weight': 10}),
                      ('B', 'A', {0: 'B', 'b': 'A', 'weight': 7}),
                      ('A', 'D', {0: 'A', 'b': 'D', 'weight': 4})])
    G = nx.from_pandas_edgelist(self.df, 0, 'b', [0, 'b', 'weight'])
    assert_graphs_equal(G, Gtrue)
    return nx.convert_node_labels_to_integers(graph)


if __name__ == "__main__":
    default_fname = "BIOGRID-ORGANISM-Human_Herpesvirus_6B-3.5.165.tab2_duplicate.txt"
    fname, do_centrality, do_draw, do_info = parse(default_fname)

    starttime = time.time()
    df_ppin = load_ppin(fname)
    colA_name, colB_name = "BioGRID ID Interactor A", "BioGRID ID Interactor B"
    colOffA_name, colOffB_name = "Official Symbol Interactor A", "Official Symbol Interactor B"

    # draw graph
    graph = nx.from_pandas_edgelist(
        df_ppin[[colA_name, colB_name]], colA_name,
        colB_name)  # need to give a directionality here - just ignore
    # gets rid of self loops (A->A); Graph.selfloop_edges() was removed in
    # newer networkx releases, so use the module-level function instead
    graph.remove_edges_from(nx.selfloop_edges(graph))
    # gets rid of duplicates (A->B, A->B) and inverse duplicates (A->B, B->A)
    graph = graph.to_undirected()
    print("building graph took " + str(round(time.time() - starttime, 5)) + " s")
    #print(fname, graph.number_of_nodes(), graph.number_of_edges(), time.time() - starttime)  # for import into csv

    nx.write_edgelist(graph, "../biograd-organism/ppin/" + fname + ".edgeList",
                      delimiter='\t')

    # save correspondence between biogrid ID and official symbol
    interactorA = flatten(df_ppin, colA_name)
    officialSymbolA = flatten(df_ppin, colOffA_name)
def _thin_network(self): ''' Returns a network with elgible edges merged). ''' self._report_duplicate_edges() cols = self.config['intermediate_keep_columns'] + self.config[ 'dir_columns'] + self.config['dir_toll_columns'] # need to remove any links that are one-way, # but share the reverse node sequence # these are merged back in after thinning. thin_edges = self.network_gdf.copy() merge_edges = self.network_gdf.copy()[['INode', 'JNode']] merge_edges = merge_edges.rename(columns={ 'INode': 'INode_y', 'JNode': 'JNode_y' }) one_way_keep = thin_edges.merge(merge_edges, how='inner', left_on=['INode', 'JNode'], right_on=['JNode_y', 'INode_y']) thin_edges = thin_edges[~thin_edges['PSRCEdgeID']. isin(one_way_keep['PSRCEdgeID'].tolist())] G = nx.from_pandas_edgelist(thin_edges, 'INode', 'JNode', cols) i = 0 node_list = [x for x in self.thin_nodes_list if G.has_node(x)] for node in node_list: if i % 1000 == 0: print("%d Nodes Processed" % (i)) edges = list(G.edges(node)) check_edges = self._check_edge_connection_validity(node, edges, G) if check_edges: edge_1 = check_edges[0] edge_2 = check_edges[1] a_coords = list(edge_1['geometry'].coords) b_coords = list(edge_2['geometry'].coords) # get the first and last coord for the two edges a_test = [a_coords[0], a_coords[-1]] b_test = [b_coords[0], b_coords[-1]] if edge_1['INode'] != edge_2['INode'] and edge_1[ 'JNode'] != edge_2['JNode']: edge_dir = 'with' merge = self._compare_attributes(edge_1, edge_2, 'IJ') else: edge_dir = 'against' merge = self._compare_attributes(edge_1, edge_2, 'JI') if merge: if edge_dir == 'with': # Do the first coords match or the first and last if a_test.index( list( set(a_test).intersection(b_test))[0]) == 0: order = 'ba' a_coords.pop(0) x = b_coords + a_coords line = LineString(x) merged_row = edge_2 merged_row['geometry'] = line merged_row['JNode'] = edge_1['JNode'] if G.has_edge(merged_row['INode'], merged_row['JNode']): compare_edge = G.get_edge_data( merged_row['INode'], merged_row['JNode']) if list(compare_edge['geometry'].coords) == x: print 'True' G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) else: G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) G.add_edge(merged_row['INode'], merged_row['JNode'], **merged_row) else: order = 'ab' b_coords.pop(0) x = a_coords + b_coords line = LineString(x) merged_row = edge_1 merged_row['geometry'] = line merged_row['JNode'] = edge_2['JNode'] if G.has_edge(merged_row['INode'], merged_row['JNode']): compare_edge = G.get_edge_data( merged_row['INode'], merged_row['JNode']) if list(compare_edge['geometry'].coords) == x: print 'True' G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) else: G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) G.add_edge(merged_row['INode'], merged_row['JNode'], **merged_row) # Are lines digitized towards each other: elif edge_1['JNode'] == edge_2['JNode']: # Flip the b line b_coords.reverse() # Drop the duplicate coord b_coords.pop(0) x = a_coords + b_coords line = LineString(x) merged_row = edge_1 merged_row['geometry'] = line merged_row['INode'] = edge_1['INode'] merged_row['JNode'] = edge_2['INode'] if G.has_edge(merged_row['INode'], merged_row['JNode']): compare_edge = G.get_edge_data( merged_row['INode'], merged_row['JNode']) if list(compare_edge['geometry'].coords) == x: print 'True' G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) else: G.remove_edge(edges[0][0], edges[0][1]) 
G.remove_edge(edges[1][0], edges[1][1]) G.add_edge(merged_row['INode'], merged_row['JNode'], **merged_row) # Lines must be digitized away from each other: else: # Drop the duplicate coord b_coords.pop(0) # Flip the b line b_coords.reverse() x = b_coords + a_coords line = LineString(x) merged_row = edge_1 merged_row['geometry'] = line merged_row['INode'] = edge_2['JNode'] merged_row['JNode'] = edge_1['JNode'] if G.has_edge(merged_row['INode'], merged_row['JNode']): compare_edge = G.get_edge_data( merged_row['INode'], merged_row['JNode']) if list(compare_edge['geometry'].coords) == x: G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) else: G.remove_edge(edges[0][0], edges[0][1]) G.remove_edge(edges[1][0], edges[1][1]) G.add_edge(merged_row['INode'], merged_row['JNode'], **merged_row) i = i + 1 edge_list = [] for x in G.edges.iteritems(): edge_list.append(x[1]) gdf = gpd.GeoDataFrame(edge_list) gdf = gdf.append(one_way_keep[cols]) return (gdf)
JOIN theta_plus.imm1985_1995_article_score_unshuffled asu ON asu.scp = cslu.scp WHERE asu.article_score >= 1) cslu1 ON cslu1.scp = ccu.citing JOIN (SELECT cslu.* FROM theta_plus.imm1985_1995_cluster_scp_list_mcl cslu JOIN theta_plus.imm1985_1995_article_score_unshuffled asu ON asu.scp = cslu.scp WHERE asu.article_score >= 1) cslu2 ON cslu2.scp = ccu.cited WHERE cslu1.cluster_no!=cslu2.cluster_no AND cslu2.cluster_no= """ + str( cluster_num) + """; -- all external in-degrees""" cluster_scp_query = """SELECT * FROM theta_plus.imm1985_1995_cluster_scp_list_mcl WHERE cluster_no = """ + str(cluster_num) + """;""" citing_cited = pd.read_sql(citing_cited_query, con=engine) G = nx.from_pandas_edgelist(citing_cited, 'citing', 'cited', create_using=nx.DiGraph()) N = G.order() degrees = dict(G.degree()) total_deg = pd.DataFrame.from_dict(degrees, orient='index', columns=['ext_cluster_total_degrees']) total_deg['scp'] = total_deg.index total_deg = total_deg.reset_index(drop=True) indegrees = dict(G.in_degree()) total_in_deg = pd.DataFrame.from_dict(indegrees, orient='index', columns=['ext_cluster_in_degrees']) total_in_deg['scp'] = total_in_deg.index total_in_deg = total_in_deg.reset_index(drop=True)
def create_diffusion_graph(twitter_corpus_file, diffusion_graph_file): diffusion_graph_dir = '/'.join(diffusion_graph_file.split('/')[:-1]) + '/' #initialize graph G = nx.DiGraph() for v in institutions['URL'].tolist(): G.add_edge(v, graph_nodes['institution']) for v in repositories['URL'].tolist(): G.add_edge(v, graph_nodes['repository']) G.add_edge(graph_nodes['institution'], graph_nodes['source']) G.add_edge(graph_nodes['repository'], graph_nodes['source']) epoch = 0 frontier = [] connected_components = 0 last_pass = False while True: #expand graph if not os.path.exists(diffusion_graph_dir + 'epoch_' + str(epoch) + '.tsv'): graph_epoch_n(frontier, epoch, last_pass, twitter_corpus_file, diffusion_graph_dir) df = pd.read_csv(diffusion_graph_dir + 'epoch_' + str(epoch) + '.tsv', sep='\t').dropna() G = nx.compose( G, nx.from_pandas_edgelist(df, source='source_url', target='target_url', create_using=nx.DiGraph())) frontier = [x for x in G.nodes() if G.out_degree(x) == 0] print('Epoch:', epoch) print('Connected Components:', nx.number_connected_components(G.to_undirected())) print('Frontier Size:', len(frontier)) if last_pass: break #last pass condition if epoch != 0 and (connected_components - nx.number_connected_components(G.to_undirected()) ) / connected_components < components_ratio: last_pass = True connected_components = nx.number_connected_components( G.to_undirected()) epoch += 1 #add root node df = pd.read_csv(diffusion_graph_dir + 'epoch_0.tsv', sep='\t').dropna() df['social'] = project_url + '#twitter' G = nx.compose( G, nx.from_pandas_edgelist(df, source='social', target='source_url', create_using=nx.DiGraph())) write_graph(G, diffusion_graph_file)
    file = filenames[i]
    # split into variables
    confidence, graphs, method, edgefunc = file.split('.')[0].split('_')
    confidence = confidence[10:]
    graphs = graphs[7:]

    if 'fiberlength' in file:
        df = pd.read_csv(dir_ + file, delimiter=';')
        df = df.rename(columns={'edge weight(med flm)': 'weight'})
        df['len'] = df['weight']

        # generate (undirected) graph
        H = nx.from_pandas_edgelist(df, source='id node1', target='id node2',
                                    edge_attr=['weight', 'len'],
                                    create_using=nx.Graph())
        # remove self loops
        H.remove_edges_from(nx.selfloop_edges(H))

        # add nodes even though they have no edges (to make comparison more fair)
        if confidence == '20':
            full_set = set(H)
        else:
            H.add_nodes_from(full_set - set(H))

        # for centrality
        cc = nx.closeness_centrality(H)
        cc = {k: v for k, v in sorted(cc.items(), key=lambda s: s[0])}
        closeness_centrality.append(list(cc.values()))
import numpy as np
import matplotlib.pyplot as plt
from modularitydensity import metrics
from modularitydensity.fine_tuned_modularity import fine_tuned_clustering_q
from modularitydensity.fine_tuned_modularity_density import fine_tuned_clustering_qds


def mapname(name):
    print(name)
    return name.lower()


df = pd.read_csv('cc9_rel_undirected_nozeroes.csv')
df = df.rename(mapper=mapname, axis='columns')
print(df)

G = nx.from_pandas_edgelist(df, edge_attr=['weight', 'change'])
# G = nx.les_miserables_graph()
G = nx.relabel.convert_node_labels_to_integers(G)
print(G)

adj = nx.to_scipy_sparse_matrix(G)

# connected_component_subgraphs() was removed in networkx 2.4; build the
# subgraphs from connected_components() instead
for gr in (G.subgraph(c).copy() for c in nx.connected_components(G)):
    # Nodes of the subgraph 'gr'
    nodes_gr = list(gr)
    print(nodes_gr)
    c = fine_tuned_clustering_q(G)
    print(c)
    Q = metrics.modularity_r(adj, c, np.unique(c), r=0)
def build_and_save_edgefile(
    db,
    filename='edges.csv',
    business_category=None,
    state=None,
    city=None,
    backbone_extract=True,
    backbone_alpha=0.4,
):
    """
    Builds a business-review network for the given geographic context and
    saves the edgelist to a file

    Parameters
    ----------
    db : pymongo object
        The pymongo object that can be used to query the database
    filename : string
        Path of the file to save the graph edgelist
    state : string
        The state within which to query businesses, e.g., AZ, NC, PA
    city : string
        The city within which to query businesses, e.g., 'Phoenix', 'Charlotte'
    business_category : string
        The business category to query for, e.g., Restaurant, Cafe, etc.
    backbone_extract : boolean
        True if backbone extraction should be applied to the constructed network
    backbone_alpha : double
        Between 0 and 1. If backbone_extract is True, this is the alpha value
        that is used to determine how aggressively to prune edges. Low values
        are more aggressive

    Returns
    -------
    out :
        A networkx object constructed using the given geographic context
    """
    import pandas as pd
    import networkx as nx
    from .network_utils import extract_backbone

    print("Building business-review list")
    business_review_list = buildBusinessUserList(
        db, business_category=business_category, state=state, city=city)

    print("Building edgelist")
    edgelist = buildEdgeList(business_review_list,
                             column_name="businesses", threshold=1)
    edge_df = pd.DataFrame(edgelist)

    G = nx.from_pandas_edgelist(edge_df, edge_attr=True)

    if backbone_extract:
        print("Extracting backbone")
        G = extract_backbone(G, backbone_alpha)

    # Save edge list
    print('Saving the file')
    nx.write_edgelist(G, filename)
    print('Done')

    # Done, return the graph object
    return G
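# Hedged usage sketch (not part of the original source): one way the helper
# above might be called. The MongoClient URI and the "yelp" database name are
# placeholders, not values taken from the original project.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")  # hypothetical connection
db = client["yelp"]                                 # hypothetical database name
G = build_and_save_edgefile(db,
                            filename="phoenix_restaurants_edges.csv",
                            business_category="Restaurant",
                            state="AZ",
                            city="Phoenix",
                            backbone_extract=True,
                            backbone_alpha=0.4)
print(G.number_of_nodes(), G.number_of_edges())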
def __buid_graph(self, crowdtangle_shares_df, coordinated_shares_df, percentile_edge_weight=90, timestamps=False): logger.info("Bulding graph") coord_df = coordinated_shares_df[['account_url', 'url', 'share_date']].reset_index(drop=True) coord_graph = nx.from_pandas_edgelist(coord_df, 'account_url', 'url', create_using=nx.DiGraph()) # Remove self loop node edges coord_graph.remove_edges_from(nx.selfloop_edges(coord_graph)) #Bipartite graph creation account_urls = list(coordinated_shares_df['account_url'].unique()) urls = list(coordinated_shares_df['url'].unique()) bipartite_graph = nx.Graph() logger.debug('adding nodes') bipartite_graph.add_nodes_from(urls, bipartite=0) bipartite_graph.add_nodes_from(account_urls, bipartite=1) logger.debug('Adding edges') for index, row in coord_df.iterrows(): bipartite_graph.add_edge(row['account_url'], row['url'], share_date=row['share_date']) #Graph projection with account nodes logger.debug('Projecting graph') full_graph = bipartite.weighted_projected_graph( bipartite_graph, account_urls) #pandas helper dataframe to calcule graph node attribues crowdtangle_shares_df['account_name'] = crowdtangle_shares_df[ 'account_name'].astype(str) crowdtangle_shares_df['account_handle'] = crowdtangle_shares_df[ 'account_handle'].astype(str) crowdtangle_shares_df[ 'account_pageAdminTopCountry'] = crowdtangle_shares_df[ 'account_pageAdminTopCountry'].astype(str) crowtangle_shares_gb = crowdtangle_shares_df.groupby('account_url') crowdtangle_shares_df['name_changed'] = ( crowtangle_shares_gb['account_name'].transform("nunique")) > 1 crowdtangle_shares_df['handle_changed'] = ( crowtangle_shares_gb['account_handle'].transform("nunique")) > 1 crowdtangle_shares_df['page_admin_top_country_changed'] = ( crowtangle_shares_gb['account_pageAdminTopCountry'].transform( "nunique")) > 1 crowdtangle_shares_df['account_name'] = crowtangle_shares_gb[ 'account_name'].transform(lambda col: '|'.join(col.unique())) crowdtangle_shares_df['account_handle'] = crowtangle_shares_gb[ 'account_handle'].transform(lambda col: '|'.join(col.unique())) crowdtangle_shares_df[ 'account_pageAdminTopCountry'] = crowtangle_shares_gb[ 'account_pageAdminTopCountry'].transform( lambda col: '|'.join(col.unique())) crowdtangle_shares_df[[ 'account_name', 'account_handle', 'account_pageAdminTopCountry', 'name_changed', 'handle_changed', 'page_admin_top_country_changed' ]] crowtangle_shares_gb = crowdtangle_shares_df.reset_index().groupby( ['account_url']) account_info_df = crowtangle_shares_gb['index'].agg([('shares', 'count')]) account_info_df = account_info_df.merge(pd.DataFrame( crowtangle_shares_gb['is_coordinated'].apply( lambda x: (x == True).sum())).rename( columns={'is_coordinated': 'coord_shares'}), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_subscriberCount'].agg([ ('avg_account_subscriber_count', 'mean') ]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_name'].agg([('account_name', 'first') ]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['name_changed'].agg('first'), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['handle_changed'].agg('first'), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['page_admin_top_country_changed'].agg( 'first'), left_index=True, right_index=True) account_info_df = account_info_df.merge( 
crowtangle_shares_gb['account_pageAdminTopCountry'].agg([ ('account_page_admin_top_country', 'first') ]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_handle'].agg([('account_handle', 'first')]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_platform'].agg([('account_platform', 'first')]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_platformId'].agg([ ('account_platformId', 'first') ]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_verified'].agg([('account_verified', 'first')]), left_index=True, right_index=True) account_info_df = account_info_df.merge( crowtangle_shares_gb['account_accountType'].agg([ ('account_account_type', 'first') ]), left_index=True, right_index=True) account_info_df = account_info_df.reset_index().rename( columns={'account_url': 'account_url'}) #filter the dataframe with the graph nodes node_info_df = account_info_df[account_info_df['account_url'].isin( list(full_graph.nodes))] attributes = [] for node in full_graph.nodes(): records = node_info_df[node_info_df['account_url'] == node] attributes.append(node) attributes.append({ 'shares': records['shares'].values[0], 'coord_shares': records['coord_shares'].values[0], 'avg_account_subscriber_count': records['avg_account_subscriber_count'].values[0], 'account_platform': records['account_platform'].values[0], 'account_name': records['account_name'].values[0], 'account_verified': 1 if records['account_verified'].values[0] else 0, 'account_handle': records['account_handle'].values[0], 'name_changed': 1 if records['name_changed'].values[0] else 0, 'handle_changed': 1 if records['handle_changed'].values[0] else 0, 'page_admin_top_country_changed': 1 if records['page_admin_top_country_changed'].values[0] else 0, 'account_page_admin_top_country': records['account_page_admin_top_country'].values[0], 'account_account_type': records['account_account_type'].values[0] }) #update graph attributes it = iter(attributes) nx.set_node_attributes(full_graph, dict(zip(it, it))) #set the percentile_edge_weight number of repetedly coordinated link sharing to keep q = np.percentile( [d['weight'] for (u, v, d) in full_graph.edges(data=True)], percentile_edge_weight) #create a new graph where node degree > 0 highly_connected_graph = full_graph.subgraph( [key for (key, value) in full_graph.degree if value > 0]).copy() #remove where the edge weitght is less than the given percentile value edges_to_remove = [ (u, v) for (u, v, d) in highly_connected_graph.edges(data=True) if d['weight'] < q ] highly_connected_graph.remove_edges_from(edges_to_remove) highly_connected_graph.remove_nodes_from( list(nx.isolates(highly_connected_graph))) if timestamps: logger.info("Calculating nodes timestamps") vec_func = np.vectorize( lambda u, v: bipartite_graph.get_edge_data(u, v)['share_date']) attributes = [] for (u, v) in highly_connected_graph.edges(): attributes.append((u, v)) attributes.append({ "timestamp_coord_share": vec_func( np.intersect1d( list(list(bipartite_graph.neighbors(u))), list(list(bipartite_graph.neighbors(v)))), u) }) it = iter(attributes) nx.set_edge_attributes(highly_connected_graph, dict(zip(it, it))) logger.info("timestamps calculated") #find and annotate nodes-components connected_components = list( nx.connected_components(highly_connected_graph)) components_df = pd.DataFrame({ "node": 
connected_components, "component": [*range(1, len(connected_components) + 1)] }) components_df['node'] = components_df['node'].apply(lambda x: list(x)) components_df = components_df.explode('node') #add cluster to simplyfy the analysis of large components cluster_df = pd.DataFrame( community_louvain.best_partition(highly_connected_graph).items(), columns=['node', 'cluster']) #re-calculate the degree on the graph degree_df = pd.DataFrame(list(highly_connected_graph.degree()), columns=['node', 'degree']) #sum up the edge weights of the adjacent edges for each node strength_df = pd.DataFrame(list( highly_connected_graph.degree(weight='weight')), columns=['node', 'strength']) attributes_df = components_df.merge(cluster_df, on='node').merge( degree_df, on='node').merge(strength_df, on='node') #update graph attribues nx.set_node_attributes( highly_connected_graph, attributes_df.set_index('node').to_dict('index')) logger.info("graph builded") return highly_connected_graph, q
            else:
                w = G[u][v].get(weight, 1)
        except KeyError:
            w = 0
        # Double count self-loops if the graph is undirected.
        if u == v and not directed:
            w *= 2
        return w - in_degree[u] * out_degree[v] * norm

    Q = sum(val(u, v) for c in communities for u, v in product(c, repeat=2))
    return Q * norm


# graph node and mmCCnode
G2 = nx.from_pandas_edgelist(cc, 'node', 'mmCCNode')
# local brebjes
# nx.draw_networkx(G2)
# plt.show()

# find the connected components of the new graph and is the chromosome
# the initial population with2 is concomp
concomp = list(list(nx.connected_components(G2)))
numCluster = len(concomp)


def CrossFirstGen():
    for i in range(1, popoulationInit.shape[1] - 1):
        ran = random.randint(
            1, (popoulationInit.shape[1] - 2))  # is the random number
datasets = [
    "flickrEdges_adj.tsv", "email-EuAll_adj.tsv", "roadNet-TX_adj.tsv",
    "roadNet-PA.adj.tsv", "roadNet-CA_adj.tsv"
]

for data in datasets:
    fileE = pd.read_csv(data, sep='\t')  # read from file
    Title = fileE.columns
    print(Title)
    fileE = fileE.rename(columns={
        Title[0]: 'Source',
        Title[1]: 'Target',
        Title[2]: 'Degree'
    })
    Gs = nx.from_pandas_edgelist(fileE, source='Source', target='Target')
    Ga = Gs.to_directed()
    centrality = nx.eigenvector_centrality(Ga, max_iter=20)
    sorted((v, f"{c:0.2f}") for v, c in centrality.items())
    fileE['Eigenvalue'] = np.nan
    n = len(fileE['Source'])
    dataS = fileE['Source']
    dataE = fileE['Eigenvalue']
    dataD = fileE['Degree']
    dataE = np.array(dataE).reshape((len(dataE), 1))
    dataS = np.array(dataS).reshape((len(dataS), 1))
    Data = np.hstack((dataE, dataS))
    data = pd.DataFrame(Data, columns=['Eigen', 'Source'])
    data = data.fillna(0)
    dataEigen = np.array(data['Eigen'])
    threshold = dataEigen.mean()
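# Hedged continuation sketch (assumption, not from the original source): the
# fragment above ends by taking a mean as a threshold, so one plausible next
# step is to keep only the nodes whose eigenvector centrality is above the
# average score.
mean_score = np.mean(list(centrality.values()))
high_centrality_nodes = [v for v, c in centrality.items() if c > mean_score]
print(len(high_centrality_nodes), "nodes above the mean eigenvector centrality")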
def _boiler_generator_assn(eia_transformed_dfs, eia923_years=pc.working_years['eia923'], eia860_years=pc.working_years['eia860'], debug=False): """ Creates a set of more complete boiler generator associations. Creates a unique unit_id_pudl for each collection of boilers and generators within a plant that have ever been associated with each other, based on the boiler generator associations reported in EIA860. Unfortunately, this information is not complete for years before 2014, as the gas turbine portion of combined cycle power plants in those earlier years were not reporting their fuel consumption, or existence as part of the plants. For years 2014 and on, EIA860 contains a unit_id_eia value, allowing the combined cycle plant compoents to be associated with each other. For many plants not listed in the reported boiler generator associations, it is nonetheless possible to associate boilers and generators on a one-to-one basis, as they use identical strings to describe the units. In the end, between the reported BGA table, the string matching, and the unit_id_eia values, it's possible to create a nearly complete mapping of the generation units, at least for 2014 and later. Args: eia_transformed_dfs (dict): a dictionary of post-transform dataframes representing the EIA database tables. eia923_years (list-like): a list of the years of EIA 923 data that should be used to infer the boiler-generator associations. By default it is all the working years of data. eia860_years (list-like): a list of the years of EIA 860 data that should be used to infer the boiler-generator associations. By default it is all the working years of data. debug (bool): If True, include columns in the returned dataframe indicating by what method the individual boiler generator associations were inferred. Returns: eia_transformed_dfs (dict): Returns the same dictionary of dataframes that was passed in, and adds a new dataframe to it representing the boiler-generator associations as records containing plant_id_eia, generator_id, boiler_id, and unit_id_pudl Raises: AssertionError: If the boiler - generator association graphs are not bi-partite, meaning generators only connect to boilers, and boilers only connect to generators. AssertionError: If all the boilers do not end up with the same unit_id each year. AssertionError: If all the generators do not end up with the same unit_id each year. 
""" # if you're not ingesting both 860 and 923, the bga is not compilable if not (eia860_years and eia923_years): return # compile and scrub all the parts logger.info("Inferring complete EIA boiler-generator associations.") bga_eia860 = eia_transformed_dfs['boiler_generator_assn_eia860'].copy() bga_eia860 = _restrict_years(bga_eia860, eia923_years, eia860_years) bga_eia860['generator_id'] = bga_eia860.generator_id.astype(str) bga_eia860['boiler_id'] = bga_eia860.boiler_id.astype(str) # bga_eia860 = bga_eia860.drop(['utility_id_eia'], axis=1) gen_eia923 = eia_transformed_dfs['generation_eia923'].copy() gen_eia923 = _restrict_years(gen_eia923, eia923_years, eia860_years) gen_eia923['generator_id'] = gen_eia923.generator_id.astype(str) gen_eia923 = gen_eia923.set_index(pd.DatetimeIndex(gen_eia923.report_date)) gen_eia923_gb = gen_eia923.groupby( [pd.Grouper(freq='AS'), 'plant_id_eia', 'generator_id']) gen_eia923 = gen_eia923_gb['net_generation_mwh'].sum().reset_index() gen_eia923['missing_from_923'] = False # compile all of the generators gens_eia860 = eia_transformed_dfs['generators_eia860'].copy() gens_eia860 = _restrict_years(gens_eia860, eia923_years, eia860_years) gens_eia860['generator_id'] = gens_eia860.generator_id.astype(str) gens = pd.merge(gen_eia923, gens_eia860, on=['plant_id_eia', 'report_date', 'generator_id'], how='outer') gens = gens[[ 'plant_id_eia', 'report_date', 'generator_id', 'unit_id_eia', 'net_generation_mwh', 'missing_from_923' ]].drop_duplicates() gens['generator_id'] = gens['generator_id'].astype(str) # create the beginning of a bga compilation w/ the generators as the # background bga_compiled_1 = pd.merge( gens, bga_eia860, on=['plant_id_eia', 'generator_id', 'report_date'], how='outer') # Create a set of bga's that are linked, directly from bga8 bga_assn = bga_compiled_1[bga_compiled_1['boiler_id'].notnull()].copy() bga_assn['bga_source'] = 'eia860_org' # Create a set of bga's that were not linked directly through bga8 bga_unassn = bga_compiled_1[bga_compiled_1['boiler_id'].isnull()].copy() bga_unassn = bga_unassn.drop(['boiler_id'], axis=1) # Side note: there are only 6 generators that appear in bga8 that don't # apear in gens9 or gens8 (must uncomment-out the og_tag creation above) # bga_compiled_1[bga_compiled_1['og_tag'].isnull()] bf_eia923 = eia_transformed_dfs['boiler_fuel_eia923'].copy() bf_eia923 = _restrict_years(bf_eia923, eia923_years, eia860_years) bf_eia923['boiler_id'] = bf_eia923.boiler_id.astype(str) bf_eia923['total_heat_content_mmbtu'] = \ bf_eia923['fuel_consumed_units'] * bf_eia923['fuel_mmbtu_per_unit'] bf_eia923 = bf_eia923.set_index(pd.DatetimeIndex(bf_eia923.report_date)) bf_eia923_gb = bf_eia923.groupby( [pd.Grouper(freq='AS'), 'plant_id_eia', 'boiler_id']) bf_eia923 = bf_eia923_gb.agg({ 'total_heat_content_mmbtu': pudl.helpers.sum_na, }).reset_index() bf_eia923.drop_duplicates( subset=['plant_id_eia', 'report_date', 'boiler_id'], inplace=True) # Create a list of boilers that were not in bga8 bf9_bga = bf_eia923.merge(bga_compiled_1, on=['plant_id_eia', 'boiler_id', 'report_date'], how='outer', indicator=True) bf9_not_in_bga = bf9_bga[bf9_bga['_merge'] == 'left_only'] bf9_not_in_bga = bf9_not_in_bga.drop(['_merge'], axis=1) # Match the unassociated generators with unassociated boilers # This method is assuming that some the strings of the generators and the # boilers are the same bga_unassn = bga_unassn.merge( bf9_not_in_bga[['plant_id_eia', 'boiler_id', 'report_date']], how='left', left_on=['report_date', 'plant_id_eia', 
'generator_id'], right_on=['report_date', 'plant_id_eia', 'boiler_id']) bga_unassn.sort_values(['report_date', 'plant_id_eia'], inplace=True) bga_unassn['bga_source'] = None bga_unassn.loc[bga_unassn.boiler_id.notnull(), 'bga_source'] = 'string_assn' bga_compiled_2 = bga_assn.append(bga_unassn) bga_compiled_2.sort_values(['plant_id_eia', 'report_date'], inplace=True) bga_compiled_2['missing_from_923'].fillna(value=True, inplace=True) # Connect the gens and boilers in units bga_compiled_units = bga_compiled_2.loc[ bga_compiled_2['unit_id_eia'].notnull()] bga_gen_units = bga_compiled_units.drop(['boiler_id'], axis=1) bga_boil_units = bga_compiled_units[[ 'plant_id_eia', 'report_date', 'boiler_id', 'unit_id_eia' ]].copy() bga_boil_units.dropna(subset=['boiler_id'], inplace=True) # merge the units with the boilers bga_unit_compilation = bga_gen_units.merge( bga_boil_units, how='outer', on=['plant_id_eia', 'report_date', 'unit_id_eia'], indicator=True) # label the bga_source bga_unit_compilation. \ loc[bga_unit_compilation['bga_source'].isnull(), 'bga_source'] = 'unit_connection' bga_unit_compilation.drop(['_merge'], axis=1, inplace=True) bga_non_units = bga_compiled_2[bga_compiled_2['unit_id_eia'].isnull()] # combine the unit compilation and the non units bga_compiled_3 = bga_non_units.append(bga_unit_compilation) # resort the records and the columns bga_compiled_3.sort_values(['plant_id_eia', 'report_date'], inplace=True) bga_compiled_3 = bga_compiled_3[[ 'plant_id_eia', 'report_date', 'generator_id', 'boiler_id', 'unit_id_eia', 'bga_source', 'net_generation_mwh', 'missing_from_923' ]] # label plants that have 'bad' generator records (generators that have MWhs # in gens9 but don't have connected boilers) create a df with just the bad # plants by searching for the 'bad' generators bad_plants = bga_compiled_3[(bga_compiled_3['boiler_id'].isnull()) & (bga_compiled_3['net_generation_mwh'] > 0)].\ drop_duplicates(subset=['plant_id_eia', 'report_date']) bad_plants = bad_plants[['plant_id_eia', 'report_date']] # merge the 'bad' plants back into the larger frame bga_compiled_3 = bga_compiled_3.merge(bad_plants, how='outer', on=['plant_id_eia', 'report_date'], indicator=True) # use the indicator to create labels bga_compiled_3['plant_w_bad_generator'] = \ np.where(bga_compiled_3._merge == 'both', True, False) # Note: At least one gen has reported MWh in 923, but could not be # programmatically mapped to a boiler # we don't need this one anymore bga_compiled_3 = bga_compiled_3.drop(['_merge'], axis=1) # create a label for generators that are unmapped but in 923 bga_compiled_3['unmapped_but_in_923'] = \ np.where((bga_compiled_3.boiler_id.isnull()) & ~bga_compiled_3.missing_from_923 & (bga_compiled_3.net_generation_mwh == 0), True, False) # create a label for generators that are unmapped bga_compiled_3['unmapped'] = np.where(bga_compiled_3.boiler_id.isnull(), True, False) bga_out = bga_compiled_3.drop('net_generation_mwh', axis=1) bga_out.loc[bga_out.unit_id_eia.isnull(), 'unit_id_eia'] = None bga_for_nx = bga_out[[ 'plant_id_eia', 'report_date', 'generator_id', 'boiler_id', 'unit_id_eia' ]] # If there's no boiler... 
there's no boiler-generator association bga_for_nx = bga_for_nx.dropna(subset=['boiler_id']).drop_duplicates() # Need boiler & generator specific ID strings, or they look like # the same node to NX bga_for_nx['generators'] = 'p' + bga_for_nx.plant_id_eia.astype(str) + \ '_g' + bga_for_nx.generator_id.astype(str) bga_for_nx['boilers'] = 'p' + bga_for_nx.plant_id_eia.astype(str) + \ '_b' + bga_for_nx.boiler_id.astype(str) # dataframe to accumulate the unit_ids in bga_w_units = pd.DataFrame() # We want to start our unit_id counter anew for each plant: for pid in bga_for_nx.plant_id_eia.unique(): bga_byplant = bga_for_nx[bga_for_nx.plant_id_eia == pid].copy() # Create a graph from the dataframe of boilers and generators. It's a # multi-graph, meaning the same nodes can be connected by more than one # edge -- this allows us to preserve multiple years worth of boiler # generator association information for later inspection if need be: bga_graph = nx.from_pandas_edgelist(bga_byplant, source='generators', target='boilers', edge_attr=True, create_using=nx.MultiGraph()) # Each connected sub-graph is a generation unit: gen_units = [ bga_graph.subgraph(c).copy() for c in nx.connected_components(bga_graph) ] # Assign a unit_id to each subgraph, and extract edges into a dataframe for unit_id, unit in zip(range(len(gen_units)), gen_units): # All the boiler-generator association graphs should be bi-partite, # meaning generators only connect to boilers, and boilers only # connect to generators. if not nx.algorithms.bipartite.is_bipartite(unit): raise AssertionError( f"Non-bipartite generation unit graph found." f"plant_id_eia={pid}, unit_id_pudl={unit_id}.") nx.set_edge_attributes(unit, name='unit_id_pudl', values=unit_id + 1) new_unit_df = nx.to_pandas_edgelist(unit) bga_w_units = bga_w_units.append(new_unit_df) bga_w_units = bga_w_units.sort_values( ['plant_id_eia', 'unit_id_pudl', 'generator_id', 'boiler_id']) bga_w_units = bga_w_units.drop(['source', 'target'], axis=1) # Check whether the PUDL unit_id values we've inferred conflict with # the unit_id_eia values that were reported to EIA. Are there any PUDL # unit_id values that have more than 1 EIA unit_id_eia within them? 
bga_unit_id_eia_counts = \ bga_w_units.groupby(['plant_id_eia', 'unit_id_pudl'])['unit_id_eia'].\ nunique().to_frame().reset_index() bga_unit_id_eia_counts = bga_unit_id_eia_counts.rename( columns={'unit_id_eia': 'unit_id_eia_count'}) bga_unit_id_eia_counts = pd.merge(bga_w_units, bga_unit_id_eia_counts, on=['plant_id_eia', 'unit_id_pudl']) too_many_codes = \ bga_unit_id_eia_counts[bga_unit_id_eia_counts.unit_id_eia_count > 1] too_many_codes = \ too_many_codes[~too_many_codes.unit_id_eia.isnull()].\ groupby(['plant_id_eia', 'unit_id_pudl'])['unit_id_eia'].unique() for row in too_many_codes.iteritems(): logger.warning(f"Multiple EIA unit codes:" f"plant_id_eia={row[0][0]}, " f"unit_id_pudl={row[0][1]}, " f"unit_id_eia={row[1]}") bga_w_units = bga_w_units.drop('unit_id_eia', axis=1) # These assertions test that all boilers and generators ended up in the # same unit_id across all the years of reporting: pgu_gb = bga_w_units.groupby(['plant_id_eia', 'generator_id'])['unit_id_pudl'] if not (pgu_gb.nunique() == 1).all(): raise AssertionError("Inconsistent inter-annual BGA assignment!") pbu_gb = bga_w_units.groupby(['plant_id_eia', 'boiler_id'])['unit_id_pudl'] if not (pbu_gb.nunique() == 1).all(): raise AssertionError("Inconsistent inter-annual BGA assignment!") bga_w_units = bga_w_units.drop('report_date', axis=1) bga_w_units = bga_w_units[[ 'plant_id_eia', 'unit_id_pudl', 'generator_id', 'boiler_id' ]].drop_duplicates() bga_out = pd.merge(bga_out, bga_w_units, how='left', on=['plant_id_eia', 'generator_id', 'boiler_id']) bga_out['unit_id_pudl'] = (bga_out['unit_id_pudl'].fillna( value=0).astype(int)) if not debug: bga_out = bga_out[~bga_out.missing_from_923 & ~bga_out.plant_w_bad_generator & ~bga_out.unmapped_but_in_923 & ~bga_out.unmapped] bga_out = bga_out.drop([ 'missing_from_923', 'plant_w_bad_generator', 'unmapped_but_in_923', 'unmapped' ], axis=1) bga_out = bga_out.drop_duplicates(subset=[ 'report_date', 'plant_id_eia', 'boiler_id', 'generator_id' ]) eia_transformed_dfs['boiler_generator_assn_eia860'] = bga_out return eia_transformed_dfs
print(df_nodes.head())
print(df_nodes.describe())

# check edges df
print(df_edges.head())
print(df_edges.describe())

# merge nodes and edges df
df_complete = pd.concat([df_nodes, df_edges], axis=1)
print(df_complete.head())

# create a networkx directional graph
G = nx.from_pandas_edgelist(
    df=df_complete,
    source="fromUser",        # fieldname of django qs
    target="toUser",          # fieldname of django qs
    edge_attr=["amount"],     # edge weights from django qs
    create_using=nx.DiGraph   # type of graph (here directional)
)

# inspect graph object
print(nx.info(G))

# check if edge metadata is added correctly
print(list(G.edges(data=True))[0:5])

# add node metadata
# nx.set_node_attributes(G, df_nodes["followerCount"], "followerCount")
for node, metadata in df_nodes.set_index("username").iterrows():
    for key, val in metadata.items():  # treat df features as dict
        G.nodes[node][key] = val
print(list(G.nodes(data=True))[0:5])
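# Hedged alternative (not from the original source): the commented-out
# set_node_attributes call above can be made to work by passing a dict of
# per-node attribute dicts, which avoids the explicit double loop.
node_attrs = df_nodes.set_index("username").to_dict("index")  # {node: {col: value}}
nx.set_node_attributes(G, node_attrs)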
def centrality_scores(): """ Calculating and plotting centrality scores for the FULL Georgia Reply Network """ # Retrieving the FULL Georgia Reply Network: (created in more_complicated_reply_network()) df = pd.read_csv(m4r_data + "ga_reply_network_full.csv") # Converting this to a networkx graph object: G = nx.from_pandas_edgelist(df, "Source", "Target", ["Weight"], create_using=nx.DiGraph()) # Calculating centrality scores: in_central = nx.algorithms.centrality.in_degree_centrality(G) # In-Degree out_central = nx.algorithms.centrality.out_degree_centrality( G) # Out-Degree p_central = nx.algorithms.link_analysis.pagerank_alg.pagerank( G) # PageRank with alpha = 0.85 # Inserting the scores into a single dataframe:s d1 = pd.DataFrame().from_dict(p_central, orient="index", columns=["PageRank"]).reset_index() d2 = pd.DataFrame().from_dict(in_central, orient="index", columns=["In-Degree"]).reset_index() d3 = pd.DataFrame().from_dict(out_central, orient="index", columns=["Out-Degree"]).reset_index() centrality_df = ((d1.merge(d2, on="index")).merge(d3, on="index")).rename( {"index": "user.id"}, axis=1) # Now retrieving the Louvain community for each account (if the account is in community 8 or 42) gephi = pd.read_csv(m4r_data + "ga_reply_network_truncated_louvain_communities.csv")[[ "Id", "modularity_class" ]].rename( { "Id": "user.id", "modularity_class": "Community" }, axis=1) gephi = gephi[gephi["Community"].isin( [8, 42] )] # Only care about Community labels for nodes in Communities 8 or 42 (the largest communities) # Now retrieving the class (bot or human label) users = pickle.load( open(m4r_data + "us_and_georgia_accounts.p", "rb"))[["user.id", "user.screen_name", "predicted_class"]] # Adding account class and community labels to the centrality score dataframe centrality_df = centrality_df.merge(users[["user.id", "predicted_class"]], on="user.id", how="left").rename( {"predicted_class": "Class"}, axis=1) centrality_df = centrality_df.fillna("Unknown") centrality_df = centrality_df.merge(gephi[["user.id", "Community"]], how="left", on="user.id") centrality_df = (centrality_df.fillna("Other")) centrality_df = centrality_df.merge(users[["user.id", "user.screen_name"]], on="user.id", how="left") centrality_df["Class"] = centrality_df["Class"].replace({ "human": "Human", "bot": "Bot" }) centrality_df["Community"] = centrality_df["Community"].replace({ 8: "Group 8", 42: "Group 42" }) centrality_df = centrality_df.sort_values(["Class", "Community"], ascending=False) # Node user ids with the 7 highest in degrees populars = list( centrality_df.sort_values("In-Degree", ascending=False)["user.id"].iloc[:7]) # Plotting the centrality scores against each other... 
pal = [ sns.color_palette("tab10")[7], sns.color_palette("tab10")[0], sns.color_palette("tab10")[1] ] fig, axes = plt.subplots(1, 2, figsize=(8, 3.1), sharey=True) fig.suptitle('Comparing Centrality Measures for the Georgia Reply Network', fontweight="bold") # In degree vs Out degree sns.scatterplot( ax=axes[0], data=centrality_df[centrality_df["user.id"].isin(populars) == False], y="In-Degree", x="Out-Degree", hue="Class", s=120, alpha=0.9, palette=pal, style="Community") # In degree vs PageRank sns.scatterplot( ax=axes[1], data=centrality_df[centrality_df["user.id"].isin(populars) == False], y="In-Degree", x="PageRank", hue="Class", s=120, alpha=0.9, palette=pal, style="Community") # Legend handles, labels = axes[0].get_legend_handles_labels() new_handles = [handles[i] for i in [0, 2, 3, 1, 4, 6, 7, 5]] new_labels = [labels[i] for i in [0, 2, 3, 1, 4, 6, 7, 5]] axes[0].legend([], [], frameon=False) axes[1].legend([], [], frameon=False) fig.legend(new_handles, new_labels, loc="center left", bbox_to_anchor=[0.67, 0.5]) # Names of axes axes[0].set_ylabel("In-Degree Centrality", fontweight="bold") #axes[1].set_ylabel("In-Degree Centrality", fontweight = "bold") axes[0].set_xlabel("Out-Degree Centrality", fontweight="bold") axes[1].set_xlabel("PageRank Centrality", fontweight="bold") # Adjusting plot size to accommodate legend: right determines how much space is left for the legend - e.g. right = 0.8 leaves 80% of space for legend plt.subplots_adjust(right=0.69, wspace=0.04, hspace=0.1) #plt.savefig(figure_path + "ga_centrality_measures.pdf", bbox_inches = "tight") plt.show()
for col in df2.columns:
    logReturns2[col] = np.log(df2[col]).diff(-1)

# =============================================================================
# prepare edges and nodes
# =============================================================================
corrMatrix2 = logReturns2.corr()
edges2 = corrMatrix2.stack().reset_index()
edges2.columns = ['theOne', 'theOther', 'correlation']

# remove self correlations
# list containing the pairwise correlation information
edges2 = edges2.loc[edges2['theOne'] != edges2['theOther']].copy()

# undirected graph with weights corresponding to the correlation magnitude
G2 = nx.from_pandas_edgelist(edges2, 'theOne', 'theOther',
                             edge_attr=['correlation'])
print(nx.info(G2))

#%%
def get_density(G):
    # How many possible edges?
    possible_edges = len(G.nodes) * (len(G.nodes) - 1) / 2
    actual_edges = len(G.edges)
    return actual_edges / possible_edges

print('density: ', get_density(G2))
print('node connectivity: ', nx.node_connectivity(G2))
    cross = series.apply(lambda x: list(itertools.combinations(x, 2)))
    lists = [item for sublist in cross for item in sublist]
    source = [i[0] for i in lists]
    target = [i[1] for i in lists]
    edges = pd.DataFrame({"source": source, "target": target})
    edges["weight"] = 1
    return edges.groupby(by=["source", "target"], as_index=False)["weight"].sum()


df_edges = get_edges(data=df, column="CPC Class - DWPI")

g = nx.from_pandas_edgelist(df_edges, source="source", target="target",
                            edge_attr=["weight"], create_using=nx.Graph)

df = df_edges
clubs = list(df.source.unique())
people = list(df.target.unique())
dict(zip(clubs, clubs))

plt.figure(figsize=(12, 12))
# 1. Create the graph
# tools for creating a graph from a pandas dataframe
import networkx as nx

# Create graph edges first
col_node_1 = 'Person_ID'
col_node_2 = 'Company_ID'
col_attr = ['start_date', 'end_date']
G = nx.from_pandas_edgelist(df=df, source=col_node_1, target=col_node_2,
                            edge_attr=col_attr)
# OR
G = nx.from_pandas_edgelist(df=df.assign(has_worked=1), source=col_node_1,
                            target=col_node_2,
                            edge_attr=col_attr + ['has_worked'])

# Add node labels
list_person_1 = []  # list of person with special type
G.add_nodes_from(df.loc[(df[col_node_1].isin(list_person_1)),
                        col_node_1].unique().tolist(),
                 label='person_type_1')
G.add_nodes_from(df.loc[~(df[col_node_1].isin(list_person_1)),
                        col_node_1].unique().tolist(),
                 label='person_type_2')
G.add_nodes_from(df[col_node_2].unique().tolist(), label='Company')

# Create links for people with same email and same telephone
# There is only one variable for email
col_edge = 'email'
emails = df[[col_node_1, col_edge]].dropna(subset=[col_edge]).drop_duplicates()
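# Hedged continuation sketch (assumption, not from the original source): one
# way to turn the `emails` table above into person-person links is a self-merge
# on the email column, dropping self/duplicate pairs, then adding those pairs
# to G. The 'same_email' attribute name is purely illustrative.
pairs = emails.merge(emails, on=col_edge, suffixes=('_a', '_b'))
pairs = pairs[pairs[col_node_1 + '_a'] < pairs[col_node_1 + '_b']]  # keep each pair once
G.add_edges_from(zip(pairs[col_node_1 + '_a'], pairs[col_node_1 + '_b']),
                 link_type='same_email')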
    network_data = network_data.append(friend_frame)


# In[ ]:

network_data = network_data.reset_index(drop=True)
# checks
network_data.tail()


# In[ ]:

# changing the column name to suit nx import
network_data.columns = ['source', 'target']
# Considering each (user_id, friend) pair as an edge of a graph, constructing the graph
graph = nx.from_pandas_edgelist(network_data)
# logging time
end_time = time.time()
print("Took", end_time - start_time, "s")


# In[ ]:

# credits https://www.kaggle.com/crailtap/basic-network-analysis-tutorial
# basic info
nx.info(graph)


# In[ ]:

# check density
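# Hedged continuation sketch (not from the original source): the cell above
# stops at "# check density"; a density check would typically be
nx.density(graph)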
def train( edgelist, node_data, layer_size, num_samples, batch_size=100, num_epochs=10, learning_rate=0.005, dropout=0.0, target_name="subject", ): """ Train a GraphSAGE model on the specified graph G with given parameters, evaluate it, and save the model. Args: edgelist: Graph edgelist node_data: Feature and target data for nodes layer_size: A list of number of hidden nodes in each layer num_samples: Number of neighbours to sample at each layer batch_size: Size of batch for inference num_epochs: Number of epochs to train the model learning_rate: Initial Learning rate dropout: The dropout (0->1) """ # Extract target and encode as a one-hot vector target_encoding = feature_extraction.DictVectorizer(sparse=False) node_targets = target_encoding.fit_transform( node_data[[target_name]].to_dict("records") ) node_ids = node_data.index # Extract the feature data. These are the feature vectors that the Keras model will use as input. # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication. node_features = node_data[feature_names] # Create graph from edgelist and set node features and node type Gnx = nx.from_pandas_edgelist(edgelist) # Convert to StellarGraph and prepare for ML G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features) # Split nodes into train/test using stratification. train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split( node_ids, node_targets, train_size=140, test_size=None, stratify=node_targets ) # Split test set into test and validation val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split( test_nodes, test_targets, train_size=500, test_size=None ) # Create mappers for GraphSAGE that input data from the graph to the model generator = GraphSAGENodeGenerator( G, batch_size, num_samples, seed=42 ) train_gen = generator.flow(train_nodes, train_targets) val_gen = generator.flow(val_nodes, val_targets) # GraphSAGE model model = GraphSAGE( layer_sizes=layer_size, generator=train_gen, bias=True, dropout=dropout ) # Expose the input and output sockets of the model: x_inp, x_out = model.default_model(flatten_output=True) # Snap the final estimator layer to x_out prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out) # Create Keras model for training model = keras.Model(inputs=x_inp, outputs=prediction) model.compile( optimizer=optimizers.Adam(lr=learning_rate), loss=losses.categorical_crossentropy, metrics=[metrics.categorical_accuracy], ) # Train model history = model.fit_generator( train_gen, epochs=num_epochs, validation_data=val_gen, verbose=2, shuffle=True, ) # Evaluate on test set and print metrics test_metrics = model.evaluate_generator(generator.flow(test_nodes, test_targets)) print("\nTest Set Metrics:") for name, val in zip(model.metrics_names, test_metrics): print("\t{}: {:0.4f}".format(name, val)) # Get predictions for all nodes all_predictions = model.predict_generator(generator.flow(node_ids)) # Turn predictions back into the original categories node_predictions = pd.DataFrame( target_encoding.inverse_transform(all_predictions), index=node_ids ) accuracy = np.mean( [ "subject=" + gt_subject == p for gt_subject, p in zip( node_data["subject"], node_predictions.idxmax(axis=1) ) ] ) print("All-node accuracy: {:3f}".format(accuracy)) # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them # Save the trained model save_str = "_n{}_l{}_d{}_r{}".format( "_".join([str(x) for x in num_samples]), "_".join([str(x) for 
x in layer_size]), dropout, learning_rate, ) model.save("cora_example_model" + save_str + ".h5") # We must also save the target encoding to convert model predictions with open("cora_example_encoding" + save_str + ".pkl", "wb") as f: pickle.dump([target_encoding], f)
ic_ltp = 0
ic_lta = 0
ltp_lta = 0
count_samples = 0

print(nx.__version__)

# parameters of the script
path = '../data/icpsr/DS0001/paluck-edgelist.csv'
globalThreshold = 1.5
activation_ci = 0.075
cascadeParameter = 0.5

edgelist = pd.read_csv(path)
G = nx.from_pandas_edgelist(edgelist, source='ID', target='PEERID')
print(nx.info(G))
print(nx.number_of_nodes(G))
print(f'connected?\t{nx.is_connected(G)}')
print(f'# of connected components:\t{nx.number_connected_components(G)}')

components = nx.connected_components(G)
sglist = [G.subgraph(c) for c in nx.connected_components(G)]
gmat = []
for g in sglist:
    # np.float was removed from recent NumPy releases; the builtin float works
    gmat.append(nx.to_numpy_matrix(g, dtype=float))

graph_size_list = [nx.number_of_nodes(g) for g in sglist]
graph_size_series = pd.Series(graph_size_list)
ordered_graph_series = graph_size_series.sort_values()
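# Hedged follow-up sketch (not from the original source): since the script
# sorts component sizes, a common companion step is pulling out the largest
# connected component for further analysis.
largest_cc_nodes = max(nx.connected_components(G), key=len)
G_largest = G.subgraph(largest_cc_nodes).copy()
print(G_largest.number_of_nodes(), G_largest.number_of_edges())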
for word in most_similar:
    # Find stats for this word
    word_dice_stats = dice_significance(cooc, word, key_words)
    word_dice_stats = dict_value_sort(word_dice_stats)
    # Choose top nearby matches
    top_neighbours = list(word_dice_stats.keys())[0:10]
    layer2_names += top_neighbours
    new_graph_data = [{"from": word.upper(), "to": set_name,
                       "stat": word_dice_stats[set_name]}
                      for set_name in top_neighbours]
    # Add to existing graph data
    graph_data += new_graph_data

# Convert graph data to pandas dataframe
gd = pd.DataFrame.from_dict(graph_data)

# Create co-occurrence graph
# G = nx.from_numpy_matrix(cooc)
G = nx.from_pandas_edgelist(gd, "from", "to", "stat")

# Generate colours
colours, sizes = [], []
l0, l1, l2 = {}, {}, {}
for node in G:
    if node in ALL_SEARCH_TERMS.upper():
        col = 'darkblue'  # 'red'
        size = counts[node] * 1000  # 5000
        l0[node] = node
    elif node in layer1_names:
        col = 'lightblue'  # 'orange'
        size = counts[node] * 1000  # 2500
        l1[node] = node
    else:
        col = 'cyan'  # 'blue'
def download_signor(): col_names = [ 'ENTITYA', 'TYPEA', 'IDA', 'DATABASEA', 'ENTITYB', 'TYPEB', 'IDB', 'DATABASEB', 'EFFECT', 'MECHANISM', 'RESIDUE', 'SEQUENCE', 'TAX_ID', 'CELL_DATA', 'TISSUE_DATA', 'MODULATOR_COMPLEX', 'TARGET_COMPLEX', 'MODIFICATIONA', 'MODASEQ', 'MODIFICATIONB', 'MODBSEQ', 'PMID', 'DIRECT', 'SENTENCE', 'SIGNOR_ID', 'NA1', 'NA2', 'NA3'] table = pd.read_csv('https://signor.uniroma2.it/getData.php?organism=9606', names=col_names, delimiter='\t', index_col=None, error_bad_lines=False, encoding='utf-8' ) # filter out non direct table = table.loc[table['DIRECT'] == 't'] # Filter out non descriptive table = table.loc[~table['MECHANISM'].isnull()] # Drop SIGNOR edges, these are generally complexes table = table[~(table['DATABASEA'] == 'SIGNOR')] table = table[~(table['DATABASEB'] == 'SIGNOR')] # Not sure what they mean, so will remove. Ideally other DBs have this info table = table[~(table['MECHANISM'] == 'post transcriptional regulation')] col_a = ['ENTITYA', 'TYPEA', 'IDA', 'DATABASEA'] col_b = ['ENTITYB', 'TYPEB', 'IDB', 'DATABASEB'] cols = ['name', 'species_type', 'id', 'db'] species_a = table[col_a].copy() species_b = table[col_b].copy() species_a.rename(columns={i: j for i, j in zip(col_a, cols)}, inplace=True) species_b.rename(columns={i: j for i, j in zip(col_b, cols)}, inplace=True) species_a.drop_duplicates(inplace=True) species_b.drop_duplicates(inplace=True) all_species = pd.concat([species_a, species_b]) all_species.drop_duplicates(inplace=True) def map_to_activate_inhibit(row): effect = '' mechanism = row['MECHANISM'] if 'down-regulates' in row['EFFECT']: effect = 'inhibit' elif 'up-regulates' in row['EFFECT']: effect = 'activate' if mechanism in edge_standards: mechanism = edge_standards[mechanism] elif mechanism == 'transcriptional regulation': if effect == 'inhibit': mechanism = 'repression' elif effect == 'activate': mechanism = 'expression' if effect == '': return mechanism else: return "|".join([effect, mechanism]) # relabel edge types table['interactionType'] = table.apply(map_to_activate_inhibit, axis=1) table['databaseSource'] = 'SIGNOR' table['pmid'] = table['PMID'] table['source'] = table['ENTITYA'] table['target'] = table['ENTITYB'] protein_graph = nx.from_pandas_edgelist( table, 'source', 'target', edge_attr=['interactionType', 'databaseSource'], create_using=nx.DiGraph() ) # add names to graph for row in all_species.values: name, species_type, id_name, db = row if species_type != 'protein': species_type = 'compound' if species_type == 'protein': species_type = 'gene' protein_graph.add_node(name, databaseSource='SIGNOR', speciesType=species_type) nx.write_gpickle(protein_graph, _p_name)
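# Quick sanity check (sketch, not part of download_signor()): reload the pickled
# graph via the module-level `_p_name` path used above and inspect one edge's attributes.
g = nx.read_gpickle(_p_name)
print(len(g.nodes), 'nodes,', len(g.edges), 'edges')
u, v = next(iter(g.edges()))
print(u, '->', v, g.edges[u, v])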
elast = elast[elast.direction != "zero"]
elast.effector = elast.effector.str.replace("EX_", "").str.replace("_m", "")

for sa in samples:
    sns.kdeplot(elast[elast.id == sa].elasticity, bw=4, shade=True, label=sa)
plt.legend()
plt.xlabel("elasticity [a.u.]")
plt.ylabel("density")
plt.savefig("elast_densities.svg")
plt.close()

for sa in samples:
    e = elast[elast.id == sa].copy()
    e = e[(e.elasticity.abs() > 0.5) & (e.norm_elasticity.abs() > 0.5)]
    graph = nx.from_pandas_edgelist(e, source="effector", target="reaction",
                                    edge_attr="elasticity")
    for idx, _ in graph.nodes(data=True):
        if idx.startswith("EX_"):
            d = direction(e, idx)
            cl = "import flux" if d == "forward" else "export flux"
        elif idx[0].isupper():
            cl = "abundance"
        else:
            cl = "diet"
        # graph.node was removed in networkx 2.4; graph.nodes is the supported accessor
        graph.nodes[idx]["class"] = cl
    circos = nxviz.CircosPlot(graph, node_labels=True, rotate_labels=True,
                              edge_color="elasticity", edge_cmap="bwr",
                              edge_limits=(-150, 150), node_color="class",
                              node_grouping="class", node_order="class",
                              figsize=(20, 18))
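    # Sketch (assumes nxviz <= 0.6, where plot objects expose .draw()): render the
    # circos layout and write one figure per sample.
    circos.draw()
    plt.savefig("circos_{}.svg".format(sa))
    plt.close()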
def process(self): if not all(osp.exists(path) for path in self.processed_paths): helpers.log("Processing") names = ["Users", "Items", self.timeattribute] data = read_interaction(self.raw_paths.interactions, names=names) # Relabel users and items ( # user_ids : 1, ..., len(users) + 1 # item_ids : len(users), ..., len(users) + len(items) + 1) users = sorted(data.Users.unique()) items = sorted(data.Items.unique()) helpers.log(f"Number of users {len(users)}") helpers.log(f"Number of items {len(items)}") helpers.log(f"Number of nodes {len(users) + len(items)}") user_ids = range(1, len(users) + 1) item_ids = range(max(user_ids) + 1, max(user_ids) + len(items) + 1) user2id = dict(zip(users, user_ids)) item2id = dict(zip(items, item_ids)) data.Users = data.Users.apply(lambda l: user2id[l]) data.Items = data.Items.apply(lambda l: item2id[l]) self.users = sorted(data.Users.unique()) self.items = sorted(data.Items.unique()) # Create an interaction graph interaction_graph = nx.from_pandas_edgelist( data[names], source="Users", target="Items", create_using=nx.MultiDiGraph, edge_attr=self.timeattribute, ) interaction_graph.add_nodes_from(self.users + self.items) # Initialize complete interaction history self.init_neighborhood(interaction_graph=interaction_graph) # Convert interaction graph to list of interactions sorted by time interactions = sorted( interaction_graph.edges(data=True), key=lambda l: l[2][self.timeattribute], ) # identify the indices for train, valid, and test splits train_size = int(data.shape[0] * self.train_rate) valid_size = int(data.shape[0] * self.valid_rate) train_indices = range(train_size) valid_indices = range(train_size, train_size + valid_size) test_indices = range(train_size + valid_size, data.shape[0]) torch.save( (interactions, self.neighborhood, self.timestamps), self.processed_paths.interactions) torch.save((self.users, self.items), self.processed_paths.nodes) torch.save( (train_indices, valid_indices, test_indices), self.processed_paths.splits) helpers.log("Done!") self.interactions, self.neighborhood, self.timestamps = torch.load( self.processed_paths.interactions ) self.users, self.items = torch.load(self.processed_paths.nodes) self.train_split, self.valid_split, self.test_split = torch.load( self.processed_paths.splits ) self.__indices__ = range(len(self.interactions))
    return res_dct


# Run and append, for each node in the dataset, the neighbour with the maximum CC value
ccNodeVal(G, cc)

# Now we can create the initial population from each node and its ccNodeVal;
# the neighbours are the genes, so we can apply mutation and crossover to them
neighbors = {node: list(G.neighbors(node)) for node in G.nodes()}
nodes = list(G.nodes())

# Graph of each node and its mmCCNode
G2 = nx.from_pandas_edgelist(cc, 'node', 'mmCCNode')

# The connected components of the new graph form the chromosome,
# i.e. the initial population is concomp
concomp = list(nx.connected_components(G2))
numCluster = len(concomp)
nx.draw_networkx(G2)
plt.show()

# communities = {node: community for community, node in enumerate(neighbors.keys())}
from quality import modularity

# Modularity for the whole chromosome
mod1 = modularity(G2, concomp)
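# Optional cross-check (sketch, not in the original script): networkx ships its own
# modularity implementation, which can be used to validate the custom `quality` module.
from networkx.algorithms.community import quality

mod_nx = quality.modularity(G2, concomp)
print(mod1, mod_nx)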
df_events_attendance = pd.read_csv(EVENTS_ATTENDANCE_CLEANED)

# count the number of events attended per individual - degree - attribute
df_degree = df_events_attendance.groupby([SURNAME, FULLNAME], as_index=False).count()
# change name of column header
df_degree.rename(columns={EVENT_ID: "eventscount"}, inplace=True)

# get family size
df_family = df_degree[[SURNAME, FULLNAME]].groupby(SURNAME, as_index=False).count()
# color - family size; 1 is a single individual
df_family.rename(columns={FULLNAME: "familysize"}, inplace=True)
# df_family[df_family["color"] > 1]  # maybe not
df_degree = df_degree.merge(df_family, on=[SURNAME])

# create graph object from the df_events_attendance dataframe
B = nx.from_pandas_edgelist(df=df_events_attendance, source=SURNAME, target=EVENT_ID)
# get the two bipartite sets - X will be the individuals, Y the events
X, Y = nx.bipartite.sets(B)
# project the bipartite graph onto a weighted graph of common participation in an event
G = nx.algorithms.bipartite.weighted_projected_graph(B, X)

# set number of events attended as node attribute
df_degree = df_degree.groupby(SURNAME, as_index=False).sum()
df_degree.set_index(SURNAME, inplace=True)
nx.set_node_attributes(
    G, pd.Series(df_degree["eventscount"], index=df_degree.index).to_dict(),
    "eventscount")

# set family size as attribute
df_family.set_index(SURNAME, inplace=True)
nx.set_node_attributes(
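# Note (not from the original script): nx.bipartite.sets(B) raises
# nx.AmbiguousSolution when B has more than one connected component. A more
# defensive way to obtain the two sides is to read them straight from the
# edgelist columns, e.g.
#   X = set(df_events_attendance[SURNAME])
#   Y = set(df_events_attendance[EVENT_ID])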
import pandas as pd
import networkx as nx

df = pd.read_csv("Data/waiting_list.csv")
df.sort_values("Datetime", inplace=True)

G = nx.from_pandas_edgelist(df, "From", "To", edge_attr=True, create_using=nx.MultiDiGraph)
students = df.shape[0]

# Build candidate swap chains from the simple cycles of the request graph
swaps, chain_no = [], 1
for cycle in list(nx.simple_cycles(G)):
    n = len(cycle)
    step = 1
    for u, v in zip(cycle, cycle[1:] + [cycle[0]]):
        for i in range(students):
            if df.iloc[i, 1] == u and df.iloc[i, 2] == v:
                swaps.append({"chain_no": chain_no, "chain_size": n, "Step": step,
                              "Student": df.iloc[i, 0], "From": u, "To": v})
                step += 1
                break
    chain_no += 1

# Sort and filter output csv
output = pd.DataFrame(swaps)
output.sort_values(["chain_size", "chain_no", "Step"],
def to_networkx_graph(data, create_using=None, multigraph_input=False): """Make a NetworkX graph from a known data structure. The preferred way to call this is automatically from the class constructor >>> d = {0: {1: {'weight':1}}} # dict-of-dicts single edge (0,1) >>> G = nx.Graph(d) instead of the equivalent >>> G = nx.from_dict_of_dicts(d) Parameters ---------- data : object to be converted Current known types are: any NetworkX graph dict-of-dicts dict-of-lists list of edges Pandas DataFrame (row per edge) numpy matrix numpy ndarray scipy sparse matrix pygraphviz agraph create_using : NetworkX graph constructor, optional (default=nx.Graph) Graph type to create. If graph instance, then cleared before populated. multigraph_input : bool (default False) If True and data is a dict_of_dicts, try to create a multigraph assuming dict_of_dict_of_lists. If data and create_using are both multigraphs then create a multigraph from a multigraph. """ # NX graph if hasattr(data, "adj"): try: result = from_dict_of_dicts(data.adj, create_using=create_using, multigraph_input=data.is_multigraph()) if hasattr(data, 'graph'): # data.graph should be dict-like result.graph.update(data.graph) if hasattr(data, 'nodes'): # data.nodes should be dict-like result._node.update((n, dd.copy()) for n, dd in data.nodes.items()) return result except: raise nx.NetworkXError("Input is not a correct NetworkX graph.") # pygraphviz agraph if hasattr(data, "is_strict"): try: return nx.nx_agraph.from_agraph(data, create_using=create_using) except: raise nx.NetworkXError("Input is not a correct pygraphviz graph.") # dict of dicts/lists if isinstance(data, dict): try: return from_dict_of_dicts(data, create_using=create_using, multigraph_input=multigraph_input) except: try: return from_dict_of_lists(data, create_using=create_using) except: raise TypeError("Input is not known type.") # list or generator of edges if (isinstance(data, (list, tuple)) or any(hasattr(data, attr) for attr in ['_adjdict', 'next', '__next__'])): try: return from_edgelist(data, create_using=create_using) except: raise nx.NetworkXError("Input is not a valid edge list") # Pandas DataFrame try: import pandas as pd if isinstance(data, pd.DataFrame): if data.shape[0] == data.shape[1]: try: return nx.from_pandas_adjacency(data, create_using=create_using) except: msg = "Input is not a correct Pandas DataFrame adjacency matrix." raise nx.NetworkXError(msg) else: try: return nx.from_pandas_edgelist(data, edge_attr=True, create_using=create_using) except: msg = "Input is not a correct Pandas DataFrame edge-list." raise nx.NetworkXError(msg) except ImportError: msg = 'pandas not found, skipping conversion test.' warnings.warn(msg, ImportWarning) # numpy matrix or ndarray try: import numpy if isinstance(data, (numpy.matrix, numpy.ndarray)): try: return nx.from_numpy_matrix(data, create_using=create_using) except: raise nx.NetworkXError( "Input is not a correct numpy matrix or array.") except ImportError: warnings.warn('numpy not found, skipping conversion test.', ImportWarning) # scipy sparse matrix - any format try: import scipy if hasattr(data, "format"): try: return nx.from_scipy_sparse_matrix(data, create_using=create_using) except: raise nx.NetworkXError( "Input is not a correct scipy sparse matrix type.") except ImportError: warnings.warn('scipy not found, skipping conversion test.', ImportWarning) raise nx.NetworkXError( "Input is not a known data type for conversion.")
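# Usage sketch of the DataFrame dispatch implemented above (not part of the networkx
# source): a square DataFrame is routed to from_pandas_adjacency, any other shape to
# from_pandas_edgelist with one row per edge. Column names below are illustrative.
import pandas as pd
import networkx as nx

adj = pd.DataFrame([[0, 1], [1, 0]], index=["a", "b"], columns=["a", "b"])
G_adj = nx.Graph(adj)        # square frame -> treated as an adjacency matrix

edges = pd.DataFrame({"source": ["a", "b"], "target": ["b", "c"], "weight": [1, 2]})
G_edges = nx.Graph(edges)    # non-square frame -> treated as an edge list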
def test_from_edgelist_one_attr(self): Gtrue = nx.Graph([('E', 'C', {'weight': 10}), ('B', 'A', {'weight': 7}), ('A', 'D', {'weight': 4})]) G = nx.from_pandas_edgelist(self.df, 0, 'b', 'weight') assert_graphs_equal(G, Gtrue)
import pandas as pd import networkx as nx wd = '/Users/ewenwang/Documents/practice_data/conversion_rate/' file = ['round1_ijcai_18_train_20180301.txt', 'round1_ijcai_18_test_a_20180301.txt', 'round1_ijcai_18_test_b_20180418.txt'] print('loading...') train = pd.read_csv(wd+file[0], sep=" ") test_a = pd.read_csv(wd+file[1], sep=" ") test_b = pd.read_csv(wd+file[2], sep=" ") data = pd.concat([train, test_a, test_b]) print('graph generating...') G_ui = nx.from_pandas_edgelist(df=data, source='user_id', target='item_id', edge_attr='is_trade', create_using=nx.MultiGraph()) pagerank = pd.DataFrame(list(nx.pagerank(G_ui).items()), columns=['node', 'pagerank']) print('merging...') data = data.merge(pagerank, left_on='user_id', right_on='node', how='left').merge(pagerank, left_on='item_id', right_on='node', how='left') pagerank_data = pd.DataFrame(columns=['instance_id', 'user_pagerank', 'item_pagerank']) pagerank_data['instance_id'] = data['instance_id'] pagerank_data['user_pagerank'] = data['pagerank_x'] pagerank_data['item_pagerank'] = data['pagerank_y'] print('saving...') pagerank_data.to_csv(wd+'pagerank_union.txt', index=False, sep=' ')
def build_network(df, edges, nodes): ##### BUILDING NETWORK ###### column_edge = edges column_ID = nodes data_to_merge = df[[column_ID, column_edge]].dropna( subset=[column_edge]).drop_duplicates() # select columns, remove NaN #data_to_merge = df[[column_ID, column_edge]] # To create connections between people who have the same number, # join data with itself on the 'ID' column. data_to_merge = data_to_merge.merge(data_to_merge[[ column_ID, column_edge ]].rename(columns={column_ID: column_ID + "_2"}), on=column_edge) # By joining the data with itself, people will have a connection with themselves. # Remove self connections, to keep only connected people who are different. d = data_to_merge[~(data_to_merge[column_ID]==data_to_merge[column_ID+"_2"])] \ .dropna()[[column_ID, column_ID+"_2", column_edge]] # To avoid counting twice the connections (person 1 connected to person 2 and person 2 connected to person 1) # we force the first ID to be "lower" then ID_2 d.drop(d.loc[d[column_ID + "_2"] < d[column_ID]].index.tolist(), inplace=True) print('pre pro done...') ######################### G = nx.from_pandas_edgelist(df=d, source=column_ID, target=column_ID + '_2', edge_attr=column_edge) G.add_nodes_from(nodes_for_adding=df.class_name.tolist()) #G.add_nodes_from(nodes_for_adding=list(df[nodes].values())) print('#nodes:', len(G.nodes()), 'and', '#edges:', len(G.edges())) degrees = [val for (node, val) in G.degree()] np.save("data/degrees.npy", degrees) degree_values = sorted(set(degrees)) histogram = [ degrees.count(i) / float(nx.number_of_nodes(G)) for i in degree_values ] fig, ax = plt.subplots() # the histogram of the data n, bins, patches = plt.hist(degrees, 50) #plt.bar(range(len(histogram)),histogram) #plt.xticks(range(len(histogram)), degree_values) plt.xlabel('Degree') plt.ylabel('Fraction of Nodes') plt.xlim(0, max(degree_values)) #plt.xscale('log') plt.yscale('log') plt.tight_layout() plt.savefig("plot/nodes_degress.pdf") fig, ax = plt.subplots() plt.plot(range(len(histogram)), sorted(histogram, reverse=True), 'o') plt.xticks(range(len(histogram)), degree_values) plt.xlabel('Degree') plt.ylabel('Fraction of Nodes') plt.xscale('log') plt.yscale('log') plt.tight_layout() plt.savefig("plot/power_law.pdf") closeness_centrality = nx.closeness_centrality(G) #print(closeness_centrality) write_dict(closeness_centrality, "data/closeness_centrality.csv") print('closeness done...') betweenness_centrality = nx.betweenness_centrality(G) #print(betweenness_centrality) write_dict(betweenness_centrality, "data/betweenness_centrality.csv") print('betweenness done...') degree_centrality = nx.degree_centrality(G) #print(degree_centrality) write_dict(degree_centrality, "data/degree_centrality.csv") print('centrality done...') fig, ax = plt.subplots() # the histogram of the data nx.draw(G) #plt.bar(range(len(histogram)),histogram) #plt.xticks(range(len(histogram)), degree_values) plt.tight_layout() #plt.show() plt.savefig("plot/net.pdf")
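# Hypothetical invocation (sketch; column names are placeholders, not from the original data):
# build_network(df, edges="phone_number", nodes="customer_id")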
def makeGraph(request, df_enron): G = networkx.from_pandas_edgelist(df_enron, 'fromId', 'toId', edge_attr=True) di = { 'CEO': 1, 'Director': 2, 'Employee': 3, 'In House Lawyer': 4, 'Manager': 5, 'Managing Director': 6, 'President': 7, 'Trader': 8, 'Unknown': 9, 'Vice President': 10 } df_rejob = df_enron.replace({"fromJobtitle": di}) df_attributes = df_enron[['fromId', 'fromJobtitle', 'fromEmail']].drop_duplicates() df_attributes.columns = ['fromId', 'job', 'fromEmail'] df_attributesx = df_rejob[['fromId', 'fromJobtitle', 'fromEmail']].drop_duplicates() job = df_attributes.set_index('fromId').to_dict('i') jobx = df_attributesx.set_index('fromId').to_dict('i') fromEmail = df_attributes.set_index('fromEmail').to_dict('i') networkx.set_node_attributes(G, job) networkx.set_node_attributes(G, jobx) networkx.set_node_attributes(G, fromEmail) #jobs = ['Employee','Vice President','Unknown','Manager','CEO','Trader','Director','President','Managing Director','In House Lawyer'] degrees = dict(networkx.degree(G)) networkx.set_node_attributes(G, name='degree', values=degrees) adjusted_node_size = dict([(node, (degree + 5) - ((degree + 5) * 0.3)) for node, degree in networkx.degree(G)]) networkx.set_node_attributes(G, name='adjusted_node_size', values=adjusted_node_size) size_by_this_attribute = 'adjusted_node_size' color_by_this_attribute = 'fromJobtitle' color_palette = Category10[10] TOOLTIPS = [ ("Person ID", "@index"), ("Email", "@fromEmail"), ("people communicated with", "@degree"), ("Jobtitle", "@job"), ] graph_size = int(request.POST.get('graph_size', '720')) plot = figure(tooltips=TOOLTIPS, tools="pan,zoom_in,wheel_zoom,save,reset,box_select,undo", active_scroll='wheel_zoom', x_range=Range1d(-20, 20), y_range=Range1d(-20, 20), title='Enron Emails', plot_width=graph_size, plot_height=graph_size) plot.axis.visible = False N_graph = from_networkx(G, networkx.spring_layout, scale=100) N_graph.node_renderer.glyph = Circle(size=size_by_this_attribute, fill_color=linear_cmap( color_by_this_attribute, color_palette, 1, 10)) N_graph.edge_renderer.glyph = MultiLine(line_alpha=10, line_width=1) plot.renderers.append(N_graph) item_text = json.dumps(json_item(plot)) return item_text
def fullSizeGraph(request): import pandas as pd import networkx import matplotlib.pyplot as plt import numpy as np df_enron = filterDataByTime(request, pd.read_csv(request.FILES['csv_data'])) #from bokeh.io import output_notebook, show, save from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine from bokeh.plotting import figure from bokeh.models.graphs import from_networkx from bokeh.palettes import Category10 from bokeh.transform import linear_cmap from bokeh.embed import json_item #output_notebook() #remove this when not using notebook G = networkx.from_pandas_edgelist(df_enron, 'fromId', 'toId', edge_attr=True) di = { 'CEO': 1, 'Director': 2, 'Employee': 3, 'In House Lawyer': 4, 'Manager': 5, 'Managing Director': 6, 'President': 7, 'Trader': 8, 'Unknown': 9, 'Vice President': 10 } df_rejob = df_enron.replace({"fromJobtitle": di}) df_attributes = df_enron[['fromId', 'fromJobtitle']].drop_duplicates() df_attributes.columns = ['fromId', 'job'] df_attributesx = df_rejob[['fromId', 'fromJobtitle']].drop_duplicates() job = df_attributes.set_index('fromId').to_dict('i') jobx = df_attributesx.set_index('fromId').to_dict('i') networkx.set_node_attributes(G, job) networkx.set_node_attributes(G, jobx) #jobs = ['Employee','Vice President','Unknown','Manager','CEO','Trader','Director','President','Managing Director','In House Lawyer'] degrees = dict(networkx.degree(G)) networkx.set_node_attributes(G, name='degree', values=degrees) adjusted_node_size = dict([(node, (degree + 5) - ((degree + 5) * 0.3)) for node, degree in networkx.degree(G)]) networkx.set_node_attributes(G, name='adjusted_node_size', values=adjusted_node_size) size_by_this_attribute = 'adjusted_node_size' color_by_this_attribute = 'fromJobtitle' color_palette = Category10[10] TOOLTIPS = [ ("Person ID", "@index"), ("people communicated with", "@degree"), ("Jobtitle", "@job"), ] plot = figure(tooltips=TOOLTIPS, tools="pan,zoom_in,wheel_zoom,save,reset,box_select,undo", active_scroll='wheel_zoom', x_range=Range1d(-20, 20), y_range=Range1d(-20, 20), title='Enron Emails', plot_width=950, plot_height=950) plot.axis.visible = False N_graph = from_networkx(G, networkx.spring_layout, scale=100) N_graph.node_renderer.glyph = Circle(size=size_by_this_attribute, fill_color=linear_cmap( color_by_this_attribute, color_palette, 1, 10)) N_graph.edge_renderer.glyph = MultiLine(line_alpha=10, line_width=1) plot.renderers.append(N_graph) item_text = json.dumps(json_item(plot)) return django.http.JsonResponse(item_text, safe=False)
for k, v in vt_copy.items(): if len(v) == 0: del visualisation_table[k] network_table = {} i = 0 for k, v in visualisation_table.items(): for c_to_c in v: network_table[i] = c_to_c i += 1 for k, v in network_table.items(): network_table[k] = sorted(v) network_df = pd.DataFrame(network_table) network_df = network_df.transpose() network_df.columns = ['c1', 'c2'] plt.figure(figsize=(12, 12)) g = nx.from_pandas_edgelist(network_df, source='c1', target='c2') nx.draw_networkx(g) plt.show() #abstract abstract_words = data_13_20['Abstract'] abs_dict = {} for i, words in enumerate(abstract_words): if type(words) == str: words = words.replace(';', '') words = stop_word(words, stop_words) abs_dict[i] = words else: pass #keywords
def network_graph(yearRange, AccountToSearch): edge1 = pd.read_csv('edge1.csv') node1 = pd.read_csv('node1.csv') # filter the record by datetime, to enable interactive control through the input box edge1['Datetime'] = "" # add empty Datetime column to edge1 dataframe accountSet = set() # contain unique account for index in range(0, len(edge1)): edge1['Datetime'][index] = datetime.strptime(edge1['Date'][index], '%d/%m/%Y') if edge1['Datetime'][index].year < yearRange[0] or edge1['Datetime'][ index].year > yearRange[1]: edge1.drop(axis=0, index=index, inplace=True) continue accountSet.add(edge1['Source'][index]) accountSet.add(edge1['Target'][index]) # to define the centric point of the networkx layout shells = [] shell1 = [] shell1.append(AccountToSearch) shells.append(shell1) shell2 = [] for ele in accountSet: if ele != AccountToSearch: shell2.append(ele) shells.append(shell2) G = nx.from_pandas_edgelist(edge1, 'Source', 'Target', ['Source', 'Target', 'TransactionAmt', 'Date'], create_using=nx.MultiDiGraph()) nx.set_node_attributes( G, node1.set_index('Account')['CustomerName'].to_dict(), 'CustomerName') nx.set_node_attributes(G, node1.set_index('Account')['Type'].to_dict(), 'Type') # pos = nx.layout.spring_layout(G) # pos = nx.layout.circular_layout(G) # nx.layout.shell_layout only works for more than 3 nodes if len(shell2) > 1: pos = nx.drawing.layout.shell_layout(G, shells) else: pos = nx.drawing.layout.spring_layout(G) for node in G.nodes: G.nodes[node]['pos'] = list(pos[node]) if len(shell2) == 0: traceRecode = [] # contains edge_trace, node_trace, middle_node_trace node_trace = go.Scatter(x=tuple([1]), y=tuple([1]), text=tuple([str(AccountToSearch)]), textposition="bottom center", mode='markers+text', marker={ 'size': 50, 'color': 'LightSkyBlue' }) traceRecode.append(node_trace) node_trace1 = go.Scatter(x=tuple([1]), y=tuple([1]), mode='markers', marker={ 'size': 50, 'color': 'LightSkyBlue' }, opacity=0) traceRecode.append(node_trace1) figure = { "data": traceRecode, "layout": go.Layout(title='Interactive Transaction Visualization', showlegend=False, margin={ 'b': 40, 'l': 40, 'r': 40, 't': 40 }, xaxis={ 'showgrid': False, 'zeroline': False, 'showticklabels': False }, yaxis={ 'showgrid': False, 'zeroline': False, 'showticklabels': False }, height=600) } return figure traceRecode = [] # contains edge_trace, node_trace, middle_node_trace ############################################################################################################################################################ colors = list( Color('lightcoral').range_to(Color('darkred'), len(G.edges()))) colors = ['rgb' + str(x.rgb) for x in colors] index = 0 for edge in G.edges: x0, y0 = G.nodes[edge[0]]['pos'] x1, y1 = G.nodes[edge[1]]['pos'] weight = float(G.edges[edge]['TransactionAmt']) / max( edge1['TransactionAmt']) * 10 trace = go.Scatter(x=tuple([x0, x1, None]), y=tuple([y0, y1, None]), mode='lines', line={'width': weight}, marker=dict(color=colors[index]), line_shape='spline', opacity=1) traceRecode.append(trace) index = index + 1 ############################################################################################################################################################### node_trace = go.Scatter(x=[], y=[], hovertext=[], text=[], mode='markers+text', textposition="bottom center", hoverinfo="text", marker={ 'size': 50, 'color': 'LightSkyBlue' }) index = 0 for node in G.nodes(): x, y = G.nodes[node]['pos'] hovertext = "CustomerName: " + str( G.nodes[node]['CustomerName']) + "<br>" + "AccountType: 
" + str( G.nodes[node]['Type']) text = node1['Account'][index] node_trace['x'] += tuple([x]) node_trace['y'] += tuple([y]) node_trace['hovertext'] += tuple([hovertext]) node_trace['text'] += tuple([text]) index = index + 1 traceRecode.append(node_trace) ################################################################################################################################################################ middle_hover_trace = go.Scatter(x=[], y=[], hovertext=[], mode='markers', hoverinfo="text", marker={ 'size': 20, 'color': 'LightSkyBlue' }, opacity=0) index = 0 for edge in G.edges: x0, y0 = G.nodes[edge[0]]['pos'] x1, y1 = G.nodes[edge[1]]['pos'] hovertext = "From: " + str( G.edges[edge]['Source']) + "<br>" + "To: " + str( G.edges[edge]['Target']) + "<br>" + "TransactionAmt: " + str( G.edges[edge]['TransactionAmt'] ) + "<br>" + "TransactionDate: " + str(G.edges[edge]['Date']) middle_hover_trace['x'] += tuple([(x0 + x1) / 2]) middle_hover_trace['y'] += tuple([(y0 + y1) / 2]) middle_hover_trace['hovertext'] += tuple([hovertext]) index = index + 1 traceRecode.append(middle_hover_trace) ################################################################################################################################################################# figure = { "data": traceRecode, "layout": go.Layout(title='Interactive Transaction Visualization', showlegend=False, hovermode='closest', margin={ 'b': 40, 'l': 40, 'r': 40, 't': 40 }, xaxis={ 'showgrid': False, 'zeroline': False, 'showticklabels': False }, yaxis={ 'showgrid': False, 'zeroline': False, 'showticklabels': False }, height=600, clickmode='event+select', annotations=[ dict(ax=(G.nodes[edge[0]]['pos'][0] + G.nodes[edge[1]]['pos'][0]) / 2, ay=(G.nodes[edge[0]]['pos'][1] + G.nodes[edge[1]]['pos'][1]) / 2, axref='x', ayref='y', x=(G.nodes[edge[1]]['pos'][0] * 3 + G.nodes[edge[0]]['pos'][0]) / 4, y=(G.nodes[edge[1]]['pos'][1] * 3 + G.nodes[edge[0]]['pos'][1]) / 4, xref='x', yref='y', showarrow=True, arrowhead=3, arrowsize=4, arrowwidth=1, opacity=1) for edge in G.edges ]) } return figure
def test_from_edgelist_no_attr(self): Gtrue = nx.Graph([('E', 'C', {}), ('B', 'A', {}), ('A', 'D', {})]) G = nx.from_pandas_edgelist(self.df, 0, 'b',) assert_graphs_equal(G, Gtrue)
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

edge_list = pd.read_csv('stack_network_links.csv')
G = nx.from_pandas_edgelist(edge_list)

plt.figure(figsize=(20, 20))
nx.draw(G, with_labels=True, edge_color='grey', node_color='blue', node_size=10,
        pos=nx.spring_layout(G, k=0.2, iterations=50))
# iterations: number of layout refinement passes; it does not pull nearby nodes any closer together.
# k: optimal distance between nodes, in [0, 1]; larger values spread nodes farther apart.
plt.show()
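# Optional refinement (sketch, assumes networkx >= 2.1 where spring_layout accepts a
# seed): fix the layout seed so repeated runs give the same picture, and save the
# figure instead of only showing it.
pos = nx.spring_layout(G, k=0.2, iterations=50, seed=42)
plt.figure(figsize=(20, 20))
nx.draw(G, pos=pos, with_labels=True, edge_color='grey', node_color='blue', node_size=10)
plt.savefig('stack_network.png', dpi=150)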