Example 1
 def test_from_edgelist_multidigraph_and_edge_attr(self):
     # example from issue #2374
     Gtrue = nx.MultiDiGraph([('X1', 'X4', {'Co': 'zA', 'Mi': 0, 'St': 'X1'}),
                              ('X1', 'X4', {'Co': 'zB', 'Mi': 54, 'St': 'X2'}),
                              ('X1', 'X4', {'Co': 'zB', 'Mi': 49, 'St': 'X3'}),
                              ('X1', 'X4', {'Co': 'zB', 'Mi': 44, 'St': 'X4'}),
                              ('Y1', 'Y3', {'Co': 'zC', 'Mi': 0, 'St': 'Y1'}),
                              ('Y1', 'Y3', {'Co': 'zC', 'Mi': 34, 'St': 'Y2'}),
                              ('Y1', 'Y3', {'Co': 'zC', 'Mi': 29, 'St': 'X2'}),
                              ('Y1', 'Y3', {'Co': 'zC', 'Mi': 24, 'St': 'Y3'}),
                              ('Z1', 'Z3', {'Co': 'zD', 'Mi': 0, 'St': 'Z1'}),
                              ('Z1', 'Z3', {'Co': 'zD', 'Mi': 14, 'St': 'X3'}),
                              ('Z1', 'Z3', {'Co': 'zE', 'Mi': 9, 'St': 'Z2'}),
                              ('Z1', 'Z3', {'Co': 'zE', 'Mi': 4, 'St': 'Z3'})])
     df = pd.DataFrame({
         'O': ['X1', 'X1', 'X1', 'X1', 'Y1', 'Y1', 'Y1', 'Y1', 'Z1', 'Z1', 'Z1', 'Z1'],
         'D': ['X4', 'X4', 'X4', 'X4', 'Y3', 'Y3', 'Y3', 'Y3', 'Z3', 'Z3', 'Z3', 'Z3'],
         'St': ['X1', 'X2', 'X3', 'X4', 'Y1', 'Y2', 'X2', 'Y3', 'Z1', 'X3', 'Z2', 'Z3'],
         'Co': ['zA', 'zB', 'zB', 'zB', 'zC', 'zC', 'zC', 'zC', 'zD', 'zD', 'zE', 'zE'],
         'Mi': [0, 54, 49, 44, 0, 34, 29, 24, 0, 14, 9, 4]})
     G1 = nx.from_pandas_edgelist(df, source='O', target='D',
                                  edge_attr=True,
                                  create_using=nx.MultiDiGraph())
     G2 = nx.from_pandas_edgelist(df, source='O', target='D',
                                  edge_attr=['St', 'Co', 'Mi'],
                                  create_using=nx.MultiDiGraph())
     assert_graphs_equal(G1, Gtrue)
     assert_graphs_equal(G2, Gtrue)
Example 2
 def test_from_edgelist_all_attr(self):
     Gtrue = nx.Graph([('E', 'C', {'cost': 9, 'weight': 10}),
                       ('B', 'A', {'cost': 1, 'weight': 7}),
                       ('A', 'D', {'cost': 7, 'weight': 4})])
     G = nx.from_pandas_edgelist(self.df, 0, 'b', True)
     assert_graphs_equal(G, Gtrue)
     # MultiGraph
     MGtrue = nx.MultiGraph(Gtrue)
     MGtrue.add_edge('A', 'D', cost=16, weight=4)
     MG = nx.from_pandas_edgelist(self.mdf, 0, 'b', True, nx.MultiGraph())
     assert_graphs_equal(MG, MGtrue)
Example 3
File: io.py Project: wiheto/teneto
def tnet_to_nx(df, t=None):
    """
    Creates an undirected networkx object
    """
    if t is not None:
        df = get_network_when(df, t=t)
    if 'weight' in df.columns:
        nxobj = nx.from_pandas_edgelist(
            df, source='i', target='j', edge_attr='weight')
    else:
        nxobj = nx.from_pandas_edgelist(df, source='i', target='j')
    return nxobj
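A minimal usage sketch for the helper above. The DataFrame below is invented for illustration; it only mimics teneto's 'i'/'j'/'weight' column convention that tnet_to_nx expects:

import networkx as nx
import pandas as pd

# hypothetical temporal-network edge list in teneto's column convention
df = pd.DataFrame({'i': [0, 0, 1], 'j': [1, 2, 2], 'weight': [0.5, 1.0, 0.2]})

# same call tnet_to_nx makes when a 'weight' column is present
nxobj = nx.from_pandas_edgelist(df, source='i', target='j', edge_attr='weight')
print(list(nxobj.edges(data=True)))  # [(0, 1, {'weight': 0.5}), ...]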
Example 4
 def test_from_edgelist_int_attr_name(self):
     # note: this also tests that edge_attr can be `source`
     Gtrue = nx.Graph([('E', 'C', {0: 'C'}),
                       ('B', 'A', {0: 'B'}),
                       ('A', 'D', {0: 'A'})])
     G = nx.from_pandas_edgelist(self.df, 0, 'b', 0)
     assert_graphs_equal(G, Gtrue)
Example 5
 def test_roundtrip(self):
     # edgelist
     Gtrue = nx.Graph([(1, 1), (1, 2)])
     df = nx.to_pandas_edgelist(Gtrue)
     G = nx.from_pandas_edgelist(df)
     assert_graphs_equal(Gtrue, G)
     # adjacency
     Gtrue = nx.Graph(({1: {1: {'weight': 1}, 2: {'weight': 1}}, 2: {1: {'weight': 1}}}))
     df = nx.to_pandas_adjacency(Gtrue, dtype=int)
     G = nx.from_pandas_adjacency(df)
     assert_graphs_equal(Gtrue, G)
Example 6
def download_reactome_fi():
    """
    Downloads reactome functional interaction network

    Returns
    -------

    """
    url = 'http://reactomews.oicr.on.ca:8080/caBigR3WebApp2017/' \
          'FIsInGene_071718_with_annotations.txt.zip'
    table = pd.read_csv(io.BytesIO(urlopen(url).read()), compression='zip',
                        delimiter='\t', error_bad_lines=False, encoding='utf-8'
                        )
    table = table[table['Direction'] != '-']
    table = table[~table['Annotation'].str.contains('indirect effect')]
    table = table[~table['Annotation'].str.contains('predicted')]
    table = table[~table['Annotation'].str.contains('compound')]
    genes = set(table['Gene1'])
    genes.update(set(table['Gene2']))
    from magine.mappings.gene_mapper import GeneMapper
    gm = GeneMapper()
    missing_uniprot = set(i for i in genes if i not in gm.gene_name_to_uniprot)
    table = table[~table['Gene1'].isin(missing_uniprot)]
    table = table[~table['Gene2'].isin(missing_uniprot)]

    table['source'] = table['Gene1']
    table['target'] = table['Gene2']
    table['databaseSource'] = 'ReactomeFI'
    rev_cols = table['Direction'].isin(_reverse)

    table.loc[rev_cols, ['source', 'target']] = \
        table.loc[rev_cols, ['target', 'source']].values

    table['interactionType'] = table.apply(standardize_edge_types, axis=1)
    protein_graph = nx.from_pandas_edgelist(
        table,
        'source',
        'target',
        edge_attr=['interactionType', 'databaseSource'],
        create_using=nx.DiGraph()
    )
    species = set(table['source'].unique()
                  ).union(set(table['target'].unique()))

    # add names to graph
    for node in species:
        protein_graph.add_node(node, databaseSource='ReactomeFI',
                               speciesType='gene')

    print("Reactome network has {} nodes and {} edges"
          "".format(len(protein_graph.nodes()), len(protein_graph.edges())))

    nx.write_gpickle(protein_graph, _p_name)
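The .loc column swap used above (writing the reversed ['target', 'source'] values back into ['source', 'target'] for rows whose Direction falls in _reverse) is easier to see on a toy frame. This is an illustrative sketch with made-up data, not part of the original module:

import pandas as pd

toy = pd.DataFrame({'source': ['A', 'B'],
                    'target': ['X', 'Y'],
                    'flip':   [False, True]})
rev_rows = toy['flip']
# .values strips the column labels, so the assignment swaps rather than aligns
toy.loc[rev_rows, ['source', 'target']] = \
    toy.loc[rev_rows, ['target', 'source']].values
print(toy)  # second row now reads source='Y', target='B'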
Example 7
def PageRank(data):
	print('graph generating...')
	G_ui = nx.from_pandas_edgelist(df=data, source='user_id', target='item_id', edge_attr=None)
	pagerank = pd.DataFrame(list(nx.pagerank(G_ui).items()), columns=['node', 'pagerank'])

	print('merging...')
	data = data.merge(pagerank, left_on='user_id', right_on='node', how='left').merge(pagerank, left_on='item_id', right_on='node', how='left')

	pagerank_data = pd.DataFrame(columns=['instance_id', 'user_pagerank', 'item_pagerank'])
	pagerank_data['instance_id'] = data['instance_id']
	pagerank_data['user_pagerank'] = data['pagerank_x']
	pagerank_data['item_pagerank'] = data['pagerank_y']

	return pagerank_data
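A small, hypothetical call of the PageRank helper above. The data is invented for illustration; only the column names (instance_id, user_id, item_id) follow what the function expects:

import pandas as pd
import networkx as nx

data = pd.DataFrame({
    'instance_id': [1, 2, 3],
    'user_id': ['u1', 'u1', 'u2'],
    'item_id': ['i1', 'i2', 'i1'],
})
features = PageRank(data)
print(features)  # instance_id, user_pagerank, item_pagerank columns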
Example 8
 def test_from_edgelist(self):
     # Pandas DataFrame
     g = nx.cycle_graph(10)
     G = nx.Graph()
     G.add_nodes_from(g)
     G.add_weighted_edges_from((u, v, u) for u, v in g.edges())
     edgelist = nx.to_edgelist(G)
     source = [s for s, t, d in edgelist]
     target = [t for s, t, d in edgelist]
     weight = [d['weight'] for s, t, d in edgelist]
     edges = pd.DataFrame({'source': source,
                           'target': target,
                           'weight': weight})
     GG = nx.from_pandas_edgelist(edges, edge_attr='weight')
     assert_nodes_equal(G.nodes(), GG.nodes())
     assert_edges_equal(G.edges(), GG.edges())
     GW = nx.to_networkx_graph(edges, create_using=nx.Graph())
     assert_nodes_equal(G.nodes(), GW.nodes())
     assert_edges_equal(G.edges(), GW.edges())
Example 9
def download_trrust():
    table = pd.read_csv(url,
                        names=['source', 'target', 'interactionType', 'pmid'],
                        delimiter='\t', index_col=None,
                        error_bad_lines=False, encoding='utf-8'
                        )
    print(table.head(10))

    # filter out unknown interaction types
    table = table[~(table['interactionType'] == 'Unknown')].copy()

    table.loc[table[
                  'interactionType'] == 'Activation', 'interactionType'] = 'activate|expression'
    table.loc[table[
                  'interactionType'] == 'Repression', 'interactionType'] = 'inhibit|repression'
    table = table[~(table['interactionType'] == 'Unknown')].copy()

    table['databaseSource'] = 'TRRUST'

    protein_graph = nx.from_pandas_edgelist(
        table,
        'source',
        'target',
        edge_attr=['interactionType', 'pmid'],
        create_using=nx.DiGraph()
    )

    table = table[['source', 'target']].values
    added_genes = set()

    def _add_node(node):
        if node not in added_genes:
            protein_graph.add_node(node, databaseSource='TRRUST',
                                   speciesType='gene')
            added_genes.add(node)

    # add names to graph
    for r in table:
        _add_node(r[0])
        _add_node(r[1])

    nx.write_gpickle(protein_graph, _p_name)
Example 10
    def __init__(self, ctfile, data=None, name='', file=None, **attr):
        # Read in the CT file as a pandas DataFrame.
        self.ctfile = ctfile
        edge_df = pd.read_fwf(ctfile, skiprows=1, header=None)
        edge_df.columns = ['position', 'letter', 'pos-1', 'pos+1', 'pair_pos',
                           'position_repeated']
        edge_df['letter'] = \
            edge_df['letter'].apply(lambda x: x.upper().replace("T", "U"))
        edge_df['kind'] = 'basepair'
        del edge_df['position_repeated']

        # Construct graph
        self.graph = nx.from_pandas_edgelist(edge_df,
                                             source='position',
                                             target='pair_pos',
                                             edge_attr='kind')
        # Annotate pairing partners on nodes.
        for n1, n2, d in self.graph.edges(data=True):
            self.graph.nodes[n1]['pairing_partner'] = n2
            self.graph.nodes[n2]['pairing_partner'] = n1

        # Add in backbone edges
        for n1, n2 in zip(sorted(self.graph.nodes()),
                          sorted(self.graph.nodes())[1:]):
            self.graph.add_edge(n1, n2, kind='backbone')

        # Add in node metadata
        for r, d in edge_df.iterrows():
            self.graph.nodes[d['position']]['letter'] = d['letter']

        # Remove the unnecessary node zero.
        self.graph.remove_node(0)

        # Annotate the graph with vectorized features.
        self.annotate()

        self._edge_df = edge_df

        # miRNA name
        self.mirna_name = ctfile.split('/')[-1].split('_')[0]
Example 11
 def test_from_edgelist_multi_attr(self):
     Gtrue = nx.Graph([('E', 'C', {'cost': 9, 'weight': 10}),
                       ('B', 'A', {'cost': 1, 'weight': 7}),
                       ('A', 'D', {'cost': 7, 'weight': 4})])
     G = nx.from_pandas_edgelist(self.df, 0, 'b', ['weight', 'cost'])
     assert_graphs_equal(G, Gtrue)
Example 12
 def test_from_edgelist_multi_attr_incl_target(self):
     Gtrue = nx.Graph([('E', 'C', {0: 'C', 'b': 'E', 'weight': 10}),
                       ('B', 'A', {0: 'B', 'b': 'A', 'weight': 7}),
                       ('A', 'D', {0: 'A', 'b': 'D', 'weight': 4})])
     G = nx.from_pandas_edgelist(self.df, 0, 'b', [0, 'b', 'weight'])
     assert_graphs_equal(G, Gtrue)
Example 13
    return nx.convert_node_labels_to_integers(graph)


if __name__ == "__main__":
    default_fname = "BIOGRID-ORGANISM-Human_Herpesvirus_6B-3.5.165.tab2_duplicate.txt"
    fname, do_centrality, do_draw, do_info = parse(default_fname)
    starttime = time.time()

    df_ppin = load_ppin(fname)

    colA_name, colB_name = "BioGRID ID Interactor A", "BioGRID ID Interactor B"
    colOffA_name, colOffB_name = "Official Symbol Interactor A", "Official Symbol Interactor B"

    # draw graph
    graph = nx.from_pandas_edgelist(
        df_ppin[[colA_name, colB_name]], colA_name,
        colB_name)  # need to give a directionality here - just ignore
    graph.remove_edges_from(
        nx.selfloop_edges(graph))  # gets rid of self loops (A->A)
    graph = graph.to_undirected(
    )  # gets rid of duplicates (A->B, A->B) and inverse duplicates (A->B, B->A)
    print("building graph took " + str(round(time.time() - starttime, 5)) +
          " s")
    #print(fname, graph.number_of_nodes(), graph.number_of_edges(), time.time() - starttime) # for import into csv
    nx.write_edgelist(graph,
                      "../biograd-organism/ppin/" + fname + ".edgeList",
                      delimiter='\t')

    # save correspondence between biogrid ID and official symbol
    interactorA = flatten(df_ppin, colA_name)
    officialSymbolA = flatten(df_ppin, colOffA_name)
Example 14
    def _thin_network(self):
        '''
        Returns a network with eligible edges merged.
        '''

        self._report_duplicate_edges()
        cols = self.config['intermediate_keep_columns'] + self.config[
            'dir_columns'] + self.config['dir_toll_columns']

        # need to remove any links that are one-way,
        # but share the reverse node sequence
        # these are merged back in after thinning.
        thin_edges = self.network_gdf.copy()
        merge_edges = self.network_gdf.copy()[['INode', 'JNode']]
        merge_edges = merge_edges.rename(columns={
            'INode': 'INode_y',
            'JNode': 'JNode_y'
        })
        one_way_keep = thin_edges.merge(merge_edges,
                                        how='inner',
                                        left_on=['INode', 'JNode'],
                                        right_on=['JNode_y', 'INode_y'])
        thin_edges = thin_edges[~thin_edges['PSRCEdgeID'].
                                isin(one_way_keep['PSRCEdgeID'].tolist())]

        G = nx.from_pandas_edgelist(thin_edges, 'INode', 'JNode', cols)

        i = 0
        node_list = [x for x in self.thin_nodes_list if G.has_node(x)]
        for node in node_list:
            if i % 1000 == 0:
                print("%d Nodes Processed" % (i))
            edges = list(G.edges(node))
            check_edges = self._check_edge_connection_validity(node, edges, G)
            if check_edges:
                edge_1 = check_edges[0]
                edge_2 = check_edges[1]
                a_coords = list(edge_1['geometry'].coords)
                b_coords = list(edge_2['geometry'].coords)
                # get the first and last coord for the two edges
                a_test = [a_coords[0], a_coords[-1]]
                b_test = [b_coords[0], b_coords[-1]]

                if edge_1['INode'] != edge_2['INode'] and edge_1[
                        'JNode'] != edge_2['JNode']:

                    edge_dir = 'with'
                    merge = self._compare_attributes(edge_1, edge_2, 'IJ')

                else:
                    edge_dir = 'against'
                    merge = self._compare_attributes(edge_1, edge_2, 'JI')

                if merge:
                    if edge_dir == 'with':
                        # Do the first coords match or the first and last
                        if a_test.index(
                                list(
                                    set(a_test).intersection(b_test))[0]) == 0:
                            order = 'ba'
                            a_coords.pop(0)
                            x = b_coords + a_coords
                            line = LineString(x)
                            merged_row = edge_2
                            merged_row['geometry'] = line
                            merged_row['JNode'] = edge_1['JNode']
                            if G.has_edge(merged_row['INode'],
                                          merged_row['JNode']):
                                compare_edge = G.get_edge_data(
                                    merged_row['INode'], merged_row['JNode'])
                                if list(compare_edge['geometry'].coords) == x:
                                    print('True')
                                    G.remove_edge(edges[0][0], edges[0][1])
                                    G.remove_edge(edges[1][0], edges[1][1])
                            else:
                                G.remove_edge(edges[0][0], edges[0][1])
                                G.remove_edge(edges[1][0], edges[1][1])
                                G.add_edge(merged_row['INode'],
                                           merged_row['JNode'], **merged_row)

                        else:
                            order = 'ab'
                            b_coords.pop(0)
                            x = a_coords + b_coords
                            line = LineString(x)
                            merged_row = edge_1
                            merged_row['geometry'] = line
                            merged_row['JNode'] = edge_2['JNode']
                            if G.has_edge(merged_row['INode'],
                                          merged_row['JNode']):
                                compare_edge = G.get_edge_data(
                                    merged_row['INode'], merged_row['JNode'])
                                if list(compare_edge['geometry'].coords) == x:
                                    print('True')
                                    G.remove_edge(edges[0][0], edges[0][1])
                                    G.remove_edge(edges[1][0], edges[1][1])
                            else:
                                G.remove_edge(edges[0][0], edges[0][1])
                                G.remove_edge(edges[1][0], edges[1][1])
                                G.add_edge(merged_row['INode'],
                                           merged_row['JNode'], **merged_row)

                    # Are lines digitized towards each other:
                    elif edge_1['JNode'] == edge_2['JNode']:
                        # Flip the b line
                        b_coords.reverse()
                        # Drop the duplicate coord
                        b_coords.pop(0)
                        x = a_coords + b_coords
                        line = LineString(x)
                        merged_row = edge_1
                        merged_row['geometry'] = line
                        merged_row['INode'] = edge_1['INode']
                        merged_row['JNode'] = edge_2['INode']
                        if G.has_edge(merged_row['INode'],
                                      merged_row['JNode']):
                            compare_edge = G.get_edge_data(
                                merged_row['INode'], merged_row['JNode'])
                            if list(compare_edge['geometry'].coords) == x:
                                print('True')
                                G.remove_edge(edges[0][0], edges[0][1])
                                G.remove_edge(edges[1][0], edges[1][1])
                        else:
                            G.remove_edge(edges[0][0], edges[0][1])
                            G.remove_edge(edges[1][0], edges[1][1])
                            G.add_edge(merged_row['INode'],
                                       merged_row['JNode'], **merged_row)

                    # Lines must be digitized away from each other:
                    else:
                        # Drop the duplicate coord
                        b_coords.pop(0)
                        # Flip the b line
                        b_coords.reverse()
                        x = b_coords + a_coords
                        line = LineString(x)
                        merged_row = edge_1
                        merged_row['geometry'] = line
                        merged_row['INode'] = edge_2['JNode']
                        merged_row['JNode'] = edge_1['JNode']
                        if G.has_edge(merged_row['INode'],
                                      merged_row['JNode']):
                            compare_edge = G.get_edge_data(
                                merged_row['INode'], merged_row['JNode'])
                            if list(compare_edge['geometry'].coords) == x:
                                G.remove_edge(edges[0][0], edges[0][1])
                                G.remove_edge(edges[1][0], edges[1][1])
                        else:
                            G.remove_edge(edges[0][0], edges[0][1])
                            G.remove_edge(edges[1][0], edges[1][1])
                            G.add_edge(merged_row['INode'],
                                       merged_row['JNode'], **merged_row)

            i = i + 1

        edge_list = []
        for _, _, data in G.edges(data=True):
            edge_list.append(data)
        gdf = gpd.GeoDataFrame(edge_list)
        gdf = gdf.append(one_way_keep[cols])

        return (gdf)
Example 15
          JOIN theta_plus.imm1985_1995_article_score_unshuffled asu ON asu.scp = cslu.scp
          WHERE asu.article_score >= 1) cslu1 ON cslu1.scp = ccu.citing
    JOIN (SELECT cslu.*
          FROM theta_plus.imm1985_1995_cluster_scp_list_mcl cslu
          JOIN theta_plus.imm1985_1995_article_score_unshuffled asu ON asu.scp = cslu.scp
          WHERE asu.article_score >= 1) cslu2 ON cslu2.scp = ccu.cited
    WHERE  cslu1.cluster_no!=cslu2.cluster_no AND cslu2.cluster_no= """ + str(
            cluster_num) + """; -- all external in-degrees"""

    cluster_scp_query = """SELECT * 
    FROM theta_plus.imm1985_1995_cluster_scp_list_mcl
    WHERE cluster_no = """ + str(cluster_num) + """;"""

    citing_cited = pd.read_sql(citing_cited_query, con=engine)
    G = nx.from_pandas_edgelist(citing_cited,
                                'citing',
                                'cited',
                                create_using=nx.DiGraph())
    N = G.order()
    degrees = dict(G.degree())
    total_deg = pd.DataFrame.from_dict(degrees,
                                       orient='index',
                                       columns=['ext_cluster_total_degrees'])
    total_deg['scp'] = total_deg.index
    total_deg = total_deg.reset_index(drop=True)

    indegrees = dict(G.in_degree())
    total_in_deg = pd.DataFrame.from_dict(indegrees,
                                          orient='index',
                                          columns=['ext_cluster_in_degrees'])
    total_in_deg['scp'] = total_in_deg.index
    total_in_deg = total_in_deg.reset_index(drop=True)
Example 16
def create_diffusion_graph(twitter_corpus_file, diffusion_graph_file):

    diffusion_graph_dir = '/'.join(diffusion_graph_file.split('/')[:-1]) + '/'

    #initialize graph
    G = nx.DiGraph()

    for v in institutions['URL'].tolist():
        G.add_edge(v, graph_nodes['institution'])

    for v in repositories['URL'].tolist():
        G.add_edge(v, graph_nodes['repository'])

    G.add_edge(graph_nodes['institution'], graph_nodes['source'])
    G.add_edge(graph_nodes['repository'], graph_nodes['source'])

    epoch = 0
    frontier = []
    connected_components = 0
    last_pass = False
    while True:

        #expand graph
        if not os.path.exists(diffusion_graph_dir + 'epoch_' + str(epoch) +
                              '.tsv'):
            graph_epoch_n(frontier, epoch, last_pass, twitter_corpus_file,
                          diffusion_graph_dir)

        df = pd.read_csv(diffusion_graph_dir + 'epoch_' + str(epoch) + '.tsv',
                         sep='\t').dropna()
        G = nx.compose(
            G,
            nx.from_pandas_edgelist(df,
                                    source='source_url',
                                    target='target_url',
                                    create_using=nx.DiGraph()))
        frontier = [x for x in G.nodes() if G.out_degree(x) == 0]

        print('Epoch:', epoch)
        print('Connected Components:',
              nx.number_connected_components(G.to_undirected()))
        print('Frontier Size:', len(frontier))

        if last_pass:
            break

        #last pass condition
        if epoch != 0 and (connected_components -
                           nx.number_connected_components(G.to_undirected())
                           ) / connected_components < components_ratio:
            last_pass = True
        connected_components = nx.number_connected_components(
            G.to_undirected())
        epoch += 1

    #add root node
    df = pd.read_csv(diffusion_graph_dir + 'epoch_0.tsv', sep='\t').dropna()
    df['social'] = project_url + '#twitter'
    G = nx.compose(
        G,
        nx.from_pandas_edgelist(df,
                                source='social',
                                target='source_url',
                                create_using=nx.DiGraph()))

    write_graph(G, diffusion_graph_file)
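The growth step above relies on nx.compose to fold each epoch's edge list into the running graph before recomputing the frontier. A self-contained sketch of that pattern with made-up URLs (not the original data files):

import networkx as nx
import pandas as pd

G = nx.DiGraph()
G.add_edge('seed', 'hub')

epoch_df = pd.DataFrame({'source_url': ['hub', 'hub'],
                         'target_url': ['page-a', 'page-b']})
G = nx.compose(G, nx.from_pandas_edgelist(epoch_df, source='source_url',
                                          target='target_url',
                                          create_using=nx.DiGraph()))
frontier = [n for n in G.nodes() if G.out_degree(n) == 0]
print(frontier)  # ['page-a', 'page-b']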
Example 17
    file = filenames[i]

    # split into variables
    confidence, graphs, method, edgefunc = file.split('.')[0].split('_')
    confidence = confidence[10:]
    graphs = graphs[7:]

    if 'fiberlength' in file:
        df = pd.read_csv(dir_ + file, delimiter=';')
        df = df.rename(columns={'edge weight(med flm)': 'weight'})
        df['len'] = df['weight']

        # generate undirected graph
        H = nx.from_pandas_edgelist(df,
                                    source='id node1',
                                    target='id node2',
                                    edge_attr=['weight', 'len'],
                                    create_using=nx.Graph())

        # remove self loops
        H.remove_edges_from(nx.selfloop_edges(H))

        # add nodes even though they have no edges (to make comparison more fair)
        if confidence == '20': full_set = set(H)
        else: H.add_nodes_from(full_set - set(H))

        # for centrality
        cc = nx.closeness_centrality(H)
        cc = {k: v for k, v in sorted(cc.items(), key=lambda s: s[0])}
        closeness_centrality.append(list(cc.values()))
Example 18
import numpy as np
import matplotlib.pyplot as plt
from modularitydensity import metrics
from modularitydensity.fine_tuned_modularity import fine_tuned_clustering_q
from modularitydensity.fine_tuned_modularity_density import fine_tuned_clustering_qds


def mapname(name):
    print(name)
    return name.lower()


df = pd.read_csv('cc9_rel_undirected_nozeroes.csv')
df = df.rename(mapper=mapname, axis='columns')
print(df)
G = nx.from_pandas_edgelist(df, edge_attr=['weight', 'change'])
# G = nx.les_miserables_graph()
G = nx.relabel.convert_node_labels_to_integers(G)

print(G)

adj = nx.to_scipy_sparse_matrix(G)

for gr in (G.subgraph(c) for c in nx.connected_components(G)):
    # Nodes of the subgraph 'gr'
    nodes_gr = list(gr)
    print(nodes_gr)

c = fine_tuned_clustering_q(G)
print(c)
Q = metrics.modularity_r(adj, c, np.unique(c), r=0)
Example 19
def build_and_save_edgefile(
    db,
    filename='edges.csv',
    business_category=None,
    state=None,
    city=None,
    backbone_extract=True,
    backbone_alpha=0.4,
):
    """
    Builds a business-review network for the given geographic context and saves
    the edgelist to a file

    Parameters
    ----------
    db : pymongo object
        The pymongo object that can be used to query the database
    filename : string
        Path of the file to save the graph edgelist
    state : string
        The state within which to query businesses, e.g., AZ, NC, PA
    city : string
        The city within which to query businesses, e.g., 'Phoenix', 'Charlotte'
    business_category:
        The business category to query for, e.g., Restaurant, Cafe, etc.
    backbone_extract: boolean
        True if backbone extraction should be applied to the constructed network
    backbone_alpha: double
        Between 0 and 1. If backbone_extract is True, this is the alpha value
        used to determine how aggressively to prune edges. Lower values
        are more aggressive.

    Returns
    -------
    out:
        A networkx object constructed using the given geographic context
    """
    import pandas as pd
    import networkx as nx
    from .network_utils import extract_backbone

    print("Building business-review list")
    business_review_list = buildBusinessUserList(
        db, business_category=business_category, state=state, city=city)

    print("Building edgelist")
    edgelist = buildEdgeList(business_review_list,
                             column_name="businesses",
                             threshold=1)
    edge_df = pd.DataFrame(edgelist)

    G = nx.from_pandas_edgelist(edge_df, edge_attr=True)

    if (backbone_extract):
        print("Extracting backbone")
        G = extract_backbone(G, backbone_alpha)

    # Save edge list
    print('Saving the file')
    nx.write_edgelist(G, filename)
    print('Done')

    # Done, return the graph object
    return (G)
Example 20
    def __buid_graph(self,
                     crowdtangle_shares_df,
                     coordinated_shares_df,
                     percentile_edge_weight=90,
                     timestamps=False):
        logger.info("Bulding graph")
        coord_df = coordinated_shares_df[['account_url', 'url',
                                          'share_date']].reset_index(drop=True)
        coord_graph = nx.from_pandas_edgelist(coord_df,
                                              'account_url',
                                              'url',
                                              create_using=nx.DiGraph())

        # Remove self loop node edges
        coord_graph.remove_edges_from(nx.selfloop_edges(coord_graph))

        #Bipartite graph creation
        account_urls = list(coordinated_shares_df['account_url'].unique())
        urls = list(coordinated_shares_df['url'].unique())

        bipartite_graph = nx.Graph()
        logger.debug('adding nodes')
        bipartite_graph.add_nodes_from(urls, bipartite=0)
        bipartite_graph.add_nodes_from(account_urls, bipartite=1)
        logger.debug('Adding edges')
        for index, row in coord_df.iterrows():
            bipartite_graph.add_edge(row['account_url'],
                                     row['url'],
                                     share_date=row['share_date'])

        #Graph projection with account nodes
        logger.debug('Projecting graph')
        full_graph = bipartite.weighted_projected_graph(
            bipartite_graph, account_urls)

        # pandas helper dataframe to calculate graph node attributes
        crowdtangle_shares_df['account_name'] = crowdtangle_shares_df[
            'account_name'].astype(str)
        crowdtangle_shares_df['account_handle'] = crowdtangle_shares_df[
            'account_handle'].astype(str)
        crowdtangle_shares_df[
            'account_pageAdminTopCountry'] = crowdtangle_shares_df[
                'account_pageAdminTopCountry'].astype(str)
        crowtangle_shares_gb = crowdtangle_shares_df.groupby('account_url')
        crowdtangle_shares_df['name_changed'] = (
            crowtangle_shares_gb['account_name'].transform("nunique")) > 1
        crowdtangle_shares_df['handle_changed'] = (
            crowtangle_shares_gb['account_handle'].transform("nunique")) > 1
        crowdtangle_shares_df['page_admin_top_country_changed'] = (
            crowtangle_shares_gb['account_pageAdminTopCountry'].transform(
                "nunique")) > 1
        crowdtangle_shares_df['account_name'] = crowtangle_shares_gb[
            'account_name'].transform(lambda col: '|'.join(col.unique()))
        crowdtangle_shares_df['account_handle'] = crowtangle_shares_gb[
            'account_handle'].transform(lambda col: '|'.join(col.unique()))
        crowdtangle_shares_df[
            'account_pageAdminTopCountry'] = crowtangle_shares_gb[
                'account_pageAdminTopCountry'].transform(
                    lambda col: '|'.join(col.unique()))
        crowdtangle_shares_df[[
            'account_name', 'account_handle', 'account_pageAdminTopCountry',
            'name_changed', 'handle_changed', 'page_admin_top_country_changed'
        ]]

        crowtangle_shares_gb = crowdtangle_shares_df.reset_index().groupby(
            ['account_url'])

        account_info_df = crowtangle_shares_gb['index'].agg([('shares',
                                                              'count')])
        account_info_df = account_info_df.merge(pd.DataFrame(
            crowtangle_shares_gb['is_coordinated'].apply(
                lambda x: (x == True).sum())).rename(
                    columns={'is_coordinated': 'coord_shares'}),
                                                left_index=True,
                                                right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_subscriberCount'].agg([
                ('avg_account_subscriber_count', 'mean')
            ]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_name'].agg([('account_name', 'first')
                                                      ]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['name_changed'].agg('first'),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['handle_changed'].agg('first'),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['page_admin_top_country_changed'].agg(
                'first'),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_pageAdminTopCountry'].agg([
                ('account_page_admin_top_country', 'first')
            ]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_handle'].agg([('account_handle',
                                                         'first')]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_platform'].agg([('account_platform',
                                                           'first')]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_platformId'].agg([
                ('account_platformId', 'first')
            ]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_verified'].agg([('account_verified',
                                                           'first')]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.merge(
            crowtangle_shares_gb['account_accountType'].agg([
                ('account_account_type', 'first')
            ]),
            left_index=True,
            right_index=True)
        account_info_df = account_info_df.reset_index().rename(
            columns={'account_url': 'account_url'})

        #filter the dataframe with the graph nodes
        node_info_df = account_info_df[account_info_df['account_url'].isin(
            list(full_graph.nodes))]

        attributes = []
        for node in full_graph.nodes():
            records = node_info_df[node_info_df['account_url'] == node]
            attributes.append(node)
            attributes.append({
                'shares':
                records['shares'].values[0],
                'coord_shares':
                records['coord_shares'].values[0],
                'avg_account_subscriber_count':
                records['avg_account_subscriber_count'].values[0],
                'account_platform':
                records['account_platform'].values[0],
                'account_name':
                records['account_name'].values[0],
                'account_verified':
                1 if records['account_verified'].values[0] else 0,
                'account_handle':
                records['account_handle'].values[0],
                'name_changed':
                1 if records['name_changed'].values[0] else 0,
                'handle_changed':
                1 if records['handle_changed'].values[0] else 0,
                'page_admin_top_country_changed':
                1
                if records['page_admin_top_country_changed'].values[0] else 0,
                'account_page_admin_top_country':
                records['account_page_admin_top_country'].values[0],
                'account_account_type':
                records['account_account_type'].values[0]
            })
        #update graph attributes
        it = iter(attributes)
        nx.set_node_attributes(full_graph, dict(zip(it, it)))

        # set the percentile_edge_weight threshold for repeatedly coordinated link sharing to keep
        q = np.percentile(
            [d['weight'] for (u, v, d) in full_graph.edges(data=True)],
            percentile_edge_weight)

        #create a new graph where node degree > 0
        highly_connected_graph = full_graph.subgraph(
            [key for (key, value) in full_graph.degree if value > 0]).copy()

        # remove edges whose weight is less than the given percentile value
        edges_to_remove = [
            (u, v) for (u, v, d) in highly_connected_graph.edges(data=True)
            if d['weight'] < q
        ]
        highly_connected_graph.remove_edges_from(edges_to_remove)
        highly_connected_graph.remove_nodes_from(
            list(nx.isolates(highly_connected_graph)))

        if timestamps:
            logger.info("Calculating nodes timestamps")
            vec_func = np.vectorize(
                lambda u, v: bipartite_graph.get_edge_data(u, v)['share_date'])
            attributes = []
            for (u, v) in highly_connected_graph.edges():
                attributes.append((u, v))
                attributes.append({
                    "timestamp_coord_share":
                    vec_func(
                        np.intersect1d(
                            list(list(bipartite_graph.neighbors(u))),
                            list(list(bipartite_graph.neighbors(v)))), u)
                })

            it = iter(attributes)
            nx.set_edge_attributes(highly_connected_graph, dict(zip(it, it)))
            logger.info("timestamps calculated")

        #find and annotate nodes-components
        connected_components = list(
            nx.connected_components(highly_connected_graph))
        components_df = pd.DataFrame({
            "node":
            connected_components,
            "component": [*range(1,
                                 len(connected_components) + 1)]
        })
        components_df['node'] = components_df['node'].apply(lambda x: list(x))
        components_df = components_df.explode('node')

        # add cluster to simplify the analysis of large components
        cluster_df = pd.DataFrame(
            community_louvain.best_partition(highly_connected_graph).items(),
            columns=['node', 'cluster'])

        #re-calculate the degree on the graph
        degree_df = pd.DataFrame(list(highly_connected_graph.degree()),
                                 columns=['node', 'degree'])
        #sum up the edge weights of the adjacent edges for each node
        strength_df = pd.DataFrame(list(
            highly_connected_graph.degree(weight='weight')),
                                   columns=['node', 'strength'])

        attributes_df = components_df.merge(cluster_df, on='node').merge(
            degree_df, on='node').merge(strength_df, on='node')

        # update graph attributes
        nx.set_node_attributes(
            highly_connected_graph,
            attributes_df.set_index('node').to_dict('index'))
        logger.info("graph builded")

        return highly_connected_graph, q
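The attribute bookkeeping above appends node, then attribute dict, to one flat list and pairs them back up with it = iter(attributes); dict(zip(it, it)). A tiny sketch of that idiom on a toy graph (values invented, not from the original module):

import networkx as nx

g = nx.Graph([('a', 'b')])
attributes = []
for node in g.nodes():
    attributes.append(node)
    attributes.append({'shares': 1, 'coord_shares': 0})

it = iter(attributes)
# zip(it, it) pairs consecutive items: [n1, d1, n2, d2] -> {n1: d1, n2: d2}
nx.set_node_attributes(g, dict(zip(it, it)))
print(list(g.nodes(data=True)))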
Example 21
            else:
                w = G[u][v].get(weight, 1)
        except KeyError:
            w = 0
        # Double count self-loops if the graph is undirected.
        if u == v and not directed:
            w *= 2
        return w - in_degree[u] * out_degree[v] * norm

    Q = sum(val(u, v) for c in communities for u, v in product(c, repeat=2))
    return Q * norm


# graph node and mmCCnode
G2 = nx.from_pandas_edgelist(cc, 'node', 'mmCCNode')
# local bridges
#nx.draw_networkx(G2)
#plt.show()

# find the connected components of the new graph; each component is a chromosome
# the initial population is concomp
concomp = list(list(nx.connected_components(G2)))

numCluster = len(concomp)


def CrossFirstGen():
    for i in range(1, popoulationInit.shape[1] - 1):
        ran = random.randint(
            1, (popoulationInit.shape[1] - 2))  #is the random  number
Example 22
datasets = [
    "flickrEdges_adj.tsv", "email-EuAll_adj.tsv", "roadNet-TX_adj.tsv",
    "roadNet-PA.adj.tsv", "roadNet-CA_adj.tsv"
]
for data in datasets:
    fileE = pd.read_csv(data, sep='\t')
    # read edge list from file
    Title = fileE.columns
    print(Title)
    fileE = fileE.rename(columns={
        Title[0]: 'Source',
        Title[1]: 'Target',
        Title[2]: 'Degree'
    })
    Gs = nx.from_pandas_edgelist(fileE, source='Source', target='Target')
    Ga = Gs.to_directed()
    centrality = nx.eigenvector_centrality(Ga, max_iter=20)
    sorted((v, f"{c:0.2f}") for v, c in centrality.items())
    fileE['Eigenvalue'] = np.nan
    n = len(fileE['Source'])
    dataS = fileE['Source']
    dataE = fileE['Eigenvalue']
    dataD = fileE['Degree']
    dataE = np.array(dataE).reshape((len(dataE), 1))
    dataS = np.array(dataS).reshape((len(dataS), 1))
    Data = np.hstack((dataE, dataS))
    data = pd.DataFrame(Data, columns=['Eigen', 'Source'])
    data = data.fillna(0)
    dataEigen = np.array(data['Eigen'])
    threshold = dataEigen.mean()
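In the portion shown, fileE['Eigenvalue'] is set to NaN but never filled with the computed centrality scores. If the intent is to attach each source node's eigenvector centrality to its rows, one hedged sketch of that step on toy data (columns and values invented) would be:

import pandas as pd
import networkx as nx

edges = pd.DataFrame({'Source': [1, 2, 3], 'Target': [2, 3, 1]})
g = nx.from_pandas_edgelist(edges, source='Source', target='Target')
centrality = nx.eigenvector_centrality(g.to_directed(), max_iter=20)
# map each source node's centrality score onto its rows
edges['Eigenvalue'] = edges['Source'].map(centrality)
print(edges)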
Example 23
def _boiler_generator_assn(eia_transformed_dfs,
                           eia923_years=pc.working_years['eia923'],
                           eia860_years=pc.working_years['eia860'],
                           debug=False):
    """
    Creates a set of more complete boiler generator associations.

    Creates a unique unit_id_pudl for each collection of boilers and generators
    within a plant that have ever been associated with each other, based on
    the boiler generator associations reported in EIA860. Unfortunately, this
    information is not complete for years before 2014, as the gas turbine
    portion of combined cycle power plants in those earlier years were not
    reporting their fuel consumption, or existence as part of the plants.

    For years 2014 and on, EIA860 contains a unit_id_eia value, allowing the
    combined cycle plant components to be associated with each other. For many
    plants not listed in the reported boiler generator associations, it is
    nonetheless possible to associate boilers and generators on a one-to-one
    basis, as they use identical strings to describe the units.

    In the end, between the reported BGA table, the string matching, and the
    unit_id_eia values, it's possible to create a nearly complete mapping of
    the generation units, at least for 2014 and later.

    Args:
        eia_transformed_dfs (dict): a dictionary of post-transform dataframes
            representing the EIA database tables.
        eia923_years (list-like): a list of the years of EIA 923 data that
            should be used to infer the boiler-generator associations. By
            default it is all the working years of data.
        eia860_years (list-like): a list of the years of EIA 860 data that
            should be used to infer the boiler-generator associations. By
            default it is all the working years of data.
        debug (bool): If True, include columns in the returned dataframe
            indicating by what method the individual boiler generator
            associations were inferred.

    Returns:
        eia_transformed_dfs (dict): Returns the same dictionary of dataframes
        that was passed in, and adds a new dataframe to it representing
        the boiler-generator associations as records containing
        plant_id_eia, generator_id, boiler_id, and unit_id_pudl

    Raises:
        AssertionError: If the boiler - generator association graphs are not
            bi-partite, meaning generators only connect to boilers, and boilers
            only connect to generators.
        AssertionError: If all the boilers do not end up with the same unit_id
            each year.
        AssertionError: If all the generators do not end up with the same
            unit_id each year.

    """
    # if you're not ingesting both 860 and 923, the bga is not compilable
    if not (eia860_years and eia923_years):
        return
    # compile and scrub all the parts
    logger.info("Inferring complete EIA boiler-generator associations.")
    bga_eia860 = eia_transformed_dfs['boiler_generator_assn_eia860'].copy()
    bga_eia860 = _restrict_years(bga_eia860, eia923_years, eia860_years)
    bga_eia860['generator_id'] = bga_eia860.generator_id.astype(str)
    bga_eia860['boiler_id'] = bga_eia860.boiler_id.astype(str)
    # bga_eia860 = bga_eia860.drop(['utility_id_eia'], axis=1)

    gen_eia923 = eia_transformed_dfs['generation_eia923'].copy()
    gen_eia923 = _restrict_years(gen_eia923, eia923_years, eia860_years)
    gen_eia923['generator_id'] = gen_eia923.generator_id.astype(str)
    gen_eia923 = gen_eia923.set_index(pd.DatetimeIndex(gen_eia923.report_date))

    gen_eia923_gb = gen_eia923.groupby(
        [pd.Grouper(freq='AS'), 'plant_id_eia', 'generator_id'])
    gen_eia923 = gen_eia923_gb['net_generation_mwh'].sum().reset_index()
    gen_eia923['missing_from_923'] = False

    # compile all of the generators
    gens_eia860 = eia_transformed_dfs['generators_eia860'].copy()
    gens_eia860 = _restrict_years(gens_eia860, eia923_years, eia860_years)
    gens_eia860['generator_id'] = gens_eia860.generator_id.astype(str)
    gens = pd.merge(gen_eia923,
                    gens_eia860,
                    on=['plant_id_eia', 'report_date', 'generator_id'],
                    how='outer')

    gens = gens[[
        'plant_id_eia', 'report_date', 'generator_id', 'unit_id_eia',
        'net_generation_mwh', 'missing_from_923'
    ]].drop_duplicates()

    gens['generator_id'] = gens['generator_id'].astype(str)

    # create the beginning of a bga compilation w/ the generators as the
    # background
    bga_compiled_1 = pd.merge(
        gens,
        bga_eia860,
        on=['plant_id_eia', 'generator_id', 'report_date'],
        how='outer')

    # Create a set of bga's that are linked, directly from bga8
    bga_assn = bga_compiled_1[bga_compiled_1['boiler_id'].notnull()].copy()
    bga_assn['bga_source'] = 'eia860_org'

    # Create a set of bga's that were not linked directly through bga8
    bga_unassn = bga_compiled_1[bga_compiled_1['boiler_id'].isnull()].copy()
    bga_unassn = bga_unassn.drop(['boiler_id'], axis=1)

    # Side note: there are only 6 generators that appear in bga8 that don't
    # appear in gens9 or gens8 (must uncomment-out the og_tag creation above)
    # bga_compiled_1[bga_compiled_1['og_tag'].isnull()]

    bf_eia923 = eia_transformed_dfs['boiler_fuel_eia923'].copy()
    bf_eia923 = _restrict_years(bf_eia923, eia923_years, eia860_years)
    bf_eia923['boiler_id'] = bf_eia923.boiler_id.astype(str)
    bf_eia923['total_heat_content_mmbtu'] = \
        bf_eia923['fuel_consumed_units'] * bf_eia923['fuel_mmbtu_per_unit']
    bf_eia923 = bf_eia923.set_index(pd.DatetimeIndex(bf_eia923.report_date))
    bf_eia923_gb = bf_eia923.groupby(
        [pd.Grouper(freq='AS'), 'plant_id_eia', 'boiler_id'])
    bf_eia923 = bf_eia923_gb.agg({
        'total_heat_content_mmbtu':
        pudl.helpers.sum_na,
    }).reset_index()

    bf_eia923.drop_duplicates(
        subset=['plant_id_eia', 'report_date', 'boiler_id'], inplace=True)

    # Create a list of boilers that were not in bga8
    bf9_bga = bf_eia923.merge(bga_compiled_1,
                              on=['plant_id_eia', 'boiler_id', 'report_date'],
                              how='outer',
                              indicator=True)
    bf9_not_in_bga = bf9_bga[bf9_bga['_merge'] == 'left_only']
    bf9_not_in_bga = bf9_not_in_bga.drop(['_merge'], axis=1)

    # Match the unassociated generators with unassociated boilers
    # This method assumes that some of the generator and boiler ID strings
    # are the same
    bga_unassn = bga_unassn.merge(
        bf9_not_in_bga[['plant_id_eia', 'boiler_id', 'report_date']],
        how='left',
        left_on=['report_date', 'plant_id_eia', 'generator_id'],
        right_on=['report_date', 'plant_id_eia', 'boiler_id'])
    bga_unassn.sort_values(['report_date', 'plant_id_eia'], inplace=True)
    bga_unassn['bga_source'] = None
    bga_unassn.loc[bga_unassn.boiler_id.notnull(),
                   'bga_source'] = 'string_assn'

    bga_compiled_2 = bga_assn.append(bga_unassn)
    bga_compiled_2.sort_values(['plant_id_eia', 'report_date'], inplace=True)
    bga_compiled_2['missing_from_923'].fillna(value=True, inplace=True)

    # Connect the gens and boilers in units
    bga_compiled_units = bga_compiled_2.loc[
        bga_compiled_2['unit_id_eia'].notnull()]
    bga_gen_units = bga_compiled_units.drop(['boiler_id'], axis=1)
    bga_boil_units = bga_compiled_units[[
        'plant_id_eia', 'report_date', 'boiler_id', 'unit_id_eia'
    ]].copy()
    bga_boil_units.dropna(subset=['boiler_id'], inplace=True)

    # merge the units with the boilers
    bga_unit_compilation = bga_gen_units.merge(
        bga_boil_units,
        how='outer',
        on=['plant_id_eia', 'report_date', 'unit_id_eia'],
        indicator=True)
    # label the bga_source
    bga_unit_compilation. \
        loc[bga_unit_compilation['bga_source'].isnull(),
            'bga_source'] = 'unit_connection'
    bga_unit_compilation.drop(['_merge'], axis=1, inplace=True)
    bga_non_units = bga_compiled_2[bga_compiled_2['unit_id_eia'].isnull()]

    # combine the unit compilation and the non units
    bga_compiled_3 = bga_non_units.append(bga_unit_compilation)

    # resort the records and the columns
    bga_compiled_3.sort_values(['plant_id_eia', 'report_date'], inplace=True)
    bga_compiled_3 = bga_compiled_3[[
        'plant_id_eia', 'report_date', 'generator_id', 'boiler_id',
        'unit_id_eia', 'bga_source', 'net_generation_mwh', 'missing_from_923'
    ]]

    # label plants that have 'bad' generator records (generators that have MWhs
    # in gens9 but don't have connected boilers); create a df with just the bad
    # plants by searching for the 'bad' generators
    bad_plants = bga_compiled_3[(bga_compiled_3['boiler_id'].isnull()) &
                                (bga_compiled_3['net_generation_mwh'] > 0)].\
        drop_duplicates(subset=['plant_id_eia', 'report_date'])
    bad_plants = bad_plants[['plant_id_eia', 'report_date']]

    # merge the 'bad' plants back into the larger frame
    bga_compiled_3 = bga_compiled_3.merge(bad_plants,
                                          how='outer',
                                          on=['plant_id_eia', 'report_date'],
                                          indicator=True)

    # use the indicator to create labels
    bga_compiled_3['plant_w_bad_generator'] = \
        np.where(bga_compiled_3._merge == 'both', True, False)
    # Note: At least one gen has reported MWh in 923, but could not be
    # programmatically mapped to a boiler

    # we don't need this one anymore
    bga_compiled_3 = bga_compiled_3.drop(['_merge'], axis=1)

    # create a label for generators that are unmapped but in 923
    bga_compiled_3['unmapped_but_in_923'] = \
        np.where((bga_compiled_3.boiler_id.isnull()) &
                 ~bga_compiled_3.missing_from_923 &
                 (bga_compiled_3.net_generation_mwh == 0),
                 True,
                 False)

    # create a label for generators that are unmapped
    bga_compiled_3['unmapped'] = np.where(bga_compiled_3.boiler_id.isnull(),
                                          True, False)
    bga_out = bga_compiled_3.drop('net_generation_mwh', axis=1)
    bga_out.loc[bga_out.unit_id_eia.isnull(), 'unit_id_eia'] = None

    bga_for_nx = bga_out[[
        'plant_id_eia', 'report_date', 'generator_id', 'boiler_id',
        'unit_id_eia'
    ]]
    # If there's no boiler... there's no boiler-generator association
    bga_for_nx = bga_for_nx.dropna(subset=['boiler_id']).drop_duplicates()

    # Need boiler & generator specific ID strings, or they look like
    # the same node to NX
    bga_for_nx['generators'] = 'p' + bga_for_nx.plant_id_eia.astype(str) + \
                               '_g' + bga_for_nx.generator_id.astype(str)
    bga_for_nx['boilers'] = 'p' + bga_for_nx.plant_id_eia.astype(str) + \
                            '_b' + bga_for_nx.boiler_id.astype(str)

    # dataframe to accumulate the unit_ids in
    bga_w_units = pd.DataFrame()
    # We want to start our unit_id counter anew for each plant:
    for pid in bga_for_nx.plant_id_eia.unique():
        bga_byplant = bga_for_nx[bga_for_nx.plant_id_eia == pid].copy()

        # Create a graph from the dataframe of boilers and generators. It's a
        # multi-graph, meaning the same nodes can be connected by more than one
        # edge -- this allows us to preserve multiple years worth of boiler
        # generator association information for later inspection if need be:
        bga_graph = nx.from_pandas_edgelist(bga_byplant,
                                            source='generators',
                                            target='boilers',
                                            edge_attr=True,
                                            create_using=nx.MultiGraph())

        # Each connected sub-graph is a generation unit:
        gen_units = [
            bga_graph.subgraph(c).copy()
            for c in nx.connected_components(bga_graph)
        ]

        # Assign a unit_id to each subgraph, and extract edges into a dataframe
        for unit_id, unit in zip(range(len(gen_units)), gen_units):
            # All the boiler-generator association graphs should be bi-partite,
            # meaning generators only connect to boilers, and boilers only
            # connect to generators.
            if not nx.algorithms.bipartite.is_bipartite(unit):
                raise AssertionError(
                    f"Non-bipartite generation unit graph found."
                    f"plant_id_eia={pid}, unit_id_pudl={unit_id}.")
            nx.set_edge_attributes(unit,
                                   name='unit_id_pudl',
                                   values=unit_id + 1)
            new_unit_df = nx.to_pandas_edgelist(unit)
            bga_w_units = bga_w_units.append(new_unit_df)

    bga_w_units = bga_w_units.sort_values(
        ['plant_id_eia', 'unit_id_pudl', 'generator_id', 'boiler_id'])
    bga_w_units = bga_w_units.drop(['source', 'target'], axis=1)

    # Check whether the PUDL unit_id values we've inferred conflict with
    # the unit_id_eia values that were reported to EIA. Are there any PUDL
    # unit_id values that have more than 1 EIA unit_id_eia within them?
    bga_unit_id_eia_counts = \
        bga_w_units.groupby(['plant_id_eia', 'unit_id_pudl'])['unit_id_eia'].\
        nunique().to_frame().reset_index()
    bga_unit_id_eia_counts = bga_unit_id_eia_counts.rename(
        columns={'unit_id_eia': 'unit_id_eia_count'})
    bga_unit_id_eia_counts = pd.merge(bga_w_units,
                                      bga_unit_id_eia_counts,
                                      on=['plant_id_eia', 'unit_id_pudl'])
    too_many_codes = \
        bga_unit_id_eia_counts[bga_unit_id_eia_counts.unit_id_eia_count > 1]
    too_many_codes = \
        too_many_codes[~too_many_codes.unit_id_eia.isnull()].\
        groupby(['plant_id_eia', 'unit_id_pudl'])['unit_id_eia'].unique()
    for row in too_many_codes.iteritems():
        logger.warning(f"Multiple EIA unit codes:"
                       f"plant_id_eia={row[0][0]}, "
                       f"unit_id_pudl={row[0][1]}, "
                       f"unit_id_eia={row[1]}")
    bga_w_units = bga_w_units.drop('unit_id_eia', axis=1)

    # These assertions test that all boilers and generators ended up in the
    # same unit_id across all the years of reporting:
    pgu_gb = bga_w_units.groupby(['plant_id_eia',
                                  'generator_id'])['unit_id_pudl']
    if not (pgu_gb.nunique() == 1).all():
        raise AssertionError("Inconsistent inter-annual BGA assignment!")
    pbu_gb = bga_w_units.groupby(['plant_id_eia', 'boiler_id'])['unit_id_pudl']
    if not (pbu_gb.nunique() == 1).all():
        raise AssertionError("Inconsistent inter-annual BGA assignment!")

    bga_w_units = bga_w_units.drop('report_date', axis=1)
    bga_w_units = bga_w_units[[
        'plant_id_eia', 'unit_id_pudl', 'generator_id', 'boiler_id'
    ]].drop_duplicates()
    bga_out = pd.merge(bga_out,
                       bga_w_units,
                       how='left',
                       on=['plant_id_eia', 'generator_id', 'boiler_id'])
    bga_out['unit_id_pudl'] = (bga_out['unit_id_pudl'].fillna(
        value=0).astype(int))

    if not debug:
        bga_out = bga_out[~bga_out.missing_from_923
                          & ~bga_out.plant_w_bad_generator
                          & ~bga_out.unmapped_but_in_923 & ~bga_out.unmapped]

        bga_out = bga_out.drop([
            'missing_from_923', 'plant_w_bad_generator', 'unmapped_but_in_923',
            'unmapped'
        ],
                               axis=1)
        bga_out = bga_out.drop_duplicates(subset=[
            'report_date', 'plant_id_eia', 'boiler_id', 'generator_id'
        ])

    eia_transformed_dfs['boiler_generator_assn_eia860'] = bga_out

    return eia_transformed_dfs
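# (Note) A minimal, self-contained sketch of the pattern used above, on made-up data: each
# connected component of a boiler-generator graph becomes one unit, and the inferred unit
# id is written back onto the edges.
import networkx as nx
import pandas as pd

bga_demo = pd.DataFrame({"generators": ["g1", "g1", "g2", "g3"],
                         "boilers":    ["b1", "b2", "b2", "b3"]})
demo_graph = nx.from_pandas_edgelist(bga_demo, source="generators", target="boilers",
                                     create_using=nx.MultiGraph())
for unit_id, component in enumerate(nx.connected_components(demo_graph), start=1):
    for u, v, key in demo_graph.subgraph(component).edges(keys=True):
        demo_graph.edges[u, v, key]["unit_id_pudl"] = unit_id
print(nx.to_pandas_edgelist(demo_graph))  # g1 and g2 share a unit via b2; g3/b3 form their own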
Esempio n. 24
0
print(df_nodes.head())
print(df_nodes.describe())

#check edges df
print(df_edges.head())
print(df_edges.describe())

#merge nodes and edges df
df_complete = pd.concat([df_nodes, df_edges], axis=1)
print(df_complete.head())

#create a networkx directional graph
G = nx.from_pandas_edgelist(
    df=df_complete,
    source="fromUser",  #fieldname of django qs
    target="toUser",  #fieldname of django qs
    edge_attr=["amount"],  #edge weights from django qs
    create_using=nx.DiGraph  #type of graph (here directional)
)

#inspect graph object
print(nx.info(G))
#check if edge metadata is added correctly
print(list(G.edges(data=True))[0:5])

#add node metadata
#nx.set_node_attributes(G, df_nodes["followerCount"], "followerCount")
for node, metadata in df_nodes.set_index("username").iterrows():
    for key, val in metadata.items():  #treat df features as dict
        G.nodes[node][key] = val
print(list(G.nodes(data=True))[0:5])
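# (Note) A compact alternative to the attribute loop above, sketched on made-up data:
# nx.set_node_attributes accepts a dict of dicts keyed by node and sets several
# attributes per node in one call.
import networkx as nx
import pandas as pd

G_demo = nx.Graph([("alice", "bob")])
nodes_demo = pd.DataFrame({"username": ["alice", "bob"], "followerCount": [10, 3]})
nx.set_node_attributes(G_demo, nodes_demo.set_index("username").to_dict("index"))
print(list(G_demo.nodes(data=True)))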
Esempio n. 25
0
def centrality_scores():
    """
    Calculating and plotting centrality scores for the FULL Georgia Reply Network
    """
    # Retrieving the FULL Georgia Reply Network: (created in more_complicated_reply_network())
    df = pd.read_csv(m4r_data + "ga_reply_network_full.csv")

    # Converting this to a networkx graph object:
    G = nx.from_pandas_edgelist(df,
                                "Source",
                                "Target", ["Weight"],
                                create_using=nx.DiGraph())

    # Calculating centrality scores:
    in_central = nx.algorithms.centrality.in_degree_centrality(G)  # In-Degree
    out_central = nx.algorithms.centrality.out_degree_centrality(
        G)  # Out-Degree
    p_central = nx.algorithms.link_analysis.pagerank_alg.pagerank(
        G)  # PageRank with alpha = 0.85

    # Inserting the scores into a single dataframe:
    d1 = pd.DataFrame().from_dict(p_central,
                                  orient="index",
                                  columns=["PageRank"]).reset_index()
    d2 = pd.DataFrame().from_dict(in_central,
                                  orient="index",
                                  columns=["In-Degree"]).reset_index()
    d3 = pd.DataFrame().from_dict(out_central,
                                  orient="index",
                                  columns=["Out-Degree"]).reset_index()
    centrality_df = ((d1.merge(d2, on="index")).merge(d3, on="index")).rename(
        {"index": "user.id"}, axis=1)

    # Now retrieving the Louvain community for each account (if the account is in community 8 or 42)
    gephi = pd.read_csv(m4r_data +
                        "ga_reply_network_truncated_louvain_communities.csv")[[
                            "Id", "modularity_class"
                        ]].rename(
                            {
                                "Id": "user.id",
                                "modularity_class": "Community"
                            },
                            axis=1)
    gephi = gephi[gephi["Community"].isin(
        [8, 42]
    )]  # Only care about Community labels for nodes in Communities 8 or 42 (the largest communities)

    # Now retrieving the class (bot or human label)
    users = pickle.load(
        open(m4r_data + "us_and_georgia_accounts.p",
             "rb"))[["user.id", "user.screen_name", "predicted_class"]]

    # Adding account class and community labels to the centrality score dataframe
    centrality_df = centrality_df.merge(users[["user.id", "predicted_class"]],
                                        on="user.id",
                                        how="left").rename(
                                            {"predicted_class": "Class"},
                                            axis=1)
    centrality_df = centrality_df.fillna("Unknown")
    centrality_df = centrality_df.merge(gephi[["user.id", "Community"]],
                                        how="left",
                                        on="user.id")
    centrality_df = (centrality_df.fillna("Other"))
    centrality_df = centrality_df.merge(users[["user.id", "user.screen_name"]],
                                        on="user.id",
                                        how="left")
    centrality_df["Class"] = centrality_df["Class"].replace({
        "human": "Human",
        "bot": "Bot"
    })
    centrality_df["Community"] = centrality_df["Community"].replace({
        8:
        "Group 8",
        42:
        "Group 42"
    })
    centrality_df = centrality_df.sort_values(["Class", "Community"],
                                              ascending=False)

    # Node user ids with the 7 highest in degrees
    populars = list(
        centrality_df.sort_values("In-Degree",
                                  ascending=False)["user.id"].iloc[:7])

    # Plotting the centrality scores against each other...
    pal = [
        sns.color_palette("tab10")[7],
        sns.color_palette("tab10")[0],
        sns.color_palette("tab10")[1]
    ]
    fig, axes = plt.subplots(1, 2, figsize=(8, 3.1), sharey=True)
    fig.suptitle('Comparing Centrality Measures for the Georgia Reply Network',
                 fontweight="bold")
    # In degree vs Out degree
    sns.scatterplot(
        ax=axes[0],
        data=centrality_df[centrality_df["user.id"].isin(populars) == False],
        y="In-Degree",
        x="Out-Degree",
        hue="Class",
        s=120,
        alpha=0.9,
        palette=pal,
        style="Community")
    # In degree vs PageRank
    sns.scatterplot(
        ax=axes[1],
        data=centrality_df[centrality_df["user.id"].isin(populars) == False],
        y="In-Degree",
        x="PageRank",
        hue="Class",
        s=120,
        alpha=0.9,
        palette=pal,
        style="Community")

    # Legend
    handles, labels = axes[0].get_legend_handles_labels()
    new_handles = [handles[i] for i in [0, 2, 3, 1, 4, 6, 7, 5]]
    new_labels = [labels[i] for i in [0, 2, 3, 1, 4, 6, 7, 5]]
    axes[0].legend([], [], frameon=False)
    axes[1].legend([], [], frameon=False)
    fig.legend(new_handles,
               new_labels,
               loc="center left",
               bbox_to_anchor=[0.67, 0.5])

    # Names of axes
    axes[0].set_ylabel("In-Degree Centrality", fontweight="bold")
    #axes[1].set_ylabel("In-Degree Centrality", fontweight = "bold")
    axes[0].set_xlabel("Out-Degree Centrality", fontweight="bold")
    axes[1].set_xlabel("PageRank Centrality", fontweight="bold")

    # Adjusting plot size to accommodate legend: right determines how much space is left for the legend - e.g. right = 0.8 leaves 80% of space for legend
    plt.subplots_adjust(right=0.69, wspace=0.04, hspace=0.1)

    #plt.savefig(figure_path + "ga_centrality_measures.pdf", bbox_inches = "tight")

    plt.show()
Esempio n. 26
0
for col in df2.columns:
    logReturns2[col] = np.log(df2[col]).diff(-1)

# =============================================================================
# edge and node preparation
# =============================================================================

corrMatrix2 = logReturns2.corr()
edges2 = corrMatrix2.stack().reset_index()
edges2.columns = ['theOne', 'theOther', 'correlation']
# remove self correlations
# list containing the pairwise correlation information
edges2 = edges2.loc[edges2['theOne'] != edges2['theOther']].copy()
# undirected graph with weights corresponding to the correlation magnitude
G2 = nx.from_pandas_edgelist(edges2,
                             'theOne',
                             'theOther',
                             edge_attr=['correlation'])

print(nx.info(G2))
#%%


def get_density(G):
    # How many possible edges?
    possible_edges = len(G.nodes) * (len(G.nodes) - 1) / 2
    actual_edges = len(G.edges)
    return actual_edges / possible_edges


print('density: ', get_density(G2))
print('node connectivity: ', nx.node_connectivity(G2))
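# (Note) NetworkX has this built in: for an undirected graph nx.density(G) returns
# 2*m / (n*(n-1)), so it should agree with get_density above.
print('density (nx): ', nx.density(G2))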
Esempio n. 27
0
    cross = series.apply(lambda x: list(itertools.combinations(x, 2)))

    lists = [item for sublist in cross for item in sublist]
    source = [i[0] for i in lists]
    target = [i[1] for i in lists]
    edges = pd.DataFrame({"source": source, "target": target})
    edges["weight"] = 1
    return edges.groupby(by=["source", "target"],
                         as_index=False)["weight"].sum()


df_edges = get_edges(data=df, column="CPC Class - DWPI")

g = nx.from_pandas_edgelist(df_edges,
                            source="source",
                            target="target",
                            edge_attr=["weight"],
                            create_using=nx.Graph)

df = df_edges

clubs = list(df.source.unique())

people = list(df.target.unique())

dict(zip(clubs, clubs))

plt.figure(figsize=(12, 12))

# 1. Create the graph
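# (Note) A minimal, self-contained sketch of the co-occurrence pattern used by get_edges
# above, on made-up data: pairwise combinations per row are counted into a weighted,
# undirected edge list and then turned into a graph.
import itertools
import pandas as pd
import networkx as nx

demo_series = pd.Series([["A", "B", "C"], ["A", "B"]])
pairs = [p for row in demo_series.apply(lambda x: list(itertools.combinations(x, 2))) for p in row]
demo_edges = pd.DataFrame(pairs, columns=["source", "target"])
demo_edges["weight"] = 1
demo_edges = demo_edges.groupby(["source", "target"], as_index=False)["weight"].sum()
demo_g = nx.from_pandas_edgelist(demo_edges, edge_attr=["weight"])
print(demo_g.edges(data=True))  # the A-B edge carries weight 2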
Esempio n. 28
0
# tools for creating a graph from a pandas dataframe
import networkx as nx

# Create graph edges first
col_node_1 = 'Person_ID'
col_node_2 = 'Company_ID'
col_attr = ['start_date', 'end_date']
G = nx.from_pandas_edgelist(df = df, 
                             source=col_node_1, 
                             target=col_node_2, 
                             edge_attr=col_attr)
# OR
G = nx.from_pandas_edgelist(df = df.assign(has_worked=1), 
                             source=col_node_1, 
                             target=col_node_2, 
                             edge_attr=col_attr+['has_worked'])

# Add node labels
list_person_1 = [] # list of person with special type
G.add_nodes_from(df.loc[(df[col_node_1].isin(list_person_1)), 
	col_node_1].unique().tolist(), label='person_type_1')
G.add_nodes_from(df.loc[~(df[col_node_1].isin(list_person_1)), 
	col_node_1].unique().tolist(), label='person_type_2')

G.add_nodes_from(df[col_node_2].unique().tolist(), label='Company')

# Create links for people with same email and same telephone

# There is only one variable for email
col_edge = 'email'
emails = df[[col_node_1, col_edge]].dropna(subset=[col_edge]).drop_duplicates()
Esempio n. 29
0
        network_data=network_data.append(friend_frame)

# In[ ]:


network_data=network_data.reset_index(drop=True)
#checks
network_data.tail()

# In[ ]:


#changing the column name to suit nx import
network_data.columns=['source','target']
# Considering each (user_id,friend) pair as an edge of a graph, constructing the graph
graph=nx.from_pandas_edgelist(network_data)
# logging time
end_time=time.time()
print("Took",end_time-start_time,"s")

# In[ ]:


#credits https://www.kaggle.com/crailtap/basic-network-analysis-tutorial
#basic info
nx.info(graph)

# In[ ]:


#check density
Esempio n. 30
0
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
        target_name: Name of the node_data column holding the node labels
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids, node_targets, train_size=140, test_size=None, stratify=node_targets
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=None
    )

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(
        G, batch_size, num_samples, seed=42
    )
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size, generator=train_gen, bias=True, dropout=dropout
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.default_model(flatten_output=True)

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )

    # Train model
    history = model.fit_generator(
        train_gen,
        epochs=num_epochs,
        validation_data=val_gen,
        verbose=2,
        shuffle=True,
    )

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids
    )
    accuracy = np.mean(
        [
            "subject=" + gt_subject == p
            for gt_subject, p in zip(
                node_data["subject"], node_predictions.idxmax(axis=1)
            )
        ]
    )
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
Esempio n. 31
0
    ic_ltp = 0
    ic_lta = 0
    ltp_lta = 0

    count_samples = 0

    print(nx.__version__)
    #parameters of the script
    path = '../data/icpsr/DS0001/paluck-edgelist.csv'
    globalThreshold = 1.5
    activation_ci = 0.075
    cascadeParameter = 0.5

    edgelist = pd.read_csv(path)
    G = nx.from_pandas_edgelist(edgelist, source='ID', target='PEERID')
    print(nx.info(G))
    print(nx.number_of_nodes(G))
    print(f'connected?\t{nx.is_connected(G)}')
    print(f'# of connected components:\t{nx.number_connected_components(G)}')

    components = nx.connected_components(G)
    sglist = [G.subgraph(c) for c in nx.connected_components(G)]

    gmat = []
    for g in sglist:
        gmat.append(nx.to_numpy_matrix(g, dtype=float))

    graph_size_list = [nx.number_of_nodes(g) for g in sglist]
    graph_size_series = pd.Series(graph_size_list)
    ordered_graph_series = graph_size_series.sort_values()
Esempio n. 32
0
    for word in most_similar:
        # Find stats for this word
        word_dice_stats = dice_significance(cooc, word, key_words)
        word_dice_stats = dict_value_sort(word_dice_stats)
        # Choose top nearby matches
        top_neighbours = list(word_dice_stats.keys())[0:10]
        layer2_names += top_neighbours
        new_graph_data = [{"from":word.upper(), "to":set_name, "stat":word_dice_stats[set_name]} for set_name in top_neighbours]
        # Add to existing graph data
        graph_data += new_graph_data

# Convert graph data to pandas dataframe
gd = pd.DataFrame.from_dict(graph_data)
# Create co-occurance graph
# G = nx.from_numpy_matrix(cooc)
G = nx.from_pandas_edgelist(gd, "from", "to", "stat")

# Generate colours
colours, sizes = [], []
l0, l1, l2 = {}, {}, {}
for node in G:
    if node in ALL_SEARCH_TERMS.upper():
        col = 'darkblue' #'red'
        size = counts[node]*1000 #5000
        l0[node] = node
    elif node in layer1_names:
        col = 'lightblue' #'orange'
        size = counts[node]*1000 #2500
        l1[node] = node
    else:
        col = 'cyan' #'blue'
Esempio n. 33
0
def download_signor():
    col_names = [
        'ENTITYA', 'TYPEA', 'IDA', 'DATABASEA', 'ENTITYB', 'TYPEB', 'IDB',
        'DATABASEB', 'EFFECT', 'MECHANISM', 'RESIDUE', 'SEQUENCE', 'TAX_ID',
        'CELL_DATA', 'TISSUE_DATA', 'MODULATOR_COMPLEX', 'TARGET_COMPLEX',
        'MODIFICATIONA', 'MODASEQ', 'MODIFICATIONB', 'MODBSEQ', 'PMID',
        'DIRECT', 'SENTENCE', 'SIGNOR_ID', 'NA1', 'NA2', 'NA3']

    table = pd.read_csv('https://signor.uniroma2.it/getData.php?organism=9606',
                        names=col_names, delimiter='\t', index_col=None,
                        error_bad_lines=False, encoding='utf-8'
                        )
    # filter out non direct
    table = table.loc[table['DIRECT'] == 't']

    # Filter out non descriptive
    table = table.loc[~table['MECHANISM'].isnull()]

    # Drop SIGNOR edges, these are generally complexes
    table = table[~(table['DATABASEA'] == 'SIGNOR')]
    table = table[~(table['DATABASEB'] == 'SIGNOR')]

    # Not sure what they mean, so will remove. Ideally other DBs have this info
    table = table[~(table['MECHANISM'] == 'post transcriptional regulation')]

    col_a = ['ENTITYA', 'TYPEA', 'IDA', 'DATABASEA']
    col_b = ['ENTITYB', 'TYPEB', 'IDB', 'DATABASEB']
    cols = ['name', 'species_type', 'id', 'db']
    species_a = table[col_a].copy()
    species_b = table[col_b].copy()
    species_a.rename(columns={i: j for i, j in zip(col_a, cols)}, inplace=True)
    species_b.rename(columns={i: j for i, j in zip(col_b, cols)}, inplace=True)
    species_a.drop_duplicates(inplace=True)
    species_b.drop_duplicates(inplace=True)
    all_species = pd.concat([species_a, species_b])
    all_species.drop_duplicates(inplace=True)

    def map_to_activate_inhibit(row):
        effect = ''
        mechanism = row['MECHANISM']
        if 'down-regulates' in row['EFFECT']:
            effect = 'inhibit'
        elif 'up-regulates' in row['EFFECT']:
            effect = 'activate'
        if mechanism in edge_standards:
            mechanism = edge_standards[mechanism]
        elif mechanism == 'transcriptional regulation':
            if effect == 'inhibit':
                mechanism = 'repression'
            elif effect == 'activate':
                mechanism = 'expression'
        if effect == '':
            return mechanism
        else:
            return "|".join([effect, mechanism])

    # relabel edge types
    table['interactionType'] = table.apply(map_to_activate_inhibit, axis=1)
    table['databaseSource'] = 'SIGNOR'
    table['pmid'] = table['PMID']

    table['source'] = table['ENTITYA']
    table['target'] = table['ENTITYB']

    protein_graph = nx.from_pandas_edgelist(
        table,
        'source',
        'target',
        edge_attr=['interactionType', 'databaseSource'],
        create_using=nx.DiGraph()
    )
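    # (Note) Because create_using is a plain nx.DiGraph, duplicate ENTITYA -> ENTITYB rows
    # collapse onto a single edge whose attributes come from the last row seen; a
    # MultiDiGraph would keep every reported interaction as its own edge.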
    # add names to graph
    for row in all_species.values:
        name, species_type, id_name, db = row
        if species_type == 'protein':
            species_type = 'gene'
        else:
            species_type = 'compound'

        protein_graph.add_node(name, databaseSource='SIGNOR',
                               speciesType=species_type)

    nx.write_gpickle(protein_graph, _p_name)
Esempio n. 34
0
elast = elast[elast.direction != "zero"]
elast.effector = elast.effector.str.replace("EX_", "").str.replace("_m", "")

for sa in samples:
    sns.kdeplot(elast[elast.id == sa].elasticity, bw=4,
                shade=True, label=sa)
plt.legend()
plt.xlabel("elasticity [a.u.]")
plt.ylabel("density")
plt.savefig("elast_densities.svg")
plt.close()

for sa in samples:
    e = elast[elast.id == sa].copy()
    e = e[(e.elasticity.abs() > 0.5) & (e.norm_elasticity.abs() > 0.5)]
    graph = nx.from_pandas_edgelist(e, source="effector", target="reaction",
                                    edge_attr="elasticity")
    for idx, _ in graph.nodes(data=True):
        if idx.startswith("EX_"):
            d = direction(e, idx)
            cl = "import flux" if d == "forward" else "export flux"
        elif idx[0].isupper():
            cl = "abundance"
        else:
            cl = "diet"
        graph.nodes[idx]["class"] = cl

    circos = nxviz.CircosPlot(graph, node_labels=True, rotate_labels=True,
                              edge_color="elasticity", edge_cmap="bwr",
                              edge_limits=(-150, 150), node_color="class",
                              node_grouping="class", node_order="class",
                              figsize=(20, 18))
Esempio n. 35
0
    def process(self):
        if not all(osp.exists(path) for path in self.processed_paths):
            helpers.log("Processing")
            names = ["Users", "Items", self.timeattribute]
            data = read_interaction(self.raw_paths.interactions, names=names)

            # Relabel users and items:
            #   user_ids : 1, ..., len(users)
            #   item_ids : len(users) + 1, ..., len(users) + len(items)
            users = sorted(data.Users.unique())
            items = sorted(data.Items.unique())
            helpers.log(f"Number of users {len(users)}")
            helpers.log(f"Number of items {len(items)}")
            helpers.log(f"Number of nodes {len(users) + len(items)}")
            user_ids = range(1, len(users) + 1)
            item_ids = range(max(user_ids) + 1, max(user_ids) + len(items) + 1)
            user2id = dict(zip(users, user_ids))
            item2id = dict(zip(items, item_ids))
            data.Users = data.Users.apply(lambda l: user2id[l])
            data.Items = data.Items.apply(lambda l: item2id[l])
            
            self.users = sorted(data.Users.unique())
            self.items = sorted(data.Items.unique())

            # Create an interaction graph
            interaction_graph = nx.from_pandas_edgelist(
                data[names],
                source="Users",
                target="Items",
                create_using=nx.MultiDiGraph,
                edge_attr=self.timeattribute,
            )
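            # (Note) create_using=nx.MultiDiGraph keeps every user -> item interaction as its
            # own parallel edge, each carrying the time attribute that the time-based sort
            # below relies on.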
            interaction_graph.add_nodes_from(self.users + self.items)

            # Initialize complete interaction history
            self.init_neighborhood(interaction_graph=interaction_graph)

            # Convert interaction graph to list of interactions sorted by time
            interactions = sorted(
                interaction_graph.edges(data=True),
                key=lambda l: l[2][self.timeattribute],
            )

            # identify the indices for train, valid, and test splits
            train_size = int(data.shape[0] * self.train_rate)
            valid_size = int(data.shape[0] * self.valid_rate)

            train_indices = range(train_size)
            valid_indices = range(train_size, train_size + valid_size)
            test_indices = range(train_size + valid_size, data.shape[0])

            torch.save(
                (interactions, self.neighborhood, self.timestamps),
                self.processed_paths.interactions)
            torch.save((self.users, self.items), self.processed_paths.nodes)
            torch.save(
                (train_indices, valid_indices, test_indices),
                self.processed_paths.splits)
            helpers.log("Done!")

        self.interactions, self.neighborhood, self.timestamps = torch.load(
            self.processed_paths.interactions
        )
        self.users, self.items = torch.load(self.processed_paths.nodes)
        self.train_split, self.valid_split, self.test_split = torch.load(
            self.processed_paths.splits
        )
        self.__indices__ = range(len(self.interactions))
Esempio n. 36
0
    return res_dct

# run and append to the dataset, for each node, the max-CC node together with its CC value
ccNodeVal(G,cc)

# Now we can create the initial population from the nodes and their ccNodeVal values

# the neighbors are the genes; we can apply mutation to create new ones
neighbors = {node: list(G.neighbors(node)) for node in G.nodes()}
nodes = list(G.nodes())


# build a graph from the node and mmCCNode columns
G2 = nx.from_pandas_edgelist(cc, 'node', 'mmCCNode')

# find the connected components of the new graph; each component is a chromosome
# the initial population is concomp
concomp=list(list(nx.connected_components(G2)))

numCluster=len(concomp)

nx.draw_networkx(G2)
plt.show()

#communities = {node: community for community, node in enumerate(neighbors.keys())}
from quality import modularity

# modularity for all chromosomes
mod1=modularity(G2,concomp)
Esempio n. 37
0
df_events_attendance = pd.read_csv(EVENTS_ATTENDANCE_CLEANED)
# counting the number events attended per individual - degree - attribute
df_degree = df_events_attendance.groupby([SURNAME, FULLNAME],
                                         as_index=False).count()
# change name of column header
df_degree.rename(columns={EVENT_ID: "eventscount"}, inplace=True)
# get family size
df_family = df_degree[[SURNAME, FULLNAME]].groupby(SURNAME,
                                                   as_index=False).count()
# color - family size. 1 is a single individual
df_family.rename(columns={FULLNAME: "familysize"}, inplace=True)
#df_family[df_family["color"] > 1]
# maybe not df_degree = df_degree.merge(df_family, on = [SURNAME])
# create graph object from pd df_events_attendanceframe
B = nx.from_pandas_edgelist(df=df_events_attendance,
                            source=SURNAME,
                            target=EVENT_ID)
# get the two bipartite sets - X will be the individuals, Y the events
X, Y = nx.bipartite.sets(B)
# convert the bipartite graph to a weighted graph of common participation to an event
G = nx.algorithms.bipartite.weighted_projected_graph(B, X)
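# (Note) weighted_projected_graph keeps only the nodes in X (the individuals) and connects
# two of them with a 'weight' equal to the number of events they attended together.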
# set number of events attended as node attribute
df_degree = df_degree.groupby(SURNAME, as_index=False).sum()
df_degree.set_index(SURNAME, inplace=True)
nx.set_node_attributes(
    G,
    pd.Series(df_degree["eventscount"], index=df_degree.index).to_dict(),
    "eventscount")
# set family size as attribute
df_family.set_index(SURNAME, inplace=True)
nx.set_node_attributes(
Esempio n. 38
0
import pandas as pd
import networkx as nx

df = pd.read_csv("Data/waiting_list.csv")
df.sort_values("Datetime", inplace=True)

G = nx.from_pandas_edgelist(df, "From", "To", edge_attr=True,
                            create_using=nx.MultiDiGraph)
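# (Note) edge_attr=True copies every remaining column of df onto the edges, and the
# MultiDiGraph keeps one edge per row, so repeated From -> To requests survive as
# parallel edges.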
students = df.shape[0]

# Returns candidate swapchains
swaps, chain_no = [], 1
for cycle in list(nx.simple_cycles(G)):
    n = len(cycle)
    step = 1
    for u, v in zip(cycle, cycle[1:] + [cycle[0]]):
        for i in range(students):
            if df.iloc[i,1] == u and df.iloc[i,2] == v:
                swaps.append({"chain_no": chain_no, 
                                "chain_size": n,
                                "Step": step,
                                "Student": df.iloc[i,0],
                                "From": u,
                                "To": v})
                step +=1
                break
    chain_no += 1

# Sort and filter output csv
output = pd.DataFrame(swaps)
output.sort_values(["chain_size", "chain_no", "Step"],
Esempio n. 39
0
def to_networkx_graph(data, create_using=None, multigraph_input=False):
    """Make a NetworkX graph from a known data structure.

    The preferred way to call this is automatically
    from the class constructor

    >>> d = {0: {1: {'weight':1}}} # dict-of-dicts single edge (0,1)
    >>> G = nx.Graph(d)

    instead of the equivalent

    >>> G = nx.from_dict_of_dicts(d)

    Parameters
    ----------
    data : object to be converted

        Current known types are:
         any NetworkX graph
         dict-of-dicts
         dict-of-lists
         list of edges
         Pandas DataFrame (row per edge)
         numpy matrix
         numpy ndarray
         scipy sparse matrix
         pygraphviz agraph

    create_using : NetworkX graph constructor, optional (default=nx.Graph)
        Graph type to create. If graph instance, then cleared before populated.

    multigraph_input : bool (default False)
        If True and  data is a dict_of_dicts,
        try to create a multigraph assuming dict_of_dict_of_lists.
        If data and create_using are both multigraphs then create
        a multigraph from a multigraph.

    """
    # NX graph
    if hasattr(data, "adj"):
        try:
            result = from_dict_of_dicts(data.adj,
                                        create_using=create_using,
                                        multigraph_input=data.is_multigraph())
            if hasattr(data, 'graph'):  # data.graph should be dict-like
                result.graph.update(data.graph)
            if hasattr(data, 'nodes'):  # data.nodes should be dict-like
                result._node.update((n, dd.copy()) for n, dd in data.nodes.items())
            return result
        except:
            raise nx.NetworkXError("Input is not a correct NetworkX graph.")

    # pygraphviz  agraph
    if hasattr(data, "is_strict"):
        try:
            return nx.nx_agraph.from_agraph(data, create_using=create_using)
        except:
            raise nx.NetworkXError("Input is not a correct pygraphviz graph.")

    # dict of dicts/lists
    if isinstance(data, dict):
        try:
            return from_dict_of_dicts(data, create_using=create_using,
                                      multigraph_input=multigraph_input)
        except:
            try:
                return from_dict_of_lists(data, create_using=create_using)
            except:
                raise TypeError("Input is not known type.")

    # list or generator of edges

    if (isinstance(data, (list, tuple)) or
            any(hasattr(data, attr) for attr in ['_adjdict', 'next', '__next__'])):
        try:
            return from_edgelist(data, create_using=create_using)
        except:
            raise nx.NetworkXError("Input is not a valid edge list")

    # Pandas DataFrame
    try:
        import pandas as pd
        if isinstance(data, pd.DataFrame):
            if data.shape[0] == data.shape[1]:
                try:
                    return nx.from_pandas_adjacency(data, create_using=create_using)
                except:
                    msg = "Input is not a correct Pandas DataFrame adjacency matrix."
                    raise nx.NetworkXError(msg)
            else:
                try:
                    return nx.from_pandas_edgelist(data, edge_attr=True, create_using=create_using)
                except:
                    msg = "Input is not a correct Pandas DataFrame edge-list."
                    raise nx.NetworkXError(msg)
    except ImportError:
        msg = 'pandas not found, skipping conversion test.'
        warnings.warn(msg, ImportWarning)

    # numpy matrix or ndarray
    try:
        import numpy
        if isinstance(data, (numpy.matrix, numpy.ndarray)):
            try:
                return nx.from_numpy_matrix(data, create_using=create_using)
            except:
                raise nx.NetworkXError(
                    "Input is not a correct numpy matrix or array.")
    except ImportError:
        warnings.warn('numpy not found, skipping conversion test.',
                      ImportWarning)

    # scipy sparse matrix - any format
    try:
        import scipy
        if hasattr(data, "format"):
            try:
                return nx.from_scipy_sparse_matrix(data, create_using=create_using)
            except:
                raise nx.NetworkXError(
                    "Input is not a correct scipy sparse matrix type.")
    except ImportError:
        warnings.warn('scipy not found, skipping conversion test.',
                      ImportWarning)

    raise nx.NetworkXError(
        "Input is not a known data type for conversion.")
Esempio n. 40
0
 def test_from_edgelist_one_attr(self):
     Gtrue = nx.Graph([('E', 'C', {'weight': 10}),
                       ('B', 'A', {'weight': 7}),
                       ('A', 'D', {'weight': 4})])
     G = nx.from_pandas_edgelist(self.df, 0, 'b', 'weight')
     assert_graphs_equal(G, Gtrue)
Esempio n. 41
0
import pandas as pd
import networkx as nx

wd = '/Users/ewenwang/Documents/practice_data/conversion_rate/'
file = ['round1_ijcai_18_train_20180301.txt', 'round1_ijcai_18_test_a_20180301.txt', 'round1_ijcai_18_test_b_20180418.txt']
    
print('loading...')
train = pd.read_csv(wd+file[0], sep=" ")
test_a = pd.read_csv(wd+file[1], sep=" ")
test_b = pd.read_csv(wd+file[2], sep=" ")
data = pd.concat([train, test_a, test_b])

print('graph generating...')
G_ui = nx.from_pandas_edgelist(df=data, source='user_id', target='item_id', edge_attr='is_trade', create_using=nx.MultiGraph())

pagerank = pd.DataFrame(list(nx.pagerank(G_ui).items()), columns=['node', 'pagerank'])

print('merging...')
data = data.merge(pagerank, left_on='user_id', right_on='node', how='left').merge(pagerank, left_on='item_id', right_on='node', how='left')
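# (Note) Merging the same pagerank frame twice makes pandas add its default _x/_y suffixes
# to the clashing columns, which is why the user and item scores are read back below as
# 'pagerank_x' and 'pagerank_y'.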

pagerank_data = pd.DataFrame(columns=['instance_id', 'user_pagerank', 'item_pagerank'])
pagerank_data['instance_id'] = data['instance_id']
pagerank_data['user_pagerank'] = data['pagerank_x']
pagerank_data['item_pagerank'] = data['pagerank_y']

print('saving...')
pagerank_data.to_csv(wd+'pagerank_union.txt', index=False, sep=' ')
Esempio n. 42
0
def build_network(df, edges, nodes):

    ##### BUILDING NETWORK ######

    column_edge = edges
    column_ID = nodes

    data_to_merge = df[[column_ID, column_edge]].dropna(
        subset=[column_edge]).drop_duplicates()  # select columns, remove NaN
    #data_to_merge = df[[column_ID, column_edge]]

    # To create connections between people who have the same number,
    # join data with itself on the 'ID' column.
    data_to_merge = data_to_merge.merge(data_to_merge[[
        column_ID, column_edge
    ]].rename(columns={column_ID: column_ID + "_2"}),
                                        on=column_edge)

    # By joining the data with itself, people will have a connection with themselves.
    # Remove self connections, to keep only connected people who are different.
    d = data_to_merge[~(data_to_merge[column_ID]==data_to_merge[column_ID+"_2"])] \
        .dropna()[[column_ID, column_ID+"_2", column_edge]]

    # To avoid counting each connection twice (person 1 connected to person 2 and person 2 connected to person 1),
    # we force the first ID to be "lower" than ID_2
    d.drop(d.loc[d[column_ID + "_2"] < d[column_ID]].index.tolist(),
           inplace=True)
    print('pre pro done...')

    #########################

    G = nx.from_pandas_edgelist(df=d,
                                source=column_ID,
                                target=column_ID + '_2',
                                edge_attr=column_edge)
    G.add_nodes_from(nodes_for_adding=df.class_name.tolist())
    #G.add_nodes_from(nodes_for_adding=list(df[nodes].values()))
    print('#nodes:', len(G.nodes()), 'and', '#edges:', len(G.edges()))

    degrees = [val for (node, val) in G.degree()]
    np.save("data/degrees.npy", degrees)
    degree_values = sorted(set(degrees))
    histogram = [
        degrees.count(i) / float(nx.number_of_nodes(G)) for i in degree_values
    ]

    fig, ax = plt.subplots()
    # the histogram of the data
    n, bins, patches = plt.hist(degrees, 50)
    #plt.bar(range(len(histogram)),histogram)
    #plt.xticks(range(len(histogram)), degree_values)
    plt.xlabel('Degree')
    plt.ylabel('Fraction of Nodes')
    plt.xlim(0, max(degree_values))
    #plt.xscale('log')
    plt.yscale('log')
    plt.tight_layout()
    plt.savefig("plot/nodes_degress.pdf")

    fig, ax = plt.subplots()
    plt.plot(range(len(histogram)), sorted(histogram, reverse=True), 'o')
    plt.xticks(range(len(histogram)), degree_values)
    plt.xlabel('Degree')
    plt.ylabel('Fraction of Nodes')
    plt.xscale('log')
    plt.yscale('log')
    plt.tight_layout()
    plt.savefig("plot/power_law.pdf")

    closeness_centrality = nx.closeness_centrality(G)
    #print(closeness_centrality)
    write_dict(closeness_centrality, "data/closeness_centrality.csv")
    print('closeness done...')

    betweenness_centrality = nx.betweenness_centrality(G)
    #print(betweenness_centrality)
    write_dict(betweenness_centrality, "data/betweenness_centrality.csv")
    print('betweenness done...')

    degree_centrality = nx.degree_centrality(G)
    #print(degree_centrality)
    write_dict(degree_centrality, "data/degree_centrality.csv")
    print('centrality done...')

    fig, ax = plt.subplots()
    # the histogram of the data
    nx.draw(G)
    #plt.bar(range(len(histogram)),histogram)
    #plt.xticks(range(len(histogram)), degree_values)
    plt.tight_layout()
    #plt.show()
    plt.savefig("plot/net.pdf")
def makeGraph(request, df_enron):
    G = networkx.from_pandas_edgelist(df_enron,
                                      'fromId',
                                      'toId',
                                      edge_attr=True)

    di = {
        'CEO': 1,
        'Director': 2,
        'Employee': 3,
        'In House Lawyer': 4,
        'Manager': 5,
        'Managing Director': 6,
        'President': 7,
        'Trader': 8,
        'Unknown': 9,
        'Vice President': 10
    }
    df_rejob = df_enron.replace({"fromJobtitle": di})
    df_attributes = df_enron[['fromId', 'fromJobtitle',
                              'fromEmail']].drop_duplicates()
    df_attributes.columns = ['fromId', 'job', 'fromEmail']
    df_attributesx = df_rejob[['fromId', 'fromJobtitle',
                               'fromEmail']].drop_duplicates()
    job = df_attributes.set_index('fromId').to_dict('i')
    jobx = df_attributesx.set_index('fromId').to_dict('i')
    fromEmail = df_attributes.set_index('fromEmail').to_dict('i')
    networkx.set_node_attributes(G, job)
    networkx.set_node_attributes(G, jobx)
    networkx.set_node_attributes(G, fromEmail)
    #jobs = ['Employee','Vice President','Unknown','Manager','CEO','Trader','Director','President','Managing Director','In House Lawyer']

    degrees = dict(networkx.degree(G))
    networkx.set_node_attributes(G, name='degree', values=degrees)
    adjusted_node_size = dict([(node, (degree + 5) - ((degree + 5) * 0.3))
                               for node, degree in networkx.degree(G)])
    networkx.set_node_attributes(G,
                                 name='adjusted_node_size',
                                 values=adjusted_node_size)

    size_by_this_attribute = 'adjusted_node_size'
    color_by_this_attribute = 'fromJobtitle'

    color_palette = Category10[10]

    TOOLTIPS = [
        ("Person ID", "@index"),
        ("Email", "@fromEmail"),
        ("people communicated with", "@degree"),
        ("Jobtitle", "@job"),
    ]

    graph_size = int(request.POST.get('graph_size', '720'))
    plot = figure(tooltips=TOOLTIPS,
                  tools="pan,zoom_in,wheel_zoom,save,reset,box_select,undo",
                  active_scroll='wheel_zoom',
                  x_range=Range1d(-20, 20),
                  y_range=Range1d(-20, 20),
                  title='Enron Emails',
                  plot_width=graph_size,
                  plot_height=graph_size)
    plot.axis.visible = False

    N_graph = from_networkx(G, networkx.spring_layout, scale=100)

    N_graph.node_renderer.glyph = Circle(size=size_by_this_attribute,
                                         fill_color=linear_cmap(
                                             color_by_this_attribute,
                                             color_palette, 1, 10))

    N_graph.edge_renderer.glyph = MultiLine(line_alpha=10, line_width=1)

    plot.renderers.append(N_graph)

    item_text = json.dumps(json_item(plot))

    return item_text
Esempio n. 44
0
def fullSizeGraph(request):
    import pandas as pd
    import networkx
    import matplotlib.pyplot as plt
    import numpy as np

    df_enron = filterDataByTime(request,
                                pd.read_csv(request.FILES['csv_data']))

    #from bokeh.io import output_notebook, show, save
    from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
    from bokeh.plotting import figure
    from bokeh.models.graphs import from_networkx
    from bokeh.palettes import Category10
    from bokeh.transform import linear_cmap
    from bokeh.embed import json_item

    #output_notebook() #remove this when not using notebook

    G = networkx.from_pandas_edgelist(df_enron,
                                      'fromId',
                                      'toId',
                                      edge_attr=True)

    di = {
        'CEO': 1,
        'Director': 2,
        'Employee': 3,
        'In House Lawyer': 4,
        'Manager': 5,
        'Managing Director': 6,
        'President': 7,
        'Trader': 8,
        'Unknown': 9,
        'Vice President': 10
    }
    df_rejob = df_enron.replace({"fromJobtitle": di})
    df_attributes = df_enron[['fromId', 'fromJobtitle']].drop_duplicates()
    df_attributes.columns = ['fromId', 'job']
    df_attributesx = df_rejob[['fromId', 'fromJobtitle']].drop_duplicates()
    job = df_attributes.set_index('fromId').to_dict('i')
    jobx = df_attributesx.set_index('fromId').to_dict('i')
    networkx.set_node_attributes(G, job)
    networkx.set_node_attributes(G, jobx)
    #jobs = ['Employee','Vice President','Unknown','Manager','CEO','Trader','Director','President','Managing Director','In House Lawyer']

    degrees = dict(networkx.degree(G))
    networkx.set_node_attributes(G, name='degree', values=degrees)
    adjusted_node_size = dict([(node, (degree + 5) - ((degree + 5) * 0.3))
                               for node, degree in networkx.degree(G)])
    networkx.set_node_attributes(G,
                                 name='adjusted_node_size',
                                 values=adjusted_node_size)

    size_by_this_attribute = 'adjusted_node_size'
    color_by_this_attribute = 'fromJobtitle'

    color_palette = Category10[10]

    TOOLTIPS = [
        ("Person ID", "@index"),
        ("people communicated with", "@degree"),
        ("Jobtitle", "@job"),
    ]

    plot = figure(tooltips=TOOLTIPS,
                  tools="pan,zoom_in,wheel_zoom,save,reset,box_select,undo",
                  active_scroll='wheel_zoom',
                  x_range=Range1d(-20, 20),
                  y_range=Range1d(-20, 20),
                  title='Enron Emails',
                  plot_width=950,
                  plot_height=950)
    plot.axis.visible = False

    N_graph = from_networkx(G, networkx.spring_layout, scale=100)

    N_graph.node_renderer.glyph = Circle(size=size_by_this_attribute,
                                         fill_color=linear_cmap(
                                             color_by_this_attribute,
                                             color_palette, 1, 10))

    N_graph.edge_renderer.glyph = MultiLine(line_alpha=10, line_width=1)

    plot.renderers.append(N_graph)

    item_text = json.dumps(json_item(plot))

    return django.http.JsonResponse(item_text, safe=False)
Esempio n. 45
0
for k, v in vt_copy.items():
    if len(v) == 0:
        del visualisation_table[k]
network_table = {}
i = 0
for k, v in visualisation_table.items():
    for c_to_c in v:
        network_table[i] = c_to_c
        i += 1
for k, v in network_table.items():
    network_table[k] = sorted(v)
network_df = pd.DataFrame(network_table)
network_df = network_df.transpose()
network_df.columns = ['c1', 'c2']
plt.figure(figsize=(12, 12))
g = nx.from_pandas_edgelist(network_df, source='c1', target='c2')
nx.draw_networkx(g)
plt.show()

#abstract
abstract_words = data_13_20['Abstract']
abs_dict = {}
for i, words in enumerate(abstract_words):
    if type(words) == str:
        words = words.replace(';', '')
        words = stop_word(words, stop_words)
        abs_dict[i] = words
    else:
        pass

#keywords
Esempio n. 46
0
def network_graph(yearRange, AccountToSearch):

    edge1 = pd.read_csv('edge1.csv')
    node1 = pd.read_csv('node1.csv')

    # filter the record by datetime, to enable interactive control through the input box
    edge1['Datetime'] = ""  # add empty Datetime column to edge1 dataframe
    accountSet = set()  # contain unique account
    for index in range(0, len(edge1)):
        edge1['Datetime'][index] = datetime.strptime(edge1['Date'][index],
                                                     '%d/%m/%Y')
        if edge1['Datetime'][index].year < yearRange[0] or edge1['Datetime'][
                index].year > yearRange[1]:
            edge1.drop(axis=0, index=index, inplace=True)
            continue
        accountSet.add(edge1['Source'][index])
        accountSet.add(edge1['Target'][index])

    # to define the centric point of the networkx layout
    shells = []
    shell1 = []
    shell1.append(AccountToSearch)
    shells.append(shell1)
    shell2 = []
    for ele in accountSet:
        if ele != AccountToSearch:
            shell2.append(ele)
    shells.append(shell2)

    G = nx.from_pandas_edgelist(edge1,
                                'Source',
                                'Target',
                                ['Source', 'Target', 'TransactionAmt', 'Date'],
                                create_using=nx.MultiDiGraph())
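    # (Note) Passing the column list as edge_attr stores Source, Target, TransactionAmt and
    # Date on every edge; the hover-text code further down reads them back via
    # G.edges[edge][...], and the MultiDiGraph keeps repeated transactions as parallel edges.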
    nx.set_node_attributes(
        G,
        node1.set_index('Account')['CustomerName'].to_dict(), 'CustomerName')
    nx.set_node_attributes(G,
                           node1.set_index('Account')['Type'].to_dict(),
                           'Type')
    # pos = nx.layout.spring_layout(G)
    # pos = nx.layout.circular_layout(G)
    # nx.layout.shell_layout only works for more than 3 nodes
    if len(shell2) > 1:
        pos = nx.drawing.layout.shell_layout(G, shells)
    else:
        pos = nx.drawing.layout.spring_layout(G)
    for node in G.nodes:
        G.nodes[node]['pos'] = list(pos[node])

    if len(shell2) == 0:
        traceRecode = []  # contains edge_trace, node_trace, middle_node_trace

        node_trace = go.Scatter(x=tuple([1]),
                                y=tuple([1]),
                                text=tuple([str(AccountToSearch)]),
                                textposition="bottom center",
                                mode='markers+text',
                                marker={
                                    'size': 50,
                                    'color': 'LightSkyBlue'
                                })
        traceRecode.append(node_trace)

        node_trace1 = go.Scatter(x=tuple([1]),
                                 y=tuple([1]),
                                 mode='markers',
                                 marker={
                                     'size': 50,
                                     'color': 'LightSkyBlue'
                                 },
                                 opacity=0)
        traceRecode.append(node_trace1)

        figure = {
            "data":
            traceRecode,
            "layout":
            go.Layout(title='Interactive Transaction Visualization',
                      showlegend=False,
                      margin={
                          'b': 40,
                          'l': 40,
                          'r': 40,
                          't': 40
                      },
                      xaxis={
                          'showgrid': False,
                          'zeroline': False,
                          'showticklabels': False
                      },
                      yaxis={
                          'showgrid': False,
                          'zeroline': False,
                          'showticklabels': False
                      },
                      height=600)
        }
        return figure

    traceRecode = []  # contains edge_trace, node_trace, middle_node_trace
    ############################################################################################################################################################
    colors = list(
        Color('lightcoral').range_to(Color('darkred'), len(G.edges())))
    colors = ['rgb' + str(x.rgb) for x in colors]

    index = 0
    for edge in G.edges:
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        weight = float(G.edges[edge]['TransactionAmt']) / max(
            edge1['TransactionAmt']) * 10
        trace = go.Scatter(x=tuple([x0, x1, None]),
                           y=tuple([y0, y1, None]),
                           mode='lines',
                           line={'width': weight},
                           marker=dict(color=colors[index]),
                           line_shape='spline',
                           opacity=1)
        traceRecode.append(trace)
        index = index + 1
    ###############################################################################################################################################################
    node_trace = go.Scatter(x=[],
                            y=[],
                            hovertext=[],
                            text=[],
                            mode='markers+text',
                            textposition="bottom center",
                            hoverinfo="text",
                            marker={
                                'size': 50,
                                'color': 'LightSkyBlue'
                            })

    index = 0
    for node in G.nodes():
        x, y = G.nodes[node]['pos']
        hovertext = "CustomerName: " + str(
            G.nodes[node]['CustomerName']) + "<br>" + "AccountType: " + str(
                G.nodes[node]['Type'])
        text = node1['Account'][index]
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])
        node_trace['hovertext'] += tuple([hovertext])
        node_trace['text'] += tuple([text])
        index = index + 1

    traceRecode.append(node_trace)
    ################################################################################################################################################################
    middle_hover_trace = go.Scatter(x=[],
                                    y=[],
                                    hovertext=[],
                                    mode='markers',
                                    hoverinfo="text",
                                    marker={
                                        'size': 20,
                                        'color': 'LightSkyBlue'
                                    },
                                    opacity=0)

    index = 0
    for edge in G.edges:
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        hovertext = "From: " + str(
            G.edges[edge]['Source']) + "<br>" + "To: " + str(
                G.edges[edge]['Target']) + "<br>" + "TransactionAmt: " + str(
                    G.edges[edge]['TransactionAmt']
                ) + "<br>" + "TransactionDate: " + str(G.edges[edge]['Date'])
        middle_hover_trace['x'] += tuple([(x0 + x1) / 2])
        middle_hover_trace['y'] += tuple([(y0 + y1) / 2])
        middle_hover_trace['hovertext'] += tuple([hovertext])
        index = index + 1

    traceRecode.append(middle_hover_trace)
    #################################################################################################################################################################
    figure = {
        "data":
        traceRecode,
        "layout":
        go.Layout(title='Interactive Transaction Visualization',
                  showlegend=False,
                  hovermode='closest',
                  margin={
                      'b': 40,
                      'l': 40,
                      'r': 40,
                      't': 40
                  },
                  xaxis={
                      'showgrid': False,
                      'zeroline': False,
                      'showticklabels': False
                  },
                  yaxis={
                      'showgrid': False,
                      'zeroline': False,
                      'showticklabels': False
                  },
                  height=600,
                  clickmode='event+select',
                  annotations=[
                      dict(ax=(G.nodes[edge[0]]['pos'][0] +
                               G.nodes[edge[1]]['pos'][0]) / 2,
                           ay=(G.nodes[edge[0]]['pos'][1] +
                               G.nodes[edge[1]]['pos'][1]) / 2,
                           axref='x',
                           ayref='y',
                           x=(G.nodes[edge[1]]['pos'][0] * 3 +
                              G.nodes[edge[0]]['pos'][0]) / 4,
                           y=(G.nodes[edge[1]]['pos'][1] * 3 +
                              G.nodes[edge[0]]['pos'][1]) / 4,
                           xref='x',
                           yref='y',
                           showarrow=True,
                           arrowhead=3,
                           arrowsize=4,
                           arrowwidth=1,
                           opacity=1) for edge in G.edges
                  ])
    }
    return figure
Esempio n. 47
0
 def test_from_edgelist_no_attr(self):
     Gtrue = nx.Graph([('E', 'C', {}),
                       ('B', 'A', {}),
                       ('A', 'D', {})])
     G = nx.from_pandas_edgelist(self.df, 0, 'b',)
     assert_graphs_equal(G, Gtrue)
Esempio n. 48
0
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

edge_list = pd.read_csv('stack_network_links.csv')

G = nx.from_pandas_edgelist(edge_list)

plt.figure(figsize=(20, 20))

nx.draw(G, with_labels=True,
        edge_color='grey',
        node_color='blue',
        node_size=10,
        pos=nx.spring_layout(G, k=0.2, iterations=50))
# iterations: number of spring-layout optimisation iterations; too few and nearby nodes are not pulled together.
# k: the target distance between nodes, in [0, 1]; the larger k is, the farther apart the nodes sit.

plt.show()