Beispiel #1
0
def main():
    """Compare an edge-list graph with an Erdos-Renyi graph and a grid.

    Reads the edge-list path from ``sys.argv[1]`` (tab-separated, node ids in
    columns 0 and 1) and writes ``snap.PrintInfo`` summaries for three graphs:
    the loaded graph, a random graph generated with the same node and edge
    counts, and a 220 x 250 grid.  It then changes into the ``plots``
    directory under ``sys.path[0]``, saves the degree distributions, runs the
    robustness tests and invokes gnuplot on the three plot scripts.

    Exits with status -1 when the file name argument is missing.
    """
    if len(sys.argv) != 2:
        print('File name must be provided')
        sys.exit(-1)

    filename = sys.argv[1]
    # Take the base name and drop only the final extension.  Splitting on the
    # first '.' returned an empty stem for paths such as './data/ny.txt'.
    filenameWOExt = os.path.splitext(os.path.basename(filename))[0]
    graph = snap.LoadEdgeList(snap.PNEANet, filename, 0, 1, '\t')
    snap.PrintInfo(graph, "New York", filenameWOExt + '_info.txt', False)

    # Fixed seed so the random graph is reproducible between runs.
    Rnd = snap.TRnd(123124)
    erdosRenyi = snap.GenRndGnm(snap.PNEANet, graph.GetNodes(),
                                graph.GetEdges(), True, Rnd)
    snap.PrintInfo(erdosRenyi, "Erdos-Renyi", 'erdos_renyi_info.txt', False)

    grid = snap.GenGrid(snap.PNEANet, 220, 250, False)
    snap.PrintInfo(grid, "Grid", 'grid_info.txt', False)

    printGenericInformation(graph, 'New York street network')
    printGenericInformation(erdosRenyi, 'Erdos-Renyi random graph')
    printGenericInformation(grid, 'Grid random graph')

    # Plot everything in the plots directory
    os.chdir(os.path.join(os.path.abspath(sys.path[0]), 'plots'))
    saveDegreeDistribution(graph, 'deg_dist_ny.tab')
    saveDegreeDistribution(erdosRenyi, 'deg_dist_er.tab')
    saveDegreeDistribution(grid, 'deg_dist_gr.tab')

    testRobustnessAll([graph, erdosRenyi, grid])

    # The .plt scripts are looked up relative to the plots directory entered
    # above.
    call(['gnuplot', 'deg_dist.plt'])
    call(['gnuplot', 'robustness_rand.plt'])
    call(['gnuplot', 'robustness_max.plt'])
Beispiel #2
0
def main():
    """Load the graph of one rumor, attach node attributes and dump its stats.

    All paths are derived from a hard-coded rumor number and a hard-coded
    Windows base directory.  The saved directed graph is loaded, converted to
    a ``snap.PNEANet``, passed through ``Get_Graph_with_Attributes_New`` and
    summarised with ``snap.PrintInfo`` into ``S18_5_Output.txt`` in the output
    directory.
    """
    import json
    import snap
    import graphviz
    import matplotlib.pyplot as plt
    import numpy as np
    import xlrd

    # ---- common setup -----------------------------------------------------
    rumor_number = "21"
    rumor_dir = ('D:\\Papers\\Social Network Mining\\Analysis_of_Rumor_Dataset'
                 '\\Step 18\\Rumor_' + rumor_number)
    path_input = rumor_dir + '\\Input\\'
    path_output = rumor_dir + '\\Output\\'
    rumor_stem = path_input + 'Rumor_' + rumor_number
    path_jsonl = rumor_stem + '.jsonl'
    path_graph = rumor_stem + '.graph'

    workbook_input1_D = xlrd.open_workbook(path_input + 'DATASET.xlsx',
                                           on_demand=True)

    graph_stream = snap.TFIn(path_graph)
    directed_graph = snap.TNGraph.Load(graph_stream)
    # A PNEANet can carry node attributes, which the plain directed graph
    # cannot.
    attributed_graph = snap.ConvertGraph(snap.PNEANet, directed_graph)
    attributed_graph = Get_Graph_with_Attributes_New(
        path_jsonl, attributed_graph, workbook_input1_D)

    # ---- specific part ----------------------------------------------------
    snap.PrintInfo(attributed_graph, "Python type PNEANet",
                   path_output + "S18_5_Output.txt", False)
def transform_directed_to_undirected():
    """Return an undirected (``snap.PUNGraph``) conversion of the global ``G``.

    As a side effect the summary produced by ``snap.PrintInfo`` is written to
    ``Tweets_UN_info.txt`` in the current working directory.

    Returns:
        The converted undirected graph.
    """
    GUn = snap.ConvertGraph(snap.PUNGraph, G)
    snap.PrintInfo(GUn, "Tweets UN stats", "Tweets_UN_info.txt", False)
    # The info file used to be read back here, but the contents were never
    # used; that dead read (and its unguarded open/close) has been dropped.
    return GUn
def preferential_attachment():
    """Generate a preferential-attachment graph and print its summary.

    The generated graph has as many nodes as the undirected conversion of the
    global graph, and each new node gets ``int(average_degree())`` edges.  The
    ``snap.PrintInfo`` summary is written to ``Tweets_PA-info.txt`` and then
    echoed to stdout.
    """
    GUn = transform_directed_to_undirected()
    AverageDegree = average_degree()
    Rnd = snap.TRnd()
    # GenPrefAttach takes an integer out-degree, so the average is truncated.
    GPA = snap.GenPrefAttach(GUn.GetNodes(), int(AverageDegree), Rnd)
    snap.PrintInfo(GPA, "Tweets PA Stats", "Tweets_PA-info.txt", False)
    # The context manager closes the file even if reading or printing fails.
    with open('Tweets_PA-info.txt', 'r') as f:
        print(f.read())
def erdos_renyi():
    """Generate an Erdos-Renyi random graph sized like ``G`` and print its summary.

    The random graph is a ``snap.PNGraph`` with the same node and edge counts
    as the global graph ``G``.  Its ``snap.PrintInfo`` summary is written to
    ``Tweets_Random-info.txt`` and then echoed to stdout.
    """
    # Called for its side effect only (it writes Tweets_UN_info.txt); the
    # returned undirected graph is not needed for the generation below.
    # NOTE(review): the sizes come from the directed G, not from the
    # undirected conversion -- confirm this is intended.
    transform_directed_to_undirected()
    # Erdos-Renyi random graph
    GER = snap.GenRndGnm(snap.PNGraph, G.GetNodes(), G.GetEdges())
    snap.PrintInfo(GER, "Tweets Random Stats", "Tweets_Random-info.txt", False)
    # The context manager closes the file even if reading or printing fails.
    with open('Tweets_Random-info.txt', 'r') as f:
        print(f.read())
def biggest_connected_component_on_the_network():
    """Print weakly-connected-component statistics for the global graph ``G``.

    Prints the number of weakly connected components and the size of the
    largest one, then writes the ``snap.PrintInfo`` summary of ``G`` to
    ``tweet_stats_extended.txt``.
    """
    WccV = snap.TIntPrV()
    snap.GetWccSzCnt(G, WccV)
    # Each pair is (component size, number of components of that size), so
    # the dictionary maps size -> count.
    con_comp = {comp.GetVal1(): comp.GetVal2() for comp in WccV}
    print("Connected components info.\n")
    # The component total is the sum of the per-size counts; WccV.Len() only
    # gives the number of distinct sizes.
    print("# of connected component", sum(con_comp.values()))
    # The biggest component is the largest key (a size), not the largest
    # value (a count).  An empty graph has no components at all.
    biggest = max(con_comp) if con_comp else 0
    print("Biggest connected component has size of:", biggest)
    snap.PrintInfo(G, "tweet Information", "tweet_stats_extended.txt", False)
def main():
    """
    See usage message in module header block
    """
    get_subgraph = False  # if True discard nodes without attribute data
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d")
    except:
        usage(sys.argv[0])
    for opt, arg in opts:
        if opt == "-d":
            get_subgraph = True
        else:
            usage(sys.argv[0])

    if len(args) != 1:
        usage(sys.argv[0])

    data_dir = args[0]

    outputdir = '.'

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip'
    G = load_physician_referral_data(datazipfile)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # Remove loops (self-edges).
    # G is a PNGraph so multiple edges not allowed in this type anyway.
    snap.DelSelfEdges(G)
    snap.PrintInfo(G)

    # specify ordered nodelist to map sequential ids to original ids consistent
    nodelist = [node.GetId() for node in G.Nodes()]

    graph_filename = outputdir + os.path.sep + "physician_referall_arclist" + os.path.extsep + "txt"
    nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt"
    write_graph_file(graph_filename, G, nodelist)
    write_subgraph_nodeids(nodeid_filename, nodelist)
def node_rewiring():
    """Rewire the undirected graph and print the summary of the result.

    Calls ``snap.GenRewire`` on the undirected conversion of the global graph
    with the argument 1000, writes the ``snap.PrintInfo`` summary to
    ``Tweets_Rewire-info.txt`` and echoes it to stdout.
    """
    GUn = transform_directed_to_undirected()
    # Node Rewiring
    Rnd = snap.TRnd()
    GRW = snap.GenRewire(GUn, 1000, Rnd)
    snap.PrintInfo(GRW, "Tweets Rewire Stats", "Tweets_Rewire-info.txt", False)

    # The context manager closes the file even if reading or printing fails.
    with open('Tweets_Rewire-info.txt', 'r') as f:
        print(f.read())
def configuration_model():
    """Build a configuration-model graph and print its summary.

    The degree sequence is taken from the undirected conversion of the global
    graph and passed to ``snap.GenConfModel``.  The ``snap.PrintInfo`` summary
    is written to ``Tweets_ConfModel-info.txt`` and echoed to stdout.
    """
    GUn = transform_directed_to_undirected()
    GUnDegSeqV = snap.TIntV()
    snap.GetDegSeqV(GUn, GUnDegSeqV)

    Rnd = snap.TRnd()
    GConfModel = snap.GenConfModel(GUnDegSeqV, Rnd)
    snap.PrintInfo(GConfModel, "Tweets ConfModel Stats",
                   "Tweets_ConfModel-info.txt", False)
    # The context manager closes the file even if reading or printing fails.
    with open('Tweets_ConfModel-info.txt', 'r') as f:
        print(f.read())
Beispiel #10
0
def get_props(g, gname, out_path="", fast=False, to_file=True):
    """Write a summary of graph ``g`` and return its degree distribution.

    Args:
        g: graph passed to ``snap.PrintInfo`` and ``get_deg_dist``.
        gname: name used as the prefix of the generated file names.
        out_path: directory for the output files; the default ``""`` means
            the current working directory.
        fast: forwarded as the last argument of ``snap.PrintInfo``.
        to_file: when true, the description goes to ``<gname>_desc.txt`` and
            the degree distribution to ``<gname>_deg_dist.csv``; otherwise
            the description is written to ``/dev/stdout`` and nothing is
            saved.

    Returns:
        Whatever ``get_deg_dist(g)`` returns (it must provide ``to_csv`` when
        ``to_file`` is true).
    """
    import os

    if to_file:
        # os.path.join keeps an empty out_path relative; plain "%s/%s"
        # formatting produced "/<gname>_desc.txt" at the filesystem root.
        desc_f = os.path.join(out_path, "%s_desc.txt" % gname)
    else:
        desc_f = "/dev/stdout"

    snap.PrintInfo(g, "description", desc_f, fast)

    all_deg = get_deg_dist(g)

    if to_file:
        deg_f = os.path.join(out_path, "%s_deg_dist.csv" % gname)
        all_deg.to_csv(deg_f, index=False)

    return all_deg
def getGraphicalGraphInfo(graph_name, conn, cur):
    """Build a table of basic information about a materialised graph.

    Looks up the graph type in ``my_matgraphs``, loads the edge list with
    snap, dumps ``snap.PrintInfo`` into a temporary file and turns every
    ``key: value`` line of that file into a table row.  The view definition
    from ``pg_matviews`` is appended when available.

    Args:
        graph_name: name of the materialised graph (surrounding whitespace is
            stripped).
        conn: database connection; ``commit()`` is called after each query.
        cur: cursor on ``conn`` used to run the queries.

    Returns:
        A ``QueryResult`` of type ``"table"`` holding a ``TableResult``, or of
        type ``"string"`` with an error message when the graph is unknown.
    """
    queryResult = QueryResult()

    graph_name = graph_name.strip()
    graph_path = helper.getGraph(graph_name)

    # The name is passed as a bound parameter instead of being formatted into
    # the SQL text, so quotes in graph_name cannot alter the statement.
    # NOTE(review): assumes the driver uses the "%s" paramstyle (as
    # PostgreSQL drivers such as psycopg2 do) -- confirm.
    cur.execute("select graphType from my_matgraphs where matgraphname = %s;", (graph_name,))
    conn.commit()
    one_row = cur.fetchone()
    if one_row is None:
        queryResult.setType("string")
        queryResult.setContent("Can't find this graph in my_matgraphs")
        return queryResult

    graph_type = one_row[0].strip()
    snap_graph_type = snap.PNGraph if graph_type == "digraph" else snap.PUNGraph
    graph = snap.LoadEdgeList(snap_graph_type, graph_path, 0, 1)
    tmpGraphDir = "/dev/shm/RG_Tmp_Graph/"
    tmpGraphInfoPath = tmpGraphDir + graph_name + '_info'
    # snap print info
    snap.PrintInfo(graph, "Graph Type", tmpGraphInfoPath)

    tableHeaderLst = ['Attribute', 'Value']
    rowsContent = []
    with open(tmpGraphInfoPath) as f:
        for line in f:
            # Split on the first colon only, so values that contain ':' stay
            # intact; lines without a colon (e.g. blank lines) are skipped
            # instead of raising IndexError.
            key, sep, value = line.partition(':')
            if not sep:
                continue
            key = key.strip()
            value = value.strip()
            if key == "Graph Type":
                # Replace snap's own description with the catalogue type.
                value = 'Directed' if graph_type == "digraph" else "Undirected"
            rowsContent.append([key, value])

    # definition
    cur.execute("select definition from pg_matviews where matviewname = %s;", (graph_name,))
    conn.commit()
    one_row = cur.fetchone()
    if one_row is not None:
        rowsContent.append(['Definition', one_row[0].strip()])

    queryResult.setType("table")
    queryResult.setContent(TableResult(tableHeaderLst, rowsContent))

    return queryResult
    def print_statistics(self, outfile_name):
        print 'Writing to file:', outfile_name
        snap.PrintInfo(self.Graph, 'Python type TUNGraph', outfile_name, False)

        with open(outfile_name, 'a') as f:
            f.write('\n####More information')

        max_degree_node = snap.GetMxDegNId(self.Graph)
        for artist_id in self.ids:
            if self.ids[artist_id] == max_degree_node:
                print artist_id

        # These may throw gnuplot errors; if so, edit the generated .plt files to correct the errors and run
        # gnuplot from terminal. (May need to set terminal to svg instead of png depending on your gnuplot
        # installation.)
        snap.PlotOutDegDistr(self.Graph, 'out_degree_distr',
                             'Out-degree distribution')
        snap.PlotInDegDistr(self.Graph, 'in_degree_distr',
                            'In-degree distribution')
def visualize_k_random_users(k, fanout, fanout_samples, graph):
    """
    OUTDATED
    @params: [k (int), fanout (int), fanout_samples (int), graph (snap graph)]
    @returns: None

    Draws a random edge sample of size k from graph, expands it through
    get_k_graph_egonet (fanout / fanout_samples limit the expansion so the
    sample stays small), prints the sample's summary to stdout and shows it
    with networkx and matplotlib.
    """
    edge_sample = snap.GetRndESubGraph(graph, k)
    sample_graph = get_k_graph_egonet(fanout, fanout_samples, edge_sample, graph)
    snap.PrintInfo(sample_graph, 'Sampled Graph Information', '/dev/stdout', False)

    # Mirror the snap sample in networkx: all nodes first, then all edges.
    nx_graph = nx.Graph()
    nx_graph.add_nodes_from(node.GetId() for node in sample_graph.Nodes())
    nx_graph.add_edges_from(
        (edge.GetSrcNId(), edge.GetDstNId()) for edge in sample_graph.Edges()
    )
    edges_list = list(nx_graph.edges())

    layout = nx.spring_layout(nx_graph)
    nx.draw_networkx_nodes(nx_graph, layout, node_color='b', node_size=10, alpha=0.6)
    nx.draw_networkx_edges(nx_graph, layout, edgelist=edges_list, arrows=False)
    plt.show()
def analyze_network(k=1000, fanout=1, fanout_samples=1,
                    graph_in_path='bad_actors.graph'):
    """
    @params: [k (int), fanout (int), fanout_samples (int), graph_in_path (str)]
    @returns: None

    Loads a saved TNEANet from 'graph_in_path', prints its summary and the
    node count of its largest strongly connected component, then hands the
    graph to visualize_k_random_users together with k, fanout and
    fanout_samples.
    """
    graph_stream = snap.TFIn(graph_in_path)
    graph = snap.TNEANet.Load(graph_stream)
    snap.PrintInfo(graph, 'Basic Graph Information', '/dev/stdout', False)

    largest_scc = snap.GetMxScc(graph)
    scc_node_count = largest_scc.GetNodes()
    print('Nodes in largest strongly-connected subcomponent: %d' % scc_node_count)

    visualize_k_random_users(k, fanout, fanout_samples, graph)
Beispiel #15
0
def main(args):
    """Run the review-graph analyses and collect the results in one info file.

    ``args.ub_review_edges`` is the edge-list file and ``args.graph_info`` the
    file that receives the ``snap.PrintInfo`` summary; the same path is then
    handed to each of the follow-up analyses.
    """
    edges_path = args.ub_review_edges
    info_path = args.graph_info

    # Undirected graph built from columns 0 and 1 of the edge list.
    G = snap.LoadEdgeList(snap.PUNGraph, edges_path, 0, 1)

    # General summary.
    snap.PrintInfo(G, "yelp-review-stats", info_path, False)

    # Power-law fit of the degree distribution.
    plfitDegreeDistr(G, info_path)

    # Clustering coefficient.
    clustCf(G, info_path)

    # Weakly connected components.
    topWCC(G, info_path)

    # Number of users, businesses and reviews.
    load_graph(edges_path, info_path)
import snap

# Load the edge list as a snap.PNGraph, taking node ids from columns 0 and 1.
G = snap.LoadEdgeList(snap.PNGraph, 'wiki-Vote.txt', 0, 1)

# Exercise snap.PrintInfo with different argument combinations.  A numbered
# marker line is printed before each call so the outputs can be told apart.

# 1: description only.
print " 1 ----------"
snap.PrintInfo(G, "a")
# 2: description and output file name.
print " 2 ----------"
snap.PrintInfo(G, "a", "file.txt")
# 3: description, output file name and an explicit False as fourth argument.
print " 3 ----------"
snap.PrintInfo(G, "a", "file.txt", False)

# 4: empty output file name.
print " 4 ----------"
snap.PrintInfo(G, "a", "")

# 5 (disabled): empty output file name together with the fourth argument.
#print " 5 ----------"
#snap.PrintInfo(G, "a", "", False)
def main(argv):
    """Combine review files, parse items and reviews, and save the graphs.

    Args:
        argv: command-line style list:
            ``[prog, directory, directoryReviews, directoryItems, item,
            goodRating, year, ...]``.  The directory arguments are
            concatenated directly with file names, so they are expected to
            end with a path separator.  ``argv`` is not modified.

    Review files in ``directoryReviews`` whose name contains any of the given
    years are concatenated into ``reviews_<item>_combined.json`` and gzipped.
    ``parseItems`` and ``parseReviews`` are then run, and the module-level
    graphs and dictionaries are written to ``directory``.
    NOTE(review): GItems, GUsers, GCombined, asinItems, nodeItems and
    reviewerIdUsers are presumably populated by those two calls -- confirm.

    Raises:
        IndexError: when fewer than six arguments are supplied.
        ValueError: when ``goodRating`` is not an integer.
    """
    # Read the arguments by position rather than popping them, so the
    # caller's list (typically sys.argv) is left untouched.
    directory = argv[1]
    directoryReviews = argv[2]
    directoryItems = argv[3]
    item = argv[4]
    goodRating = int(argv[5])
    yearList = list(argv[6:])

    combinedName = 'reviews_' + item + '_combined.json'
    combinedPath = directoryReviews + combinedName
    combinedGzPath = combinedPath + '.gz'

    inFiles = [
        f for f in listdir(directoryReviews)
        if isfile(join(directoryReviews, f))
    ]

    # any() lists a file once even when its name matches several years; the
    # nested loops appended it once per matching year and duplicated its
    # reviews.  The outputs of a previous run are skipped so the combined
    # file is never fed back into itself.
    ownOutputs = (combinedName, combinedName + '.gz')
    fileList = [
        directoryReviews + f for f in inFiles
        if f not in ownOutputs and any(y in f for y in yearList)
    ]

    with open(combinedPath, 'w') as outfile:
        for fname in fileList:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

    with open(combinedPath, 'rb') as f_in, gzip.open(combinedGzPath,
                                                     'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Parsing Items
    parseItems(directoryItems + 'meta_' + item + '.json.gz', directory)

    snap.PrintInfo(GItems, 'GItems Information')

    # Saving GItems
    snap.SaveEdgeList(GItems, directory + 'Edge_List_Items_' + item + '.txt')

    with open(directory + 'Dictionary_Items_' + item + '.txt', 'w') as f1:
        json.dump(asinItems, f1)

    with open(directory + 'Dictionary_Items_Nodes_' + item + '.txt',
              'w') as f3:
        json.dump(nodeItems, f3)

    userItemsFileName = directory + '_User_Item_' + item + '.txt'

    # Parsing Reviews
    parseReviews(combinedGzPath, goodRating, userItemsFileName, directory)

    snap.PrintInfo(GUsers, 'GUsers Information')

    # Saving GUsers
    snap.SaveEdgeList(GUsers, directory + 'Edge_List_Users_' + item + '.txt')

    with open(directory + 'Dictionary_Users_' + item + '.txt', 'w') as f2:
        json.dump(reviewerIdUsers, f2)

    snap.PrintInfo(GCombined, 'GCombined Information')

    snap.SaveEdgeList(GCombined,
                      directory + 'Edge_List_Combined_' + item + '.txt')
def _plot_files_missing(prefix, name):
    """Return True if any of ``<prefix>.<name>.plt/.tab/.png`` does not exist."""
    return not all(
        os.path.isfile(prefix + "." + name + "." + ext)
        for ext in ("plt", "tab", "png")
    )


def _label_top_nodes(top_n, id_pkg_dict):
    """Turn (node id, score) pairs into (pkg, score) pairs in reversed order."""
    return [(id_pkg_dict[pair[0]], pair[1]) for pair in reversed(top_n)]


def compute_graph_statistics(graph_path, overwrite, compute_betweenness=False):
    """Compute, cache and plot statistics for a saved ``snap.TNEANet`` graph.

    Results are stored next to the graph file: scalar and top-N statistics in
    ``<name>_statistics.json``, the ``snap.PrintInfo`` summary in
    ``<name>_main_statistics.txt``, intermediate hashtables in data files and
    plots in ``.eps`` / ``.plt`` / ``.tab`` / ``.png`` files.  A step is
    skipped when its outputs already exist, unless ``overwrite`` is true.

    The process working directory is changed to the directory of the graph
    file and is not restored.

    Args:
        graph_path: path of the ``.graph`` file; nodes must carry a ``"pkg"``
            string attribute.
        overwrite: when true, recompute every statistic and plot.
        compute_betweenness: when true, also compute betweenness centrality.
    """
    graph_abs_path = os.path.abspath(graph_path)
    graph_name = os.path.basename(graph_abs_path).replace(".graph", "")
    fin = snap.TFIn(graph_abs_path)
    graph = snap.TNEANet.Load(fin)

    # rebuild the id => pkg dictionary
    id_pkg_dict = {}
    for node in graph.Nodes():
        id_pkg_dict[node.GetId()] = graph.GetStrAttrDatN(node.GetId(), "pkg")
    directory = os.path.dirname(graph_abs_path)
    json_path = os.path.join(directory, graph_name + "_statistics.json")
    if os.path.isfile(json_path):
        with open(json_path, "r") as f:
            statistics = json.load(f, object_pairs_hook=OrderedDict)
    else:
        statistics = OrderedDict()

    # snap.py doesn't support absolute paths for some operations. Let's cd to the directory
    os.chdir(directory)

    # general statistics
    output = os.path.join(directory, graph_name + "_main_statistics.txt")
    if not os.path.isfile(output) or overwrite:
        print("{0} Computing general statistics".format(datetime.datetime.now()))
        snap.PrintInfo(graph, "Play Store Graph -- main statistics", output, False)

    # info about the nodes with the max in degree
    if "max_in_degree" not in statistics or overwrite:
        print("{0} Computing max indegree".format(datetime.datetime.now()))
        max_in_deg_id = snap.GetMxInDegNId(graph)
        iterator = graph.GetNI(max_in_deg_id)
        max_in_deg = iterator.GetInDeg()
        max_in_deg_pkg = graph.GetStrAttrDatN(max_in_deg_id, "pkg")
        statistics["max_in_degree"] = max_in_deg
        statistics["max_in_degree_id"] = max_in_deg_id
        statistics["max_in_degree_pkg"] = max_in_deg_pkg

    # info about the nodes with the max out degree
    if "max_out_degree" not in statistics or overwrite:
        print("{0} Computing max outdegree".format(datetime.datetime.now()))
        max_out_deg_id = snap.GetMxOutDegNId(graph)
        iterator = graph.GetNI(max_out_deg_id)
        max_out_deg = iterator.GetOutDeg()
        max_out_deg_pkg = graph.GetStrAttrDatN(max_out_deg_id, "pkg")
        statistics["max_out_degree"] = max_out_deg
        statistics["max_out_degree_id"] = max_out_deg_id
        statistics["max_out_degree_pkg"] = max_out_deg_pkg

    # pagerank statistics
    output = graph_name + "_topNpagerank.eps"
    if not os.path.isfile(output) or "top_n_pagerank" not in statistics or overwrite:
        print("{0} Computing top 20 nodes with highest pagerank".format(datetime.datetime.now()))
        data_file = graph_name + "_pageranks"
        prank_hashtable = snap.TIntFltH()
        if not os.path.isfile(data_file) or overwrite:
            # Damping Factor: 0.85, Convergence difference: 1e-4, MaxIter: 100
            snap.GetPageRank(graph, prank_hashtable, 0.85)
            fout = snap.TFOut(data_file)
            prank_hashtable.Save(fout)
        else:
            # Reuse the hashtable saved by a previous run.
            fin = snap.TFIn(data_file)
            prank_hashtable.Load(fin)

        top_n = get_top_nodes_from_hashtable(prank_hashtable)
        top_n.sort(key=itemgetter(1))
        if "top_n_pagerank" not in statistics or overwrite:
            statistics["top_n_pagerank"] = _label_top_nodes(top_n, id_pkg_dict)

        if not os.path.isfile(output) or overwrite:
            # let's build a subgraph induced on the top 20 pagerank nodes
            subgraph = get_subgraph(graph, [x[0] for x in top_n])
            labels_dict = get_labels_subset(id_pkg_dict, subgraph)
            values = snap_hashtable_to_dict(prank_hashtable, [x[0] for x in top_n])
            plot_subgraph_colored(subgraph, labels_dict, values, "PageRank",
                                  "Play Store Graph - top 20 PageRank nodes", output, "autumn_r")

    # betweenness statistics
    output = graph_name + "_topNbetweenness.eps"
    # The cached result is stored under "top_n_betweenness".  Testing for the
    # never-written key "betweenness" made this branch recompute every time.
    if compute_betweenness and (
            not os.path.isfile(output) or "top_n_betweenness" not in statistics or overwrite):
        print("{0} Computing top 20 nodes with highest betweenness".format(datetime.datetime.now()))
        data_file1 = graph_name + "_node_betweenness"
        data_file2 = graph_name + "_edge_betweenness"
        node_betwenness_hashtable = snap.TIntFltH()
        edge_betwenness_hashtable = snap.TIntPrFltH()
        if not os.path.isfile(data_file1) or not os.path.isfile(data_file2) or overwrite:
            snap.GetBetweennessCentr(graph, node_betwenness_hashtable, edge_betwenness_hashtable, 0.85, True)
            fout = snap.TFOut(data_file1)
            node_betwenness_hashtable.Save(fout)
            fout = snap.TFOut(data_file2)
            edge_betwenness_hashtable.Save(fout)

        else:
            fin = snap.TFIn(data_file1)
            node_betwenness_hashtable.Load(fin)
            fin = snap.TFIn(data_file2)
            edge_betwenness_hashtable.Load(fin)  # unused, as now

        top_n = get_top_nodes_from_hashtable(node_betwenness_hashtable)
        top_n.sort(key=itemgetter(1))
        if "top_n_betweenness" not in statistics or overwrite:
            statistics["top_n_betweenness"] = _label_top_nodes(top_n, id_pkg_dict)

        if not os.path.isfile(output) or overwrite:
            # let's build a subgraph induced on the top 20 betweenness nodes
            subgraph = get_subgraph(graph, [x[0] for x in top_n])
            labels_dict = get_labels_subset(id_pkg_dict, subgraph)
            values = snap_hashtable_to_dict(node_betwenness_hashtable, [x[0] for x in top_n])
            plot_subgraph_colored(subgraph, labels_dict, values, "Betweenness",
                                  "Play Store Graph - top 20 Betweenness nodes", output)

    # HITS statistics
    output_hub = graph_name + "_topNhitshubs.eps"
    output_auth = graph_name + "_topNhitsauth.eps"
    if not os.path.isfile(output_hub) or not os.path.isfile(output_auth) or "top_n_hits_hubs" not in statistics \
            or "top_n_hits_authorities" not in statistics or overwrite:
        print("{0} Computing top 20 HITS hubs and auths".format(datetime.datetime.now()))
        data_file1 = graph_name + "_hits_hubs"
        data_file2 = graph_name + "_hits_auth"
        hubs_hashtable = snap.TIntFltH()
        auth_hashtable = snap.TIntFltH()
        if not os.path.isfile(data_file1) or not os.path.isfile(data_file2) or overwrite:
            # MaxIter = 20
            snap.GetHits(graph, hubs_hashtable, auth_hashtable, 20)
            fout = snap.TFOut(data_file1)
            hubs_hashtable.Save(fout)
            fout = snap.TFOut(data_file2)
            auth_hashtable.Save(fout)

        else:
            fin = snap.TFIn(data_file1)
            hubs_hashtable.Load(fin)
            fin = snap.TFIn(data_file2)
            auth_hashtable.Load(fin)

        top_n_hubs = get_top_nodes_from_hashtable(hubs_hashtable)
        top_n_hubs.sort(key=itemgetter(1))
        if "top_n_hits_hubs" not in statistics or overwrite:
            statistics["top_n_hits_hubs"] = _label_top_nodes(top_n_hubs, id_pkg_dict)

        top_n_auth = get_top_nodes_from_hashtable(auth_hashtable)
        top_n_auth.sort(key=itemgetter(1))
        if "top_n_hits_authorities" not in statistics or overwrite:
            statistics["top_n_hits_authorities"] = _label_top_nodes(top_n_auth, id_pkg_dict)

        if not os.path.isfile(output_hub) or not os.path.isfile(output_auth) or overwrite:
            nodes_subset = set()
            for pair in top_n_hubs:
                nodes_subset.add(pair[0])
            for pair in top_n_auth:
                nodes_subset.add(pair[0])

            # let's build a subgraph induced on the top N HITS auths and hubs nodes
            subgraph = get_subgraph(graph, nodes_subset)
            labels_dict = get_labels_subset(id_pkg_dict, subgraph)
            values = snap_hashtable_to_dict(hubs_hashtable, nodes_subset)
            values2 = snap_hashtable_to_dict(auth_hashtable, nodes_subset)
            plot_subgraph_colored(subgraph, labels_dict, values, "HITS - Hub Index",
                                  "Play Store Graph - top 20 HITS hubs + top 20 HITS authorities", output_hub, "bwr")
            plot_subgraph_colored(subgraph, labels_dict, values2, "HITS - Authority Index",
                                  "Play Store Graph - top 20 HITS hubs + top 20 HITS authorities", output_auth,
                                  "bwr_r")

    # Each snap plot below leaves <prefix>.<output>.plt/.tab/.png behind; a
    # plot is regenerated when any of the three files is missing.

    # indegree histogram
    output = graph_name + "_indegree"
    if _plot_files_missing("inDeg", output) or overwrite:
        print("{0} Computing indegree distribution".format(datetime.datetime.now()))
        snap.PlotInDegDistr(graph, output, "Play Store Graph - in-degree Distribution")

    # outdegree histogram
    output = graph_name + "_outdegree"
    if _plot_files_missing("outDeg", output) or overwrite:
        print("{0} Computing outdegree distribution".format(datetime.datetime.now()))
        snap.PlotOutDegDistr(graph, output, "Play Store Graph - out-degree Distribution")

    # strongly connected components print
    output = graph_name + "_scc"
    if _plot_files_missing("scc", output) or overwrite:
        print("{0} Computing scc distribution".format(datetime.datetime.now()))
        snap.PlotSccDistr(graph, output, "Play Store Graph - strongly connected components distribution")

    # weakly connected components print
    output = graph_name + "_wcc"
    if _plot_files_missing("wcc", output) or overwrite:
        print("{0} Computing wcc distribution".format(datetime.datetime.now()))
        snap.PlotWccDistr(graph, output, "Play Store Graph - weakly connected components distribution")

    # clustering coefficient distribution
    output = graph_name + "_cf"
    if _plot_files_missing("ccf", output) or overwrite:
        print("{0} Computing cf distribution".format(datetime.datetime.now()))
        snap.PlotClustCf(graph, output, "Play Store Graph - clustering coefficient distribution")

    # shortest path distribution
    output = graph_name + "_hops"
    if _plot_files_missing("hop", output) or overwrite:
        print("{0} Computing shortest path distribution".format(datetime.datetime.now()))
        snap.PlotHops(graph, output, "Play Store Graph - Cumulative Shortest Paths (hops) distribution", True)

    # k-core edges distribution
    output = graph_name + "_kcore_edges"
    if _plot_files_missing("coreEdges", output) or overwrite:
        print("{0} Computing k-core edges distribution".format(datetime.datetime.now()))
        snap.PlotKCoreEdges(graph, output, "Play Store Graph - K-Core edges distribution")

    # k-core nodes distribution
    output = graph_name + "_kcore_nodes"
    if _plot_files_missing("coreNodes", output) or overwrite:
        print("{0} Computing k-core nodes distribution".format(datetime.datetime.now()))
        snap.PlotKCoreNodes(graph, output, "Play Store Graph - K-Core nodes distribution")

    with open(json_path, 'w') as outfile:
        json.dump(statistics, outfile, indent=2)
Beispiel #19
0
from __future__ import division
"""Run some exploratory analysis on Twitter replies network.
Some of the code is adapted from the snap.py tutorial."""

import snap
from twython import Twython
import sys

# Script entry point: prints basic statistics of a Twitter replies network
# whose edge-list file is given as the first command-line argument.
if __name__ == '__main__':
    # The first two whitespace-separated tokens of twitapikeys.txt are used
    # as the consumer key and the consumer secret.
    CONSUMER_KEY, CONSUMER_SECRET = open('twitapikeys.txt').read().split()[:2]
    twitterapi = Twython(CONSUMER_KEY, CONSUMER_SECRET)

    filename = sys.argv[1]
    # Load the edge list as a snap.PNGraph, node ids from columns 0 and 1.
    repliesgraph = snap.LoadEdgeList(snap.PNGraph, filename, 0, 1)
    snap.PrintInfo(repliesgraph, "Twitter replies network")
    print

    # Reciprocity: bidirectional pairs are counted twice and expressed as a
    # percentage of the unique directed edges.  The module-level
    # `from __future__ import division` makes `/` a true division here.
    num_dir_edges = snap.CntUniqDirEdges(repliesgraph)
    print "{0:.2f}% of directed edges are reciprocal".format(
        snap.CntUniqBiDirEdges(repliesgraph) * 2 * 100 / num_dir_edges)

    # Clustering coefficient, printed as a percentage.
    print "The clustering coefficient is {0:.2f}%".format(
        snap.GetClustCf(repliesgraph) * 100)

    # Strongly and weakly connected components: CntV receives the result of
    # GetSccSzCnt.
    # NOTE(review): the fragment is cut off after the loop header below; the
    # loop body is not part of the visible source.
    CntV = snap.TIntPrV()
    snap.GetSccSzCnt(repliesgraph, CntV)
    num_cc = 0
    for p in CntV:
Beispiel #20
0
import snap

# Load the vote graph (node ids in columns 0 and 1) and save its summary.
G = snap.LoadEdgeList(snap.PNGraph, "Wiki-Vote.txt", 0, 1)
snap.PrintInfo(G, "votes Stats", "votes-info.txt", False)

# Id of the node with the highest degree.
NId1 = snap.GetMxDegNId(G)
print("Node ID with Maximum-Degree: %d" % NId1)

# Size distribution of the strongly connected components.
ComponentDist = snap.TIntPrV()
snap.GetSccSzCnt(G, ComponentDist)
for comp in ComponentDist:
    print("Size: %d - Number of Components: %d" % (comp.GetVal1(),
                                                   comp.GetVal2()))

# Largest strongly connected component.
print("Strongly Connected Component - Maximum size:", snap.GetMxSccSz(G))

# Size distribution of the weakly connected components.
CompDist = snap.TIntPrV()
snap.GetWccSzCnt(G, CompDist)
for comp in CompDist:
    print("Size: %d - Number of Components: %d" % (comp.GetVal1(),
                                                   comp.GetVal2()))

# Largest weakly connected component.
print("Weakly Connected Component - Maximum size:", snap.GetMxWccSz(G))

# Out-degree distribution plot.
snap.PlotOutDegDistr(G, "Wiki Votes", "Wiki-Votes Out Degree")
def get_graph_info(file_path, output_path):
    """Load the graph at ``file_path`` and write its summary to ``output_path``.

    ``load_graph`` returns a pair; only the first element (the graph) is used.
    """
    graph, _unused = load_graph(file_path)
    snap.PrintInfo(graph, 'Python type PNGraph', output_path, False)
def main():
    """
    See usage message in module header block
    """
    directed = True
    try:
        opts, args = getopt.getopt(sys.argv[1:], "")
    except:
        usage(sys.argv[0])
    for opt, arg in opts:
        usage(sys.argv[0])

    if len(args) != 5:
        usage(sys.argv[0])

    data_dir = args[0]
    num_samples = int(args[1])
    num_seeds = int(args[2])
    num_waves = int(args[3]) - 1  # -1 for consistency with SPNet
    outputdir = args[4]

    print "directed:", directed
    print "number of samples:", num_samples
    print "number of seeds:", num_seeds
    print "number of waves:", num_waves
    print "output directory:", outputdir

    if not os.path.exists(outputdir):
        os.mkdir(outputdir)

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip'
    G = load_physician_referral_data(datazipfile)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # get num_samples * num_seeds distinct random seed nodes (sample without replacement)
    # and convert to list of lists where each list is seed set for one sample
    allseeds = random.sample([node.GetId() for node in G.Nodes()],
                             num_samples * num_seeds)
    seedsets = [
        allseeds[i:i + num_seeds] for i in range(0, len(allseeds), num_seeds)
    ]

    sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt"
    sampledesc_f = open(sampledesc_filename, 'w')

    for i in range(num_samples):
        sys.stdout.write('generating snowball sample ' + str(i + 1) + '... ')
        start = time.time()
        # have to convert seedset to TIntV for SNAP
        seedsVec = snap.TIntV()
        for nodeid in seedsets[i]:
            seedsVec.Add(nodeid)
        Gsample0 = snowball_sample(G, num_waves, seedsVec)
        #print 'XXX',Gsample0.GetIntAttrDatN(Gsample0.GetRndNId(), "zone")#XXX
        # renumber nodes so they are numbered 0..N-1
        # Actually can't do this as it loses the node attributes (zone)
        # so instead build a dictionary mapping nodeid:zone
        # so that can be written to zone file in correct order.
        # Note that then the index in nodelist of a nodeid can be used
        # as sequential node number of each node.
        ##Gsample = snap.ConvertGraph(snap.PNEANet, Gsample0, True)
        #print 'YYY',Gsample.GetIntAttrDatN(Gsample.GetRndNId(), "zone")#XXX
        Gsample = Gsample0
        nodelist = list(
        )  # keep this iteration in list so we always use same order in future
        zonedict = dict()  # map nodeid : zone
        for node in Gsample.Nodes():
            nodelist.append(node.GetId())
            zonedict[node.GetId()] = Gsample.GetIntAttrDatN(
                node.GetId(), "zone")
        print time.time() - start, 's'

        snap.PrintInfo(Gsample)
        subgraph_filename = outputdir + os.path.sep + "subgraph" + str(
            i) + os.path.extsep + "txt"
        write_graph_file(subgraph_filename, Gsample, nodelist)
        subzone_filename = outputdir + os.path.sep + "subzone" + str(
            i) + os.path.extsep + "txt"
        write_zone_file(subzone_filename, Gsample, nodelist, zonedict)
        subactor_filename = outputdir + os.path.sep + "subactor" + str(
            i) + os.path.extsep + "txt"
        # TODO get actor attributes
        #write_subactors_file(subactor_filename, Gsample, nodelist)

        # format of sampledesc file is:
        # N subzone_filename subgraph_filename subactor_filename
        sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt"
        sampledesc_f.write("%d %s %s %s\n" %
                           (Gsample.GetNodes(), subzone_filename,
                            subgraph_filename, subactor_filename))

    sampledesc_f.close()
Beispiel #23
0
import snap
import time

#from utils.network_utils import get_num_elem_per_mode

# Load a saved multimodal network (TMMNet), flatten all of its crossnets into
# a single network, and report how long the conversion takes.
filename = "Graphs/oldMinerNewSNAP.graph"
FIn = snap.TFIn(filename)
Graph = snap.TMMNet.Load(FIn)

print('Modes: %d' % Graph.GetModeNets())
print('Link types: %d' % Graph.GetCrossNets())

# Collect the id of every crossnet so that all of them are included in the
# conversion below.
crossnetids = snap.TInt64V()
crossneti = Graph.BegCrossNetI()
while crossneti < Graph.EndCrossNetI():
    crossnetids.Add(crossneti.GetCrossId())
    crossneti.Next()

# Empty attribute mappings are passed to ToNetwork().
nodeattrmapping = snap.TIntStrStrTr64V()
edgeattrmapping = snap.TIntStrStrTr64V()
start_time = time.time()
DirectedNetwork = Graph.ToNetwork(crossnetids, nodeattrmapping,
                                  edgeattrmapping)
end_time = time.time()
print("Converting to TNEANet  takes %s seconds" % (end_time - start_time))

snap.PrintInfo(DirectedNetwork, "Python type PNEANet", "output.txt", False)

# Show the statistics that PrintInfo() wrote to output.txt.  The previous
# `map(lambda ...)` expression discarded its result (and under Python 3 never
# even ran the lambda, because map is lazy) and left the file handle open.
with open("output.txt") as info_file:
    info_lines = [line.replace("\n", "") for line in info_file]
for info_line in info_lines:
    print(info_line)
# print 'Loading Boards'
# boardfile = open('../data/boards.tsv')
# board2user  = {}
# user2boards = defaultdict(list)
# for line in boardfile:
#     board_id, board_name, board_description, user_id, board_create_time = line.split('\t')
#     board2user[board_id] = (user_id,board_name)
#     user2boards[int(user_id)].append((board_id,board_name))


# Load a directed graph from an edge list (source ids in column 0,
# destination ids in column 1).
graph = snap.LoadEdgeList(snap.PNGraph, '../graphs/firstMillionGraph.txt', 0, 1)

# The doubled spaces reproduce the output of the original comma-separated
# Python 2 print statements while also running under Python 3.
print("This Graph has  %s  nodes" % graph.GetNodes())
print("This Graph has  %s  edges" % graph.GetEdges())

# The graph is loaded as a PNGraph above, so label it as such (the label
# previously claimed PNEANet).
snap.PrintInfo(graph, "Python type PNGraph")

# Node with the largest in-degree.
n = graph.GetNI(snap.GetMxInDegNId(graph))

print('Max in degree node: %s' % n.GetId())
print('In degree:  %s' % n.GetInDeg())
print('Out degree: %s' % n.GetOutDeg())

# print user2boards[int(n.GetId())
print('Calculating Page Rank')
PRankH = snap.TIntFltH()
snap.GetPageRank(graph, PRankH)

# (node id, PageRank score) pairs.  NOTE: despite its name, `betw` holds
# PageRank scores, not betweenness; the name is kept in case code outside
# this view still refers to it.
betw = [(node_id, PRankH[node_id]) for node_id in PRankH]
Beispiel #25
0
import snap
# Read the space-separated edge list (node ids in columns 0 and 1) into an
# undirected graph and print its summary statistics.
Graph = snap.LoadEdgeList(
    snap.PUNGraph,
    "facebook_combined.txt",
    0,
    1,
    ' ',
)
snap.PrintInfo(Graph, "Facebook Data Set")

# Take a random subgraph (second argument: 10) and dump it.
SubGraph = snap.GetRndSubGraph(Graph, 10)
SubGraph.Dump()
import snap
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path
import sys

# Print summary statistics for the graph file named on the command line.
# Fail with a usage message instead of an IndexError traceback when the
# file argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s <graph-file>' % sys.argv[0])

gfile = sys.argv[1]
print('Printing summary stats for file at:', gfile)

# '.graph' files are loaded as saved binary TUNGraph objects; anything else
# is read as a text edge list (node ids in columns 0 and 1).
if gfile.endswith('.graph'):
    FIn = snap.TFIn(gfile)
    Network = snap.TUNGraph.Load(FIn)
else:
    Network = snap.LoadEdgeList(snap.PUNGraph, gfile, 0, 1)

snap.PrintInfo(Network)
print('Edges:', snap.CntUniqUndirEdges(Network))

# GetInDegCnt fills DegToCntV with (degree, node count) pairs.  It counts
# in-degree, which matters for directed graphs; for the undirected graph
# loaded here it should be the same as the plain degree.
DegToCntV = snap.TIntPrV()
snap.GetInDegCnt(Network, DegToCntV)
print('Nodes with deg > 10',
      sum(item.GetVal2() for item in DegToCntV if item.GetVal1() > 10))

# NOTE(review): the second argument is presumably the number of sampled
# nodes used to estimate the coefficient - confirm against the snap docs.
ClustCoeff = snap.GetClustCf(Network, 10000)
print('Clustering coeff', ClustCoeff)
def _recode_categorical(patdata, colnames, attr):
    """
    Replace the values of categorical column attr with integer codes, in place.

    patdata  - dict mapping patent id to its list of attribute values
    colnames - dict mapping attribute name to its index in those lists
    attr     - name of the attribute to recode (e.g. 'COUNTRY')

    Also writes the number of 'NA' values left in the column to stdout.
    """
    col = colnames[attr]
    # The column cannot be sliced out of the dict directly (this was tried
    # and gives "TypeError: unhashable type"), so fix the key order once and
    # use it to match each returned code back to its patent.
    patids = list(patdata)
    codes = convert_to_int_cat([patdata[patid][col] for patid in patids])
    for patid, code in zip(patids, codes):
        patdata[patid][col] = code
    na_count = [patdata[patid][col] for patid in patids].count('NA')
    sys.stdout.write('There are %d NA for %s\n' % (na_count, attr))


def main():
    """
    Convert the NBER patent citation data into graph and attribute files.

    See usage message in module header block.

    Takes one positional argument (the data directory) and an optional -d
    flag, which restricts the graph to the subgraph induced by the patents
    that have attribute data.  Writes patent_citations.txt,
    patent_binattr.txt, patent_catattr.txt, patent_contattr.txt and
    nodeid.txt to the current directory.

    usage() is expected to terminate the program; this function does not
    return early after calling it.
    """
    get_subgraph = False  # if True discard nodes without attribute data
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d")
    except getopt.GetoptError:
        # Only an option-parsing failure means "bad command line"; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        usage(sys.argv[0])
    for opt, _ in opts:
        if opt == "-d":
            get_subgraph = True
        else:
            usage(sys.argv[0])

    if len(args) != 1:
        usage(sys.argv[0])

    data_dir = args[0]

    outputdir = '.'

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    (G, patdata, colnames) = load_nber_patent_data(data_dir)
    print('%s s' % (time.time() - start))

    snap.PrintInfo(G)

    # Remove loops (self-edges).
    # There is actually for some reason one loop (patent id 5489070).
    # G is a PNGraph so multiple edges not allowed in this type anyway.
    snap.DelSelfEdges(G)
    snap.PrintInfo(G)

    # We do not add attributes to nodes as SNAP node attributes as
    # these seem to get lost by various operations including subgraph
    # that we need to use, so instead maintain them just in the
    # dictionary mapping the original node ids to the attributes -
    # fortunately the original node ids are maintained by
    # GetSubGraph() so we can use these to index the patdata
    # dictionary in the subgraphs.

    # Convert the categorical columns to integer codes (like factor in R).
    for attr in ('COUNTRY', 'POSTATE'):
        _recode_categorical(patdata, colnames, attr)

    # There are 3774768 unique patent identifiers in the citation data but
    # only 2923922 unique patent identifiers in the patent data (patdata).
    # The size of the set intersection of these patent ids is 2755865
    # i.e. there is patent data for 73% of the patents in the citation network.
    # Presumably this is because the patdata (pat63_99.txt) contains all
    # utility patents in the period 1963 to 1999 but the citation data
    # cit75_99.txt contains all US patent citations for utility patents
    # granted in the period 1975 to 1999, so there are patent ids in here
    # from earlier periods that are cited by patents in that period,
    # for which therefore we don't have the patent data (prior to 1963).
    # So we have to set the data for all patents in network that we have it
    # for, and the rest (27%) to NA.

    # keep the iteration below in list so we always use same order in future
    nodelist = []

    if get_subgraph:
        # get subgraph induced by nodes that have patent data in the
        # pat63_99.txt file
        nodeVec = snap.TIntV()  # nodelist in TIntV format for use in SNAP
        for node in G.Nodes():
            patid = node.GetId()
            if patid in patdata:
                nodelist.append(patid)
                nodeVec.Add(patid)
        G = snap.GetSubGraph(G, nodeVec)
        print('Subgraph with only nodes with patent attribute data:')
        snap.PrintInfo(G)
    else:
        # keep all the graph and just put NA for all data attributes
        citepatent_count = 0
        patentdata_count = 0
        for node in G.Nodes():
            citepatent_count += 1
            patid = node.GetId()
            nodelist.append(patid)
            if patid in patdata:
                patentdata_count += 1
            else:
                patdata[patid] = len(colnames) * ["NA"]
                patdata[patid][
                    colnames['HASDATA']] = 0  # no data on this patent
        # An empty graph would otherwise divide by zero here.
        if citepatent_count:
            coverage = 100 * float(patentdata_count) / citepatent_count
        else:
            coverage = 0.0
        sys.stdout.write(
            "There are %d unique cited/citing patents of which %d (%f%%) have patent data\n"
            % (citepatent_count, patentdata_count, coverage))

    graph_filename = os.path.join(
        outputdir, "patent_citations" + os.path.extsep + "txt")
    write_graph_file(graph_filename, G, nodelist)
    attributes_binary_filename = os.path.join(
        outputdir, "patent_binattr" + os.path.extsep + "txt")
    attributes_categorical_filename = os.path.join(
        outputdir, "patent_catattr" + os.path.extsep + "txt")
    attributes_continuous_filename = os.path.join(
        outputdir, "patent_contattr" + os.path.extsep + "txt")

    write_attributes_file_binary(attributes_binary_filename, G, nodelist,
                                 patdata, colnames)
    write_attributes_file_categorical(attributes_categorical_filename, G,
                                      nodelist, patdata, colnames)
    write_attributes_file_continuous(attributes_continuous_filename, G,
                                     nodelist, patdata, colnames)

    nodeid_filename = os.path.join(
        outputdir, "nodeid" + os.path.extsep + "txt")
    write_subgraph_nodeids(nodeid_filename, nodelist)
Beispiel #28
0
    metadata = {}
    metadata['number_of_patents'] = len(patents)  # Number of primary patents

    Graph = snap.PUNGraph.New()

    # Merge list of patents from this company and external patents they cite
    citation_map = {}
    patent_set = set()
    for patent in patents:
        patent_set.add(patent)  # This patent
        patent_set.update(citation_cache[patent])  # This patent's citations
        citation_map[patent] = citation_cache[patent]

    patent_nid_map = {}
    # Add all nodes to graph
    for i, patent in enumerate(patent_set):
        patent_nid_map[patent] = i
        Graph.AddNode(i)

    # Add all backward citation edges
    for patent, citations in citation_map.iteritems():
        for cite in citations:
            Graph.AddEdge(patent_nid_map[patent], patent_nid_map[cite])

    snap.PrintInfo(Graph)
    with open(out_folder + '%s.json' % company_name, 'w') as fp:
        json.dump(metadata, fp, sort_keys=True, indent=4)
    snap.SaveEdgeList(Graph, out_folder + '{}.txt'.format(company_name), \
                        "Backward citation network for company, drawn from patent data")
    print "Saved data for {}".format(company_name)
            '\n')
    f.close()
    print 'finished writing pagerank components values'


def component_distribution(g):
    """
    Write the weakly-connected-component size distribution of g to a file.

    g - SNAP graph passed to snap.GetWccSzCnt()

    Creates (or overwrites) component_distribution.txt in the current
    directory with a header line followed by one line per
    (component size, number of components) pair.  Progress messages are
    printed to stdout.
    """
    print('executing component distribution --- getting components')
    ComponentDist = snap.TIntPrV()
    snap.GetWccSzCnt(g, ComponentDist)
    # The with-block closes the file even if a write fails part way through.
    with open('component_distribution.txt', 'w') as f:
        f.write("Size  - Number of Components:\n")
        for comp in ComponentDist:
            f.write("% d \t % d\n" % (comp.GetVal1(), comp.GetVal2()))
    print('finshed componet distribution')


# Write the summary statistics of g to info-pngraph.txt.
snap.PrintInfo(g, "Python type PNGraph", "info-pngraph.txt", False)

# One child process per analysis, each given the graph as its only argument.
pr, cd, prc = [
    Process(target=job, args=(g, ))
    for job in (pagerank, component_distribution, pageRank_components)
]

# Start every worker before waiting on any of them, then wait for all three
# in the same order.
for worker in (pr, cd, prc):
    worker.start()
for worker in (pr, cd, prc):
    worker.join()
Beispiel #30
0
 def printGStats(self):
     """Write snap.PrintInfo() output for self.G to statDir + graphName + '-info.txt'."""
     info_path = statDir + self.graphName + "-info.txt"
     snap.PrintInfo(self.G, self.graphName, info_path, False)