Example #1
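All of the snippets on this page exercise networkx.algorithms.bipartite.clustering and are shown without their imports; a minimal sketch of what they assume (the nose helpers apply only to the older assert_equal-style examples, pytest only to the newer ones):

import networkx as nx
from networkx.algorithms import bipartite

from nose.tools import assert_equal, raises  # older, nose-era snippets
import pytest                                # newer snippets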
def test_star_graph():
    G = nx.star_graph(3)
    # all modes are the same
    answer = {0: 0, 1: 1, 2: 1, 3: 1}
    assert bipartite.clustering(G, mode="dot") == answer
    assert bipartite.clustering(G, mode="min") == answer
    assert bipartite.clustering(G, mode="max") == answer
Example #2
def test_path_graph():
    G = nx.path_graph(4)
    answer = {0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5}
    assert_equal(bipartite.clustering(G, mode='dot'), answer)
    assert_equal(bipartite.clustering(G, mode='max'), answer)
    answer = {0: 1, 1: 1, 2: 1, 3: 1}
    assert_equal(bipartite.clustering(G, mode='min'), answer)
def test_star_graph():
    G = nx.star_graph(3)
    # all modes are the same
    answer = {0: 0, 1: 1, 2: 1, 3: 1}
    assert_equal(bipartite.clustering(G, mode='dot'), answer)
    assert_equal(bipartite.clustering(G, mode='min'), answer)
    assert_equal(bipartite.clustering(G, mode='max'), answer)
Example #6
def test_path_graph():
    G = nx.path_graph(4)
    answer = {0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5}
    assert bipartite.clustering(G, mode="dot") == answer
    assert bipartite.clustering(G, mode="max") == answer
    answer = {0: 1, 1: 1, 2: 1, 3: 1}
    assert bipartite.clustering(G, mode="min") == answer
def analysis(graph, seed, calc_nrd=True, calc_ncc=True, calc_depth=True):
    """
    Compute per-node statistics (BFS depth from the seed, node redundancy,
    bipartite clustering), store them as node attributes, and return the graph.
    """

    logging.info('Computing Statistics')

    depth, nrd, ncc = {}, {}, {}
    if calc_depth:
        depth = _bfs_depth(graph, seed)
    if calc_nrd:
        nrd = bipartite.node_redundancy(graph)
    if calc_ncc:
        ncc = bipartite.clustering(graph, mode='min')

    for node_id in graph.nodes():  # avoid shadowing the builtin id
        node = graph.node[node_id]  # NetworkX 1.x node-attribute access
        if calc_depth:
            node['depth'] = depth[node_id]
        if calc_nrd:
            node['nrd'] = nrd[node_id]
        if calc_ncc:
            node['ncc'] = ncc[node_id]
        graph.node[node_id] = node

    return graph
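A hedged usage sketch for analysis(): _bfs_depth is project-internal, so depth is skipped here, and the graph.node[...] attribute access pins the function to NetworkX 1.x:

G = nx.complete_bipartite_graph(2, 3)
G = analysis(G, seed=None, calc_depth=False)
print(G.node[0])  # e.g. {'nrd': 1.0, 'ncc': 1.0}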
def calculate_centrality(fp, centrality_type, perm_maps):
    print '%s : start to read %s.txt '%(centrality_type, fp)
    g = nx.Graph()
    i_t = 100000
    i_i = 0
    p = 0
    f = codecs.open('./txt_critical_perms/apps_file/%s.txt'%(fp), 'r', encoding='utf-8')
    l = f.readline()
    l = f.readline()
    while l:
        p, i_i = p_percent(p, i_i, i_t, 10)
        ls = l.split('\t')
        app_id = ls[0].strip().lower()
        perm_id = ls[1].strip().lower()
        g.add_node(app_id, bipartite=0) # top
        g.add_node(perm_id, bipartite=1) # bottom
        g.add_edge(app_id, perm_id)
        l = f.readline()
    is_connect = nx.is_connected(g)
    print u'end read: %s'%(fp), is_connect
    # bottom (permission) and top (app) node sets
    #node_data, node_app = bipartite.sets(g)
    node_data = set(n for n, d in g.nodes(data=True) if d['bipartite'] == 1)
    node_app = set(g) - node_data
    ## centrality degree
    if centrality_type == 'degree':
        try:
            centrality = bipartite.degree_centrality(g, node_data)
            result = get_centrality_out(fp, node_data, node_app,  centrality, centrality_type, perm_maps)
            return result, is_connect
        except Exception as e:
            print '**** error in centrality : %s : %s'%(centrality_type, fp), e
    ## centrality closeness
    if centrality_type == 'closeness':
        try:
            centrality = bipartite.closeness_centrality(g, node_app, normalized=False)
            result = get_centrality_out(fp, node_data, node_app,  centrality, centrality_type, perm_maps)
            return result, is_connect
        except Exception as e:
            print '**** error in centrality : %s : %s'%(centrality_type, fp), e
    ## centrality betweenness
    if centrality_type == 'betweenness':
        try:
            centrality = bipartite.betweenness_centrality(g, node_app)
            result = get_centrality_out(fp, node_data, node_app,  centrality, centrality_type, perm_maps)
            return result, is_connect
        except Exception as e:
            print '**** error in centrality : %s : %s'%(centrality_type, fp), e
    if centrality_type == 'clustering':
        try:
            centrality = bipartite.clustering(g, node_data, mode='dot')
            result = get_centrality_out(fp, node_data, node_app,  centrality, centrality_type, perm_maps)
            return result, is_connect
        except Exception as e:
            print '**** error in centrality : %s : %s'%(centrality_type, fp), e
Example #12
def test_bad_mode():
    with pytest.raises(nx.NetworkXError):
        bipartite.clustering(nx.path_graph(4), mode="foo")
Example #13
def write_developer_contrib_df(fname='data/developer_contributions_df.csv'):
    ids = utils.UniqueIdGenerator()
    peps = [pep for pep in get_peps() if pep.created is not None]
    connectivity = utils.load_result_pkl(connectivity_file)
    centrality = utils.load_result_pkl(centrality_file)
    networks_gen = networks_by_year()
    skip = next(networks_gen)
    networks = list(networks_gen)
    years = range(1992, 2015)
    devs_by_year = get_developers_by_years(networks=networks)
    with open(fname, 'wb') as f:
        out = csv.writer(f)
        out.writerow([
            'id', 'year', 'dev', 'has_written_peps', 'has_written_acc_peps',
            'is_delegate', 'peps_this_year', 'total_peps',
            'accepted_peps_year', 'total_accepted_peps',
            'degree', 'contributions_sc', 'contributions_edits',
            'contributions_added', 'contributions_deleted',
            'collaborators', 'knum', 'aknum', 'top', 'top2',
            'tenure', 'betweenness', 'closeness', 'degree_cent',
            'file_mean_degree', 'clus_sq', 'clus_dot', 'clus_red',
        ])
        for year, G in zip(years, networks):
            print("Analyzing {}".format(G.name))
            bdfl_delegates = get_delegates_by_year(year, peps=peps)
            peps_this_year = peps_by_developer_that_year(year, peps=peps)
            peps_until_year = peps_by_developer_until_year(year, peps=peps)
            acc_peps_this_year = accepted_peps_by_developer_that_year(year, peps=peps)
            acc_peps_until_year = accepted_peps_by_developer_until_year(year, peps=peps)
            top = get_developers_top_connectivity_by_year(G, year,
                                                          connectivity=connectivity)
            top2 = get_developers_top_connectivity_by_year_new(G, year,
                                                               connectivity=connectivity)
            devs = devs_by_year[year]
            tenure = compute_tenure_by_year(year, networks=networks)
            k_num = connectivity[year]['k_num']
            bet = normalize(centrality[year]['bet'])
            clos = normalize(centrality[year]['clos'])
            deg = normalize(centrality[year]['deg'])
            clus_sq = nx.square_clustering(G)
            clus_dot = bp.clustering(G)
            clus_red = bp.node_redundancy(G)
            for dev in devs:
                out.writerow([
                    ids[dev],
                    year,
                    dev.encode('utf8'),
                    1 if dev in peps_until_year else 0, # developer has written at least a pep
                    1 if dev in acc_peps_until_year else 0, # developer has written at least an acc. pep
                    1 if dev in bdfl_delegates else 0, # developer has been BDFL delegate
                    peps_this_year[dev] if dev in peps_this_year else 0, # peps written this year
                    peps_until_year[dev] if dev in peps_until_year else 0, # peps written until this year
                    acc_peps_this_year[dev] if dev in acc_peps_this_year else 0, # peps acc. this year
                    acc_peps_until_year[dev] if dev in acc_peps_until_year else 0, # total peps acc.
                    len(G[dev]), #G.degree(dev, weight=None),
                    G.degree(dev, weight='weight'), # lines of code added plus deleted
                    G.degree(dev, weight='edits'), # number files edit
                    G.degree(dev, weight='added'), # lines of code added
                    G.degree(dev, weight='deleted'), # lines of code removed
                    second_order_nbrs(G, dev), # second order neighbors
                    k_num[dev][0], # k-component number
                    k_num[dev][1], # Average k-component number
                    1 if dev in top else 0, # top connectivity level
                    1 if dev in top2 else 0, # top 2 connectivity level
                    tenure[dev],
                    bet[dev],
                    clos[dev],
                    deg[dev],
                    sum(len(G[n]) for n in G[dev]) / float(len(G[dev])),
                    clus_sq[dev],
                    clus_dot[dev],
                    clus_red[dev],
                ])
Example #15
ax.xaxis.set_ticks_position('bottom')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
#plt.show()

# ADD PLOTS OF DEGREE DISTRIBUTION CONSIDERING EACH EDGE TYPE (3 PLOTS)

# Connectedness
numSCC = nx.number_strongly_connected_components(network)
numWCC = nx.number_weakly_connected_components(network)

# Clustering
# No C3 clustering by definition of bipartite, elaborate and explain C4 during talk
cluster1 = nx.square_clustering(
    network)  # No clustering because edges only go from users to designs
cluster2 = bipartite.clustering(
    network)  # No clustering because edges only go from users to designs

# Centrality Measures
# Do these measures factor in directedness?
closeness_centrality = bipartite.closeness_centrality(network, users)
avg_closeness_centrality = (sum(closeness_centrality.values()) /
                            len(closeness_centrality))

degree_centrality = bipartite.degree_centrality(network, users)
avg_degree_centrality = sum(degree_centrality.values()) / len(degree_centrality)
        # sourceIpSet.add(srcIp)
        # destIpSet.add(destIp)
        #
        # ipMap[srcIp].add(destIp)
        edgeArr.append((srcIp, destIp))

    # for key in ipMap.keys():
    #     if(len(ipMap[key]) > 1):
    #         print("{},{}".format(key,len(ipMap[key])))

    G.add_edges_from(edgeArr)

    print('Edges added for day = ' + str(dayVal))

    arr = bipartite.clustering(G)
    for node in arr.keys():
        coefficient = arr[node]
        if coefficient not in coefficientMap:
            coefficientMap[coefficient] = []

        coefficientMap[coefficient].append(node)

    print('Clustering done for day = ' + str(dayVal))

    for el in coefficientMap:
        if el not in glocalCoeffChangeMap:
            glocalCoeffChangeMap[el] = []

        glocalCoeffChangeMap[el].append(len(coefficientMap[el]))
writeFile = open("../dataFiles/bipartiteClusteringDayWise.csv", "w")
for dayVal in range(1, 16):
    dataFile = open("../dataFiles/sipscan-" + str(dayVal))
    print("Parsing Day " + str(dayVal) + " data")
    dayMap[dayVal] = {}
    edgeArr = []
    graph = nx.Graph()
    for line in dataFile:
        fields = line.split(",")
        srcIp = fields[1]
        destIp = fields[-2]

        edgeArr.append((srcIp, destIp))

    graph.add_edges_from(edgeArr)
    clusterVal = bipartite.clustering(graph, mode="dot")
    print("Clustering Done for day = ", dayVal)
    for el in clusterVal:
        # if el not in ipMap:
        #     ipMap[el] = []
        #
        # ipMap[el].append(clusterVal[el])

        writeFile.write("{},{},{}".format(str(dayVal), str(el),
                                          clusterVal[el]))
        writeFile.write("\n")

    writeFile.flush()
    print("Writing Done for day = ", dayVal)

# for el in ipMap:
Example #18
@raises(nx.NetworkXError)
def test_not_bipartite():
    bipartite.clustering(nx.complete_graph(4))
def bipartite_analysis(members, prods, graph):
    print bipartite.density(graph, members)
    print bipartite.density(graph, prods)
    return bipartite.clustering(graph, members)
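A hedged usage sketch for bipartite_analysis on a toy graph (Python 2, matching its print statements; the member/product split relies on the bipartite node attribute that complete_bipartite_graph sets):

graph = nx.complete_bipartite_graph(2, 3)
members = set(n for n, d in graph.nodes(data=True) if d['bipartite'] == 0)
prods = set(graph) - members
member_clustering = bipartite_analysis(members, prods, graph)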
Example #20
@raises(nx.NetworkXError)
def test_bad_mode():
    bipartite.clustering(nx.path_graph(4), mode='foo')
Example #21
def build_survival_data_frame(fname=survival_file):
    nan = float('nan')
    ids = utils.UniqueIdGenerator()
    connectivity = utils.load_result_pkl(connectivity_file)
    centrality = utils.load_result_pkl(centrality_file)
    peps = [pep for pep in get_peps() if pep.created is not None]
    networks = list(networks_by_year())
    devs = get_developers_by_years(networks=networks)
    skip = networks.pop(0) # skip 1991
    G_start = networks.pop(0) # start with 1992
    devs_start = set(n for n, d in G_start.nodes(data=True) if d['bipartite']==1)
    years = range(1993, 2015)
    with open(fname, 'wb') as f:
        out = csv.writer(f)
        out.writerow([
            'id', 'dev', 'period', 'rstart', 'rstop', 'status',
            'has_written_peps', 'has_written_acc_peps',
            'peps_this_year', 'total_peps',
            'accepted_peps_year', 'total_accepted_peps',
            'biconnected', 'top', 'tenure', 'colaborators',
            'knum', 'aknum', 'clus_sq', 'clus_dot', 'clus_red',
            'degree', 'contributions', 'dcentrality',
            'betweenness', 'closeness',
        ])
        previous_devs = devs_start
        previous_year = 1992
        previous_G = G_start
        for i, (year, G) in enumerate(zip(years, networks)):
            print("processing year {}".format(previous_year))
            these_devs = devs[year]
            remaining_devs = get_all_remaining_devs(devs, years[i:])
            top_devs = get_developers_top_connectivity(
                connectivity[previous_year]['k_components'], 
                previous_devs)
            tenure = compute_tenure_by_year(previous_year)
            bet = normalize(centrality[previous_year]['bet'])
            clos = normalize(centrality[previous_year]['clos'])
            deg = normalize(centrality[previous_year]['deg'])
            clus_sq = nx.square_clustering(previous_G)
            clus_dot = bp.clustering(previous_G)
            clus_red = bp.node_redundancy(previous_G)
            peps_this_year = peps_by_developer_that_year(previous_year, peps=peps)
            peps_until_year = peps_by_developer_until_year(previous_year, peps=peps)
            acc_peps_this_year = accepted_peps_by_developer_that_year(previous_year, peps=peps)
            acc_peps_until_year = accepted_peps_by_developer_until_year(previous_year, peps=peps)
            for dev in previous_devs:
                out.writerow([
                    ids[dev], # developer numerical ID
                    dev.encode('utf8'), # developer name
                    i + 1, # period
                    i, # start
                    i + 1, # stop
                    0 if dev in remaining_devs else 1, # status (censored)
                    1 if dev in peps_until_year else 0, # developer has written at least a pep
                    1 if dev in acc_peps_until_year else 0, # developer has written at least an acc. pep
                    peps_this_year[dev] if dev in peps_this_year else 0, # peps written this year
                    peps_until_year[dev] if dev in peps_until_year else 0, # peps written until this year
                    acc_peps_this_year[dev] if dev in acc_peps_this_year else 0, # peps acc. this year
                    acc_peps_until_year[dev] if dev in acc_peps_until_year else 0, # total peps acc.
                    0 if connectivity[previous_year]['k_num'][dev][0] < 2 else 1,#biconnected
                    0 if dev not in top_devs else 1, # member of the top connectivity level
                    tenure[dev], # tenure in years
                    second_order_nbrs(previous_G, dev), # collaborators
                    connectivity[previous_year]['k_num'].get(dev, (nan,nan))[0], # knum
                    connectivity[previous_year]['k_num'].get(dev, (nan,nan))[1], # aknum
                    clus_sq.get(dev, nan),
                    clus_dot.get(dev, nan),
                    clus_red.get(dev, nan),
                    previous_G.degree(dev), # degree
                    previous_G.degree(dev, weight='weight'), # contributions
                    deg.get(dev, nan),
                    bet.get(dev, nan),
                    clos.get(dev, nan),
                ])
            previous_devs = these_devs
            previous_year = year
            previous_G = G
Example #22
    def clustering(self):
        self.clustering_dict = bi.clustering(self.G)
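This is a method excerpt; a minimal self-contained sketch of a class it could belong to (the class name and constructor are assumptions, and bi is the snippet's alias for the bipartite module):

import networkx as nx
import networkx.algorithms.bipartite as bi

class BipartiteStats(object):  # hypothetical wrapper class
    def __init__(self, G):
        self.G = G
        self.clustering_dict = None

    def clustering(self):
        self.clustering_dict = bi.clustering(self.G)

stats = BipartiteStats(nx.star_graph(3))
stats.clustering()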
writeFile = open("../dataFiles/bipartiteClusteringDayWise.csv","w")
for dayVal in range(1,16):
    dataFile = open("../dataFiles/sipscan-"+str(dayVal))
    print("Parsing Day "+str(dayVal)+" data")
    dayMap[dayVal] = {}
    edgeArr = []
    graph = nx.Graph()
    for line in dataFile:
        length = len(line.split(","))
        srcIp = line.split(",")[1]
        destIp = line.split(",")[length -2]

        edgeArr.append((srcIp,destIp))

    graph.add_edges_from(edgeArr)
    clusterVal = bipartite.clustering(graph,mode="dot")
    print("Clustering Done for day = ",dayVal)
    for el in clusterVal:
        # if el not in ipMap:
        #     ipMap[el] = []
        #
        # ipMap[el].append(clusterVal[el])

        writeFile.write("{},{},{}".format(str(dayVal),str(el),clusterVal[el]))
        writeFile.write("\n")

    writeFile.flush()
    print("Writing Done for day = ",dayVal)

# for el in ipMap:
#     writeFile.write("{},{}".format(str(el),"##".join([str(x) for x in ipMap[el]])))
Example #24
    for line in dataFile:
        fields = line.split(",")
        if len(fields) != 8:
            continue
        srcIp = fields[1]
        destIp = fields[-2]
        edgeArr.append((srcIp, destIp))

    print('File read, now creating graph for day = ', dayVal)

    G.add_edges_from(edgeArr)
    print('Edges created')

    # redundancyMap = bipartite.node_redundancy(G)

    redundancyMap = bipartite.clustering(G)
    # print(redundancyMap)

    print('Redundancy done for Day = ', dayVal)

    valueMap = {}
    for el in redundancyMap:
        value = redundancyMap[el]

        if value not in valueMap:
            valueMap[value] = 0

        valueMap[value] += 1

    for el in valueMap:
        writeFile.write("{},{},{}".format(dayVal, el, valueMap[el]))
Example #26
def test_not_bipartite():
    with pytest.raises(nx.NetworkXError):
        bipartite.clustering(nx.complete_graph(4))
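The NetworkXError is expected because the measure is only defined for bipartite graphs, and complete_graph(4) contains triangles (odd cycles); a quick check:

assert not nx.is_bipartite(nx.complete_graph(4))
assert nx.is_bipartite(nx.path_graph(4))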
        print("Density ICD Nodes (Diseases): " +
              str(bipartite.density(G, nodes_0)))
        print("\n")
        print("Density ATC Nodes (Active Substances): " +
              str(bipartite.density(G, nodes_1)))
        print("\n")
        print('Calculating mean degree ...')
        print("\n")
        G_deg = nx.degree_histogram(G)
        G_deg_sum = [a * b for a, b in zip(G_deg, range(0, len(G_deg)))]
        print('average degree: {}'.format(
            sum(G_deg_sum) / G.number_of_nodes()))
        print("\n")
        print('Calculating mean clustering ...')
        print("\n")
        cluster_g = bipartite.clustering(G)
        scg = sum(cluster_g.values())
        print("Average clustering %s" % str(scg / len(cluster_g)))
        print("\n")
    else:
        print("Nodes Number : " + str(GP.number_of_nodes()))
        print("\n")
        print("Edges Number : " + str(GP.number_of_edges()))
        print("\n")

        print('Calculating density ...')
        print("\n")
        components = sorted(nx.connected_components(GP), key=len, reverse=True)
        largest_component = components[0]
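The fragment breaks off after selecting the largest connected component; a hedged sketch of the likely next step, computing density on that component (GP is the graph from this branch, and the bipartite attribute value is an assumption):

        GP_largest = GP.subgraph(largest_component)
        nodes_0 = set(n for n, d in GP_largest.nodes(data=True)
                      if d.get('bipartite') == 0)
        print("Density of largest component: " +
              str(bipartite.density(GP_largest, nodes_0)))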