Esempio n. 1
0
def plot_entropy_ccdf():
    """Plot the CCDF and the CDF of the pickled normalized-entropy values.

    The values are loaded with ``read_pickle`` from
    'output/normalized_entropy.obj'. Each curve is drawn on its own figure
    and written as a PDF into the ``output`` directory.
    """
    values = read_pickle('output/normalized_entropy.obj')

    # (plotting function, extra keyword arguments, y-axis label, target file)
    curves = (
        (powerlaw.plot_ccdf, {}, "Pr(X>=e)",
         'output/normalized_entropy_distribution_ccdf.pdf'),
        (powerlaw.plot_cdf, {'color': 'r'}, "Pr(X<=e)",
         'output/normalized_entropy_distribution_cdf.pdf'),
    )

    for draw, extra, y_label, target in curves:
        # every curve gets a brand-new figure holding a single axes
        figure = plt.figure()
        axes = figure.add_subplot(111)

        draw(values, axes, label='normalized entropy', **extra)
        axes.set_xlabel("Normalized entropy e")
        axes.set_ylabel(y_label)
        plt.legend(fancybox=True, loc='lower left', ncol=1, prop={'size': 5})

        plt.tight_layout()
        plt.savefig(target)
Esempio n. 2
0
def plot_entropy_ccdf():
    """Save CCDF and CDF plots of the normalized entropy as PDF files.

    Input is the pickle 'output/normalized_entropy.obj' (loaded through
    ``read_pickle``); the two plots end up next to it in ``output``.
    """
    entropy_values = read_pickle('output/normalized_entropy.obj')
    curve_label = 'normalized entropy'
    x_caption = "Normalized entropy e"

    # --- complementary CDF, default colour ---
    ccdf_figure = plt.figure()
    ccdf_axes = ccdf_figure.add_subplot(111)
    powerlaw.plot_ccdf(entropy_values, ccdf_axes, label=curve_label)
    ccdf_axes.set_xlabel(x_caption)
    ccdf_axes.set_ylabel("Pr(X>=e)")
    plt.legend(fancybox=True, loc='lower left', ncol=1, prop={'size': 5})
    plt.tight_layout()
    plt.savefig('output/normalized_entropy_distribution_ccdf.pdf')

    # --- CDF, drawn in red on a second figure ---
    cdf_figure = plt.figure()
    cdf_axes = cdf_figure.add_subplot(111)
    powerlaw.plot_cdf(entropy_values, cdf_axes, label=curve_label, color='r')
    cdf_axes.set_xlabel(x_caption)
    cdf_axes.set_ylabel("Pr(X<=e)")
    plt.legend(fancybox=True, loc='lower left', ncol=1, prop={'size': 5})
    plt.tight_layout()
    plt.savefig('output/normalized_entropy_distribution_cdf.pdf')
Esempio n. 3
0
def plot_counts_category_distributions_ccdf():
    """Plot click-count distributions per link category and overall.

    Loads the pickled mapping ``category -> records`` from
    ``HOME + 'output/category_counts_distribution.obj'`` and writes two PDFs:

    * 'output/category_counts_distributions.pdf' -- one CCDF curve per
      category, built from the first field of every record;
    * 'output/counts_distribution.pdf' -- a log-log density histogram
      (100 bins) of the first field of the ``'counts'`` records.

    The number of records of every category is printed first.

    NOTE(review): the input path is prefixed with ``HOME`` while both output
    paths are relative to the working directory -- confirm this is intended.
    """
    category_distributions = read_pickle(
        HOME + 'output/category_counts_distribution.obj')

    for records in category_distributions.values():
        # print() with a single argument behaves the same on Python 2 and 3
        print(len(records))

    # Plot order and colour of every category, kept in one place so the two
    # cannot drift apart.
    category_colors = (
        ('lead', 'r'),
        ('infobox', 'b'),
        ('body', 'g'),
        ('left-body', 'm'),
        ('navbox', 'c'),
        ('counts', 'k'),
    )

    fig = plt.figure()
    ax = fig.add_subplot(111)

    for category, color in category_colors:
        # only the first field of each record is plotted
        data = [x[0] for x in category_distributions[category]]
        powerlaw.plot_ccdf(data, ax, label=category, color=color)
    ax.set_xlabel("Number of clicks n")
    ax.set_ylabel("Pr(X>=n)")
    plt.legend(fancybox=True, loc=3, ncol=2, prop={'size': 4})
    plt.tight_layout()
    plt.savefig('output/category_counts_distributions.pdf')

    data = [int(x[0]) for x in category_distributions['counts']]

    hist, bin_edges = np.histogram(data, 100, density=True)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # there is one more bin edge than bins; the left edges are used as x
    ax.plot(bin_edges[:-1], hist, marker='o')
    ax.set_xlabel('#Counts')
    ax.set_ylabel('#Pages')
    ax.set_yscale('log')
    ax.set_xscale('log')
    # NOTE(review): the curve above carries no label, so this legend has no
    # entries; kept to leave the generated figure unchanged.
    plt.legend(fancybox=True, loc=3, prop={'size': 4})
    plt.tight_layout()
    plt.savefig('output/counts_distribution.pdf')
def plot_counts_category_distributions_ccdf():
    """Draw per-category click-count CCDFs and the overall count histogram.

    The pickle ``HOME + 'output/category_counts_distribution.obj'`` is read
    as a mapping from category name to a sequence of records; only the first
    field of every record is used. Two files are written:

    * 'output/category_counts_distributions.pdf' -- CCDF of every category
      on shared axes;
    * 'output/counts_distribution.pdf' -- density histogram (100 bins,
      log-log axes) of the ``'counts'`` category.
    """
    category_distributions = read_pickle(
        HOME + 'output/category_counts_distribution.obj')

    # Report the size of every category. The call form of print works on
    # both Python 2 and Python 3, unlike the former print statement.
    for records in category_distributions.values():
        print(len(records))

    colors = {
        'lead': 'r',
        'infobox': 'b',
        'body': 'g',
        'left-body': 'm',
        'navbox': 'c',
        'counts': 'k',
    }
    # explicit list: fixes the drawing (and legend) order of the curves
    plot_order = ['lead', 'infobox', 'body', 'left-body', 'navbox', 'counts']

    fig = plt.figure()
    ax = fig.add_subplot(111)
    for category in plot_order:
        first_fields = [x[0] for x in category_distributions[category]]
        powerlaw.plot_ccdf(first_fields, ax, label=category,
                           color=colors[category])
    ax.set_xlabel("Number of clicks n")
    ax.set_ylabel("Pr(X>=n)")
    plt.legend(fancybox=True, loc=3, ncol=2, prop={'size': 4})
    plt.tight_layout()
    plt.savefig('output/category_counts_distributions.pdf')

    counts = [int(x[0]) for x in category_distributions['counts']]
    hist, bin_edges = np.histogram(counts, 100, density=True)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # bin_edges has len(hist) + 1 entries; plot against the left edges
    ax.plot(bin_edges[:-1], hist, marker='o')
    ax.set_xlabel('#Counts')
    ax.set_ylabel('#Pages')
    ax.set_yscale('log')
    ax.set_xscale('log')
    # NOTE(review): nothing on these axes has a label, so the legend is
    # empty; the call is kept so the output stays as before.
    plt.legend(fancybox=True, loc=3, prop={'size': 4})
    plt.tight_layout()
    plt.savefig('output/counts_distribution.pdf')
def plot_ccdf(place, points, thresholds=None):
    """
    Plot the CCDF of the asymmetry factor, one curve per minimum length.

    For every threshold the points whose length is at least that threshold
    are kept and the CCDF of their asymmetry factors is drawn. All curves
    share one figure, which is saved as './figs_dir/ccdf <name>.png' and then
    closed.

    WARNING: ``points`` is consumed in place -- the list is empty when the
    function returns. This is deliberate: it avoids holding a second copy of
    a potentially very large list. Pass ``list(points)`` to keep the data.

    :param place: tuple
        (name [string], (north_lat [float], south_lat [float], east_lon [float], west_lon [float]))
        only the name is used (for the output file name)
    :param points: list of tuples (length [float], asymmetry_factor [float])
    :param thresholds: list of floats of minimum length threshold to filter points and plot ccdf;
        processed in ascending order, defaults to
        [0, 250, 500, 1000, 1500, 3000, 4500]
    :return: None
    """
    print('     Plotting ccdf ...')
    name, _bbox = place
    if thresholds is None:
        thresholds = [0, 250, 500, 1000, 1500, 3000, 4500]
    else:
        # Each pass only sees the survivors of the previous one, which is
        # correct only for ascending thresholds -- so enforce that order.
        thresholds = sorted(thresholds)
    cmap = plt.get_cmap('Set1')
    colors = [cmap(i) for i in np.linspace(0, 1, len(thresholds))]
    fig = plt.figure()
    remaining = points
    for idx, threshold in enumerate(thresholds):
        survivors = []
        # pop() drains the list, so memory is released while filtering
        while remaining:
            point = remaining.pop()
            if point[0] >= threshold:
                survivors.append(point)
        # Raw string: '\g' and '\ ' are invalid escapes in a normal literal.
        powerlaw.plot_ccdf([factor for _length, factor in survivors],
                           color=colors[idx],
                           linewidth=1.5,
                           label=r'$Length \geq {0} \ m$'.format(threshold))
        remaining = survivors
    plt.xlabel('Asymmetry Factor', fontsize=16)
    plt.ylabel(r'$P(X \geq x)$', fontsize=16)
    plt.legend()
    plt.grid()
    plt.savefig('./figs_dir/ccdf {0}.png'.format(name), format='png', dpi=200)
    # release the figure; otherwise one figure per call stays open
    plt.close(fig)
    print('         Done!')
Esempio n. 6
0
	def plotView(self):
		"""Embed a CCDF plot of ``self.orderedFreq`` values into this widget.

		The view is cleared first, then a new Matplotlib figure is drawn on a
		Tk canvas packed into ``self``, together with a navigation toolbar.
		"""
		self.clearView()

		figure = Figure(figsize=(5, 4), dpi=100)
		axes = figure.add_subplot(111)
		# list(...) hands powerlaw a real sequence; on Python 3 values() is
		# only a view object.
		powerlaw.plot_ccdf(list(self.orderedFreq.values()), ax=axes, color='b')

		canvas = FigureCanvasTkAgg(figure, master=self)
		# draw() replaces FigureCanvasTkAgg.show(), which newer Matplotlib
		# releases no longer provide.
		canvas.draw()
		canvas.get_tk_widget().pack(side=TOP, fill=BOTH, expand=1)

		# NOTE(review): NavigationToolbar2TkAgg was renamed NavigationToolbar2Tk
		# in newer Matplotlib; the name comes from the file-level import, so it
		# is left untouched here.
		toolbar = NavigationToolbar2TkAgg(canvas, self)
		toolbar.update()
		canvas._tkcanvas.pack(side=TOP, fill=BOTH, expand=1)
def _read_cascades(path, min_size=0):
    """Parse one retweet-cascade file into per-cascade statistics.

    Every line is split as ``<root>:<item>,<item>,...``. The text before the
    first ``-`` of ``<root>`` is used as the root tweet id, and every
    ``<item>`` is read as ``<retweet id>-<influence>``.

    :param path: file to read
    :param min_size: cascades with fewer retweets than this only contribute
        to the size table; their influence and timing are not evaluated
    :return: tuple ``(sizes, inter_arrival_times, influence, influence_10m,
        influence_1h)`` -- ``sizes`` and ``influence`` are dicts keyed by
        root tweet id, the two windowed influences are ``defaultdict(int)``
        and ``inter_arrival_times`` is a flat list over all cascades
    """
    sizes = {}
    inter_arrival_times = []
    influence = {}
    influence_10m = defaultdict(int)
    influence_1h = defaultdict(int)
    with open(path, 'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            root_tweet = root_tweet.split('-')[0]
            cascades = cascades.split(',')
            retweets = [x.split('-')[0] for x in cascades]
            sizes[root_tweet] = len(retweets)
            if len(retweets) < min_size:
                continue

            influences = [int(x.split('-')[1]) for x in cascades]
            influence[root_tweet] = sum(influences)
            # presumably melt_snowflake(...)[0] is a millisecond timestamp,
            # so dividing by 1000 yields seconds -- TODO confirm
            root_timestamp = melt_snowflake(root_tweet)[0] / 1000
            timestamps = [root_timestamp]
            for retweet, reach in zip(retweets, influences):
                retweet_time = melt_snowflake(retweet)[0] / 1000
                timestamps.append(retweet_time)
                elapsed = retweet_time - root_timestamp
                if elapsed < 10 * 60:
                    influence_10m[root_tweet] += reach
                if elapsed < 60 * 60:
                    influence_1h[root_tweet] += reach

            # gaps between consecutive entries, in file order, starting at
            # the root tweet
            inter_arrival_times.extend(
                later - earlier
                for earlier, later in zip(timestamps, timestamps[1:]))
    return sizes, inter_arrival_times, influence, influence_10m, influence_1h


def main():
    """Compare sampled and complete retweet cascades and plot the result.

    Reads the sample and complete cascade files of the 'cyberbullying'
    dataset, prints summary statistics and draws a two-panel figure:
    (a) CCDF of inter-arrival times for both sets with their medians,
    (b) CCDF of the sample/complete influence ratio over three time windows.
    The figure is saved to '../images/cascades_measures.pdf' and shown
    interactively on non-Linux platforms.
    """
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    (sample_cascade_size, sample_inter_arrival_time,
     sample_cascade_influence, sample_cascade_influence_10m,
     sample_cascade_influence_1h) = _read_cascades(
         '../data/{0}_out/sample_retweet_{0}.txt'.format(app_name))
    # For the complete set, influence and timing are only evaluated for
    # cascades with at least 50 retweets.
    (complete_cascade_size, complete_inter_arrival_time,
     complete_cascade_influence, complete_cascade_influence_10m,
     complete_cascade_influence_1h) = _read_cascades(
         '../data/{0}_out/complete_retweet_{0}.txt'.format(app_name),
         min_size=50)

    print('number of cascades in the complete set', len(complete_cascade_size))
    print('number of cascades in the sample set', len(sample_cascade_size))

    print('mean complete size', np.mean(list(complete_cascade_size.values())))
    print('mean sample size', np.mean(list(sample_cascade_size.values())))

    print('complete #cascades (≥50 retweets)',
          sum(1 for x in complete_cascade_size.values() if x >= 50))
    print('sample #cascades (≥50 retweets)',
          sum(1 for x in sample_cascade_size.values() if x >= 50))

    # Cascades whose sampled size equals the complete size were captured in
    # full by the sample.
    num_complete_cascades_in_sample = 0
    complete_cascades_in_sample_size_list = []
    num_complete_cascades_in_sample_50 = 0
    for root_tweet, sample_size in sample_cascade_size.items():
        complete_size = complete_cascade_size[root_tweet]
        if sample_size == complete_size:
            num_complete_cascades_in_sample += 1
            complete_cascades_in_sample_size_list.append(complete_size)
            if complete_size >= 50:
                num_complete_cascades_in_sample_50 += 1
    print('number of complete cascades in the sample set',
          num_complete_cascades_in_sample)
    # label now matches the >= 50 condition used above
    print('number of complete cascades (≥50 retweets) in the sample set',
          num_complete_cascades_in_sample_50)
    if complete_cascades_in_sample_size_list:
        print('max: {0}, mean: {1}'.format(
            max(complete_cascades_in_sample_size_list),
            np.mean(complete_cascades_in_sample_size_list)))
    else:
        # max() raises ValueError on an empty list
        print('max: n/a, mean: n/a')

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    sample_median = np.median(sample_inter_arrival_time)
    complete_median = np.median(complete_inter_arrival_time)

    plot_ccdf(sample_inter_arrival_time,
              ax=axes[0],
              color=blue,
              ls='-',
              label='sample')
    plot_ccdf(complete_inter_arrival_time,
              ax=axes[0],
              color='k',
              ls='-',
              label='complete')

    # dashed vertical lines mark the two medians
    axes[0].plot([sample_median, sample_median], [0, 1],
                 color=blue,
                 ls='--',
                 lw=1)
    axes[0].plot([complete_median, complete_median], [0, 1],
                 color='k',
                 ls='--',
                 lw=1)

    print('\ninter_arrival_time sample median', sample_median)
    print('inter_arrival_time complete median', complete_median)

    axes[0].set_xscale('symlog')
    axes[0].set_xticks([0, 1, 100, 10000, 1000000])
    axes[0].set_yscale('linear')
    axes[0].set_xlabel('inter-arrival time (sec)', fontsize=16)
    # raw string: '\g' is an invalid escape sequence in a normal literal
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)

    # Ratio of sampled to complete influence, restricted to cascades with at
    # least 50 retweets in the complete set and a non-zero denominator.
    influence_list = []
    influence_list_10m = []
    influence_list_1h = []
    for root_tweet in sample_cascade_size:
        if complete_cascade_size[root_tweet] >= 50:
            if complete_cascade_influence[root_tweet] > 0:
                influence_list.append(sample_cascade_influence[root_tweet] /
                                      complete_cascade_influence[root_tweet])
            if complete_cascade_influence_10m[root_tweet] > 0:
                influence_list_10m.append(
                    sample_cascade_influence_10m[root_tweet] /
                    complete_cascade_influence_10m[root_tweet])
            if complete_cascade_influence_1h[root_tweet] > 0:
                influence_list_1h.append(
                    sample_cascade_influence_1h[root_tweet] /
                    complete_cascade_influence_1h[root_tweet])

    plot_ccdf(influence_list_10m, ax=axes[1], color=red, ls='-', label='10m')
    plot_ccdf(influence_list_1h, ax=axes[1], color=blue, ls='-', label='1h')
    plot_ccdf(influence_list, ax=axes[1], color='k', ls='-', label='14d')

    print('influence_list median', np.median(influence_list))
    print('influence_list_1h median', np.median(influence_list_1h))
    print('influence_list_10m median', np.median(influence_list_10m))

    # every line is labelled with the list it reports on (all six lines
    # used to say 'influence_list')
    for score in (0.25, 0.75):
        print('influence_list {0}'.format(score),
              percentileofscore(influence_list, score))
        print('influence_list_1h {0}'.format(score),
              percentileofscore(influence_list_1h, score))
        print('influence_list_10m {0}'.format(score),
              percentileofscore(influence_list_10m, score))

    axes[1].set_xscale('linear')
    axes[1].set_yscale('linear')
    axes[1].set_xlabel('relative potential reach', fontsize=16)

    # identical legend, tick and title styling for both panels
    for axis, panel_title in zip(axes, ('(a)', '(b)')):
        axis.legend(frameon=False,
                    fontsize=16,
                    ncol=1,
                    fancybox=False,
                    shadow=True,
                    loc='upper right')
        axis.tick_params(axis='both', which='major', labelsize=16)
        axis.set_title(panel_title, fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/cascades_measures.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
Esempio n. 8
0
from collections import Counter

# Empirical degree distribution: fraction of vertices having degree x, for
# every x from 0 up to the maximum degree. Counting once with Counter avoids
# rescanning the whole degree list for every x.
vertex_total = float(g.vcount())
degree_counts = Counter(degree)
dist.extend(degree_counts[x] / vertex_total
            for x in range(0, max(degree) + 1))

# log-log axes
plt.xscale('log')
plt.yscale('log')
plt.xlabel('K')
plt.ylabel('Pk')

plt.plot(dist, linestyle=(0, (1, 3)))

# Power-law fit of the degree sequence
results = pl.Fit(degree)
print("Alpha is {}".format(results.power_law.alpha))
print("Xmin is {}".format(results.power_law.xmin))

# CCDF of the degrees, drawn in red on the same axes
pl.plot_ccdf(degree, color='r')

plt.show()

print("The network diameter is {}".format(g.diameter()))

print('top betweenness')

# centrality top 10: sort ascending in place, the last ten are the largest
between = g.betweenness()
between.sort()
print(between[-10:])

print(g.vs.find(_degree=2)["id"])
# the degree list is sorted in place, so it stays sorted afterwards
degree.sort()
maxd = degree[-10:]
    weight = float(route[2][:-1])
    G.add_edge(airport1, airport2, weight=weight)

# Basic size and degree statistics of the airline graph G.
N = len(G)
L = len(G.edges())
# dict(...) accepts both the plain dict returned by networkx 1.x and the
# DegreeView of networkx >= 2; list(...) gives powerlaw a real sequence.
degrees = list(dict(nx.degree(G)).values())
kmax = max(degrees)
kmin = min(degrees)
kavg = 1.0 * sum(degrees) / len(degrees)
# str.format keeps the printed text identical on Python 2 and Python 3
print("Number of nodes: {0}".format(N))
print("Number of links: {0}".format(L))
print("Max degree: {0}".format(kmax))
print("Min degree: {0}".format(kmin))
print("Average degree: {0}".format(kavg))

# CCDF of the degrees, markers only
powerlaw.plot_ccdf(degrees, marker="o", color="b", linestyle="none")
plt.ylabel(r"Cummulative $P(k)$", fontsize=16)
plt.xlabel(r"$k$", fontsize=16)
plt.savefig("Degree distribution" + "_Airline" + ".png")
# plt.show()

"""random walk"""


def rndWalk(G, T):
    node_list = G.nodes()
    """initialization"""
    walker_path = []
    walker_path.append(random.choice(node_list))
    """let's walk!"""
    for t in range(1, T + 1):
def _save_degree_histogram(graph, direction, x_label, target, tight):
    """Save a log-log histogram of the `direction` vertex degrees."""
    hist = vertex_hist(graph, direction)
    fig, ax = plt.subplots()
    ax.set_yscale('log')
    ax.set_xscale('log')
    # hist[0] is plotted as the counts against hist[1] minus its last entry
    # (the bin edges have one more element than the counts)
    ax.plot(hist[1][:-1], hist[0], marker='o')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_ylim([1, 10**6])
    if tight:
        fig.tight_layout()
    fig.savefig(target)
    plt.close(fig)


def _save_distribution(draw, values, x_label, y_label, target,
                       y_limits=None, title=None, tight=False):
    """Draw `values` with `draw` (plot_cdf or plot_ccdf) and save the plot."""
    fig, ax = plt.subplots()
    draw(values, ax)
    if title is not None:
        ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    if y_limits is not None:
        ax.set_ylim(y_limits)
    if tight:
        fig.tight_layout()
    fig.savefig(target)
    plt.close(fig)


def _save_feature_overview(graph, features, draw, y_label, target,
                           y_limits=None):
    """Overlay the distributions of several vertex properties on one plot."""
    colors = {'local_clust': 'r', 'eigenvector_centr': 'b',
              'page_rank': 'g', 'kcore': 'm', 'hub': 'c', 'authority': 'k'}
    labels = {'local_clust': 'clust.', 'eigenvector_centr': 'eigen. centr.',
              'page_rank': 'page rank', 'kcore': 'kcore', 'hub': 'hub',
              'authority': 'authority'}
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for feature in features:
        draw(graph.vertex_properties[feature].get_array(), ax,
             label=labels[feature], color=colors[feature])
    ax.set_xlabel('Feature $f$')
    ax.set_ylabel(y_label)
    if y_limits is not None:
        ax.set_ylim(y_limits)
    ax.legend(fancybox=True, loc=3, ncol=2, prop={'size': 4})
    fig.tight_layout()
    fig.savefig(target)
    plt.close(fig)


def _plot_structure_stats(graph, prefix, tight, cdf_limits, ccdf_options):
    """Save degree histograms and per-metric CDF/CCDF plots of one graph.

    `prefix` is the common start of every output path. `tight` and
    `cdf_limits` apply to all plots / all CDF plots; `ccdf_options` maps a
    vertex property name to extra keyword arguments for its CCDF plot.
    """
    for direction, x_label, stem in (('out', 'Out-degree', 'out-deg'),
                                     ('in', 'In-degree', 'in-deg'),
                                     ('total', 'Degree', 'deg')):
        _save_degree_histogram(graph, direction, x_label,
                               '%s-%s-dist.pdf' % (prefix, stem), tight)

    # (vertex property, file name stem, x-axis label, symbol in the y label)
    metrics = (
        ('local_clust', 'clust', 'Local Clustering Coefficient C', 'C'),
        ('page_rank', 'prank', 'Page rank Pr', 'Pr'),
        ('kcore', 'kcore', 'k-Core kC', 'kC'),
        ('eigenvector_centr', 'eigenvcentr', 'Eigenvector Centrality E', 'E'),
    )
    for prop, stem, x_label, symbol in metrics:
        values = graph.vertex_properties[prop].get_array()
        _save_distribution(powerlaw.plot_cdf, values, x_label,
                           'P(x<=%s)' % symbol,
                           '%s-%s-cdf.pdf' % (prefix, stem),
                           y_limits=cdf_limits, tight=tight)
        _save_distribution(powerlaw.plot_ccdf, values, x_label,
                           'P(x>=%s)' % symbol,
                           '%s-%s-ccdf.pdf' % (prefix, stem),
                           tight=tight, **ccdf_options.get(prop, {}))


def _plot_feature_overviews(graph, prefix, cdf_features, ccdf_features):
    """Save the combined-feature CDF and CCDF plots of one graph."""
    _save_feature_overview(graph, cdf_features, powerlaw.plot_cdf,
                           '$P(X<=f)$', prefix + '-features-cdf.pdf',
                           y_limits=[0, 1])
    # The '-features-ccdf' file used to be drawn with plot_cdf and the two
    # y labels were swapped; it is a real CCDF now.
    _save_feature_overview(graph, ccdf_features, powerlaw.plot_ccdf,
                           '$P(X>=f)$', prefix + '-features-ccdf.pdf')


def plot_stats():
    """Plot structural statistics of the Wikipedia link and transition graphs.

    For each of 'output/wikipedianetwork.xml.gz' and
    'output/transitionsnetwork.xml.gz' this writes, under the prefixes
    'output/wikipedia' and 'output/wikipedia-transitions':

    * log-log out/in/total degree histograms ('-out-deg-dist.pdf', ...);
    * CDF and CCDF plots of the stored vertex properties local_clust,
      page_rank, kcore and eigenvector_centr ('-clust-cdf.pdf', ...);
    * combined CDF and CCDF plots of several features
      ('-features-cdf.pdf', '-features-ccdf.pdf').

    Changes against the previous version: print() works on Python 2 and 3;
    the combined '-features-ccdf' plots use plot_ccdf and both combined plots
    carry the matching y label; the axis-label typos ('$C', 'Eingenvector')
    are fixed; each plot uses its own figure, closed after saving, so
    figures no longer accumulate. The remaining differences between the two
    graphs (tight layout, CDF y limits, the one titled plot) are kept.
    """
    base_features = ['local_clust', 'page_rank', 'hub', 'authority', 'kcore']

    # wikipedia graph structural statistics
    print('before load')
    network = load_graph("output/wikipedianetwork.xml.gz")
    print('after load')
    _plot_structure_stats(
        network, 'output/wikipedia', tight=True, cdf_limits=[0, 1],
        ccdf_options={'local_clust': {'y_limits': [10**-4, 10**-0.5]}})
    _plot_feature_overviews(
        network, 'output/wikipedia',
        cdf_features=base_features,
        ccdf_features=['local_clust', 'eigenvector_centr', 'page_rank',
                       'hub', 'authority', 'kcore'])

    # wikipedia transitions graph structural statistics
    print('before load')
    network_transitions = load_graph("output/transitionsnetwork.xml.gz")
    print('after load')
    _plot_structure_stats(
        network_transitions, 'output/wikipedia-transitions', tight=False,
        cdf_limits=None,
        ccdf_options={
            'local_clust': {'title': 'Clustering Coefficient Distribution'}})

    # The hub/authority properties are read from the saved graph; the HITS
    # computation that used to sit between these two messages is disabled.
    print('before hits')
    print('after hits')

    _plot_feature_overviews(
        network_transitions, 'output/wikipedia-transitions',
        cdf_features=base_features, ccdf_features=base_features)
Esempio n. 11
0
# Line colours and legend labels used by the combined per-feature plots.
_FEATURE_COLORS = {
    'local_clust': 'r',
    'eigenvector_centr': 'b',
    'page_rank': 'g',
    'kcore': 'm',
    'hub': 'c',
    'authority': 'k'
}
_FEATURE_LABELS = {
    'local_clust': 'clust.',
    'eigenvector_centr': 'eigen. centr.',
    'page_rank': 'page rank',
    'kcore': 'kcore',
    'hub': 'hub',
    'authority': 'authority'
}

# (vertex_hist degree type, x-axis label, file-name stem) per degree histogram.
_DEGREE_PLOTS = (
    ('out', 'Out-degree', 'out-deg'),
    ('in', 'In-degree', 'in-deg'),
    ('total', 'Degree', 'deg'),
)


def _plot_degree_hist(graph, direction, xlabel, path, tight=False):
    """Save a log-log degree histogram of ``graph`` to ``path``.

    ``direction`` is passed to ``vertex_hist`` ("out", "in" or "total").
    The plot is drawn on the current pyplot figure, which is cleared again
    after saving.
    """
    hist = vertex_hist(graph, direction)
    axes = plt.gca()
    axes.set_yscale('log')
    axes.set_xscale('log')
    # hist[0] are the counts and hist[1] the bins; the last bin entry is
    # dropped so that both sequences have the same length.
    plt.plot(hist[1][:-1], hist[0], marker='o')
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    axes.set_ylim([1, 10**6])
    if tight:
        plt.tight_layout()
    plt.savefig(path)
    plt.clf()


def _plot_distribution(values, path, xlabel, ylabel, ccdf=False, ylim=None,
                       title=None, tight=False):
    """Save the empirical CDF (or CCDF if ``ccdf``) of ``values`` to ``path``."""
    fig, ax = plt.subplots()
    draw = powerlaw.plot_ccdf if ccdf else powerlaw.plot_cdf
    draw(values, ax)
    if title is not None:
        ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if ylim is not None:
        ax.set_ylim(ylim)
    if tight:
        fig.tight_layout()
    fig.savefig(path)
    plt.clf()


def _plot_property_pair(graph, prop, xlabel, symbol, stem, cdf_ylim=None,
                        ccdf_ylim=None, ccdf_title=None, tight=False):
    """Save ``<stem>-cdf.pdf`` and ``<stem>-ccdf.pdf`` for one vertex property.

    ``symbol`` is the short name of the quantity used in the y-axis labels
    ``P(x<=symbol)`` and ``P(x>=symbol)``.
    """
    prop_map = graph.vertex_properties[prop]
    _plot_distribution(prop_map.get_array(), stem + '-cdf.pdf', xlabel,
                       'P(x<=%s)' % symbol, ylim=cdf_ylim, tight=tight)
    _plot_distribution(prop_map.get_array(), stem + '-ccdf.pdf', xlabel,
                       'P(x>=%s)' % symbol, ccdf=True, ylim=ccdf_ylim,
                       title=ccdf_title, tight=tight)


def _plot_features(graph, names, path, ccdf=False, ylim=None):
    """Overlay the CDFs (or CCDFs) of several vertex properties in one figure."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    draw = powerlaw.plot_ccdf if ccdf else powerlaw.plot_cdf

    for name in names:
        feature = graph.vertex_properties[name]
        draw(feature.get_array(),
             ax,
             label=_FEATURE_LABELS[name],
             color=_FEATURE_COLORS[name])
    ax.set_xlabel('Feature $f$')
    # The y label follows the curve actually drawn: <= for a CDF, >= for a CCDF.
    ax.set_ylabel('$P(X>=f)$' if ccdf else '$P(X<=f)$')
    if ylim is not None:
        ax.set_ylim(ylim)
    plt.legend(fancybox=True, loc=3, ncol=2, prop={'size': 4})
    plt.tight_layout()
    plt.savefig(path)
    plt.clf()


def plot_stats():
    """Plot structural statistics of the Wikipedia link and transition graphs.

    Loads ``output/wikipedianetwork.xml.gz`` and
    ``output/transitionsnetwork.xml.gz`` and, for each graph, writes PDF
    figures into ``output/``: out-, in- and total-degree histograms on
    log-log axes, the CDF and CCDF of the ``local_clust``, ``page_rank``,
    ``kcore`` and ``eigenvector_centr`` vertex properties, and two combined
    figures overlaying several properties (CDF and CCDF).

    All plotted vertex properties, including ``hub`` and ``authority``, are
    read from the loaded graphs; nothing is computed here.  Progress
    messages are printed to stdout.
    """
    # wikipedia  graph  structural statistics
    print('before load')
    network = load_graph("output/wikipedianetwork.xml.gz")
    print('after load')

    prefix = 'output/wikipedia-'
    for direction, xlabel, stem in _DEGREE_PLOTS:
        _plot_degree_hist(network, direction, xlabel,
                          prefix + stem + '-dist.pdf', tight=True)

    _plot_property_pair(network, 'local_clust',
                        'Local Clustering Coefficient C', 'C',
                        prefix + 'clust',
                        cdf_ylim=[0, 1],
                        ccdf_ylim=[10**-4, 10**-0.5],
                        tight=True)
    _plot_property_pair(network, 'page_rank', 'Page rank Pr', 'Pr',
                        prefix + 'prank', cdf_ylim=[0, 1], tight=True)
    _plot_property_pair(network, 'kcore', 'k-Core kC', 'kC',
                        prefix + 'kcore', cdf_ylim=[0, 1], tight=True)
    _plot_property_pair(network, 'eigenvector_centr',
                        'Eigenvector Centrality E', 'E',
                        prefix + 'eigenvcentr', cdf_ylim=[0, 1], tight=True)

    _plot_features(network,
                   ['local_clust', 'page_rank', 'hub', 'authority', 'kcore'],
                   prefix + 'features-cdf.pdf',
                   ylim=[0, 1])
    _plot_features(network,
                   ['local_clust', 'eigenvector_centr', 'page_rank', 'hub',
                    'authority', 'kcore'],
                   prefix + 'features-ccdf.pdf',
                   ccdf=True)

    # wikipedia transitions  graph  structural statistics
    print('before load')
    network_transitions = load_graph("output/transitionsnetwork.xml.gz")
    print('after load')

    prefix = 'output/wikipedia-transitions-'
    for direction, xlabel, stem in _DEGREE_PLOTS:
        _plot_degree_hist(network_transitions, direction, xlabel,
                          prefix + stem + '-dist.pdf')

    _plot_property_pair(network_transitions, 'local_clust',
                        'Local Clustering Coefficient C', 'C',
                        prefix + 'clust',
                        ccdf_title='Clustering Coefficient Distribution')
    _plot_property_pair(network_transitions, 'page_rank', 'Page rank Pr',
                        'Pr', prefix + 'prank')
    _plot_property_pair(network_transitions, 'kcore', 'k-Core kC', 'kC',
                        prefix + 'kcore')
    _plot_property_pair(network_transitions, 'eigenvector_centr',
                        'Eigenvector centrality E', 'E',
                        prefix + 'eigenvcentr')

    # The "hub" and "authority" properties plotted below are read from the
    # graph file; the HITS computation is not run here.
    print('before hits')
    print('after hits')

    transition_features = [
        'local_clust', 'page_rank', 'hub', 'authority', 'kcore'
    ]
    _plot_features(network_transitions, transition_features,
                   prefix + 'features-cdf.pdf', ylim=[0, 1])
    _plot_features(network_transitions, transition_features,
                   prefix + 'features-ccdf.pdf', ccdf=True)
Esempio n. 12
0
# Plot adjacency matrix indexed by locations

# np.triu(..., 1) keeps only the entries strictly above the diagonal, so each
# non-zero entry of Z1new is found once as an (ind1, ind2) index pair.
ind1, ind2 = np.nonzero(np.triu(Z1new,
                                1))  # (row, col) indices of the non-zero entries
fig, ax = plt.subplots()
# Draw every pair twice, as (x_i, x_j) and mirrored as (x_j, x_i), in blue dots.
ax.plot(x1[ind1], x1[ind2], 'b.', x1[ind2], x1[ind1], 'b.')
ax.set(xlabel='x_i', ylabel='x_j', title='Adjacency matrix')

# Plot degree distribution.

# NOTE(review): `deg` and `ind` are defined earlier in the script (not shown
# here); presumably `ind` selects the nodes of interest - confirm.
deg = deg[ind]
# Number of entries of deg that are <= 100.
a = np.sum(deg <= 100)
# Fit on the `a` smallest degrees only, i.e. exactly those that are <= 100.
fit = pl.Fit(np.sort(deg)[0:a],
             discrete=True)  # fit power law to low degrees? very empirical
figCCDF = pl.plot_ccdf(deg, label='alpha=10')
figCCDF.set(xlabel='degree',
            ylabel='distribution',
            title='Double power law degree distribution')

# TODO: add lines of power law. This does not work.

# Reference curves y**(-sigma) on [1, 100] and y**(-tau) on [100, 1000];
# `sigma` and `tau` are defined elsewhere in the script.
y = np.linspace(1, 100, 100)
plt.plot(y, y**(-sigma))
y = np.linspace(100, 1000, 1000)
plt.plot(y, y**(-tau))
#fit.plot_ccdf(color='r', linewidth=2, ax=figCCDF)
#fit.power_law.plot_ccdf(color='r', linestyle='--', ax=figCCDF)

# second way: with Poisson
# accept = (np.random.poisson(XYw / (1 + XY ** beta)) > 0)
Esempio n. 13
0
		graph = nx.configuration_model(sequence)
		loops = graph.selfloop_edges()
		graph = nx.Graph(graph)
		graph.remove_edges_from(loops)
		Gcc=sorted(nx.connected_component_subgraphs(graph), key = len, reverse=True)
		G = Gcc[0]

		degrees = nx.degree(G)
		#powerlaw.plot_ccdf(degrees.values(),color='b',marker ='o',linestyle = "none")
		#plt.show()

		N_new.append(len(G))
		print len(G)
		data = G.degree().values()
		powerlaw.plot_ccdf(data,color = color.pop(),marker = 'o',label = str(n))
		fit = powerlaw.Fit(G.degree().values())
		print fit.power_law.alpha, fit.power_law.sigma
		plt.show()
		node_list = G.nodes()
		WalkerNum = 300
		T = []
		walker = 0
		#for walker in range(WalkerNum):
		while len(T)<=50000:
			#if walker<=1000:
			walker += 1
			print walker
			# if walker%100==0:
			# 	print walker
			source,target = random.sample(node_list,2)
Esempio n. 14
0
#np_edges = T.get_n_edge_lists(500)

# Collect the degrees returned for N_meas generated edge lists into k1.
# NOTE(review): k1 must already exist as a list at this point - it is created
# outside this excerpt.
for meas in range(N_meas):
    edges = get_fast_edge_list(N, covariance, t)
    ks = get_degrees_from_edge_list(N, edges).tolist()
    k1.extend(ks)

k1 = np.array(k1, dtype=int)
# Keep only degrees >= 1, matching the xmin=1 used for the fit below.
k1pos = k1[k1 >= 1]

import powerlaw
results = powerlaw.Fit(k1pos, discrete=True, xmin=1)

# Empirical CCDF of the positive degrees with the fitted lognormal CCDF drawn
# on the same axes.
fig = pl.figure()
powerlaw.plot_ccdf(k1pos)
#powerlaw.plot_pdf(k1)
#pl.hist(k1,bins=np.arange(1,max(k1)+1),histtype='step',density=True)
# NOTE(review): x is not used anywhere in the visible part of the script.
x = np.arange(1, max(k1pos))
results.lognormal.plot_ccdf(ax=pl.gca())
#results.lognormal.plot_pdf(ax=pl.gca())

pl.xscale('log')
pl.yscale('log')

fig = pl.figure()
pl.hist(
    k1,
    bins=np.arange(max(k1) + 1),
    histtype='step',
    density=True,
# Histogram of indian_6_fork, saved as a PNG; histplot is defined elsewhere.
histplot(indian_6_fork, binsize, 'Users', 'Fork count', 'green',
         'User Jan-Jun 2019 Forks Received by Count, LogLog Scale Plot')
plt.savefig('logscale_jan-jun_forks_received.png')
plt.close()

# PDFs of the four *_followers_all datasets overlaid in one figure
# (black = russian, red = chinese, blue = american, green = indian).
powerlaw.plot_pdf(russian_followers_all, color='black')
powerlaw.plot_pdf(chinese_followers_all, color='red')
powerlaw.plot_pdf(american_followers_all, color='blue')
powerlaw.plot_pdf(indian_followers_all, color='green')
plt.ylabel('Users')
plt.xlabel('Follow Count')
plt.title('All Follows Received by Count, PDF')
plt.savefig('pdf_all_follows_received.png')
plt.close()

# Same four datasets and colours, drawn as CCDFs.
powerlaw.plot_ccdf(russian_followers_all, color='black')
powerlaw.plot_ccdf(chinese_followers_all, color='red')
powerlaw.plot_ccdf(american_followers_all, color='blue')
powerlaw.plot_ccdf(indian_followers_all, color='green')
plt.ylabel('Users')
plt.xlabel('Follow Count')
plt.title('All Follows Received by Count, CCDF')
plt.savefig('ccdf_all_follows_received.png')
plt.close()

# PDFs of the four *_watchers_all datasets, same colour scheme.
powerlaw.plot_pdf(russian_watchers_all, color='black')
powerlaw.plot_pdf(chinese_watchers_all, color='red')
powerlaw.plot_pdf(american_watchers_all, color='blue')
powerlaw.plot_pdf(indian_watchers_all, color='green')
plt.ylabel('Users')
plt.xlabel('Star Count')
def main():
    """Measure and plot basic statistics of the video network.

    Loads the video views and one pickled network snapshot per day, prints
    summary statistics to stdout and draws a three-panel figure:

    (a) the indegree CCDF on four target days together with a power-law fit
        of the average indegree,
    (b) the daily views against their percentile, with guide lines at the
        median, 90th and 99th percentile of the average daily views,
    (c) the number of uploaded videos per year, split by genre.

    The figure is saved to ``../images/measure_basic_statistics.pdf`` and is
    additionally shown on screen on non-Linux platforms.

    The function reads the module-level names ``T`` (number of daily
    snapshots iterated over) and ``NUM_REL`` (upper bound on the source
    position of a link for it to be counted), both defined outside this
    function.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    target_day_indices = [0, 15, 30, 45]
    color_cycle_4 = ColorPalette.CC4
    date_labels = [
        'Sep 01, 2018', 'Sep 16, 2018', 'Oct 01, 2018', 'Oct 16, 2018'
    ]
    # number of top-ranked videos listed in the summary tables
    top_k = 20

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # one list of per-video views for each target day
    target_day_view_list = [[] for _ in target_day_indices]
    for embed in range(num_videos):
        for target_idx, target_day in enumerate(target_day_indices):
            target_day_view_list[target_idx].append(
                embed_view_dict[embed][target_day])

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }  # daily indegree for each embed
    zero_indegree_list = []  # percentage of zero indegree for each day
    num_edges_list = []  # number of total edges for each day
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        indegree_list = []
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # this on snapshot files from a trusted source.
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for tar_embed in range(num_videos):
                # count only the links whose source position is below NUM_REL
                indegree_value = len(
                    [1 for x in network_dict[tar_embed] if x[1] < NUM_REL])
                embed_indegree_dict[tar_embed][t] = indegree_value
                indegree_list.append(indegree_value)
        indegree_counter = Counter(indegree_list)
        zero_indegree_list.append(indegree_counter[0] / num_videos)
        num_edges_list.append(sum(indegree_list))
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')
    print('\n>>> Average number of edges: {0:.0f}, max: {1:.0f}, min: {2:.0f}'.
          format(
              sum(num_edges_list) / len(num_edges_list), max(num_edges_list),
              min(num_edges_list)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4.5))
    ax1, ax2, ax3 = axes.ravel()

    # == == == == == == Part 4: Plot ax1 indegree CCDF == == == == == == #
    embed_avg_indegree_dict = defaultdict(float)
    for t in range(T):
        for embed in range(num_videos):
            embed_avg_indegree_dict[embed] += embed_indegree_dict[embed][t] / T

    # videos ranked by average indegree / average views, highest first
    indegree_ranked_embed_list = [
        x[0] for x in sorted(embed_avg_indegree_dict.items(),
                             key=lambda kv: kv[1],
                             reverse=True)
    ]
    top_indegree_embeds = indegree_ranked_embed_list[:top_k]
    popular_ranked_embed_list = [
        x[0] for x in sorted(
            embed_avg_view_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    top_popular_embeds = popular_ranked_embed_list[:top_k]

    for target_idx, target_day in enumerate(target_day_indices):
        indegree_list = [
            embed_indegree_dict[embed][target_day]
            for embed in range(num_videos)
        ]

        for degree in (10, 20):
            print(
                'video with {0} indegree has more in-links than {1:.2f}% videos on date {2}'
                .format(degree, percentileofscore(indegree_list, degree),
                        date_labels[target_idx]))

        plot_ccdf(indegree_list,
                  ax=ax1,
                  color=color_cycle_4[target_idx],
                  label=date_labels[target_idx])

    # compute the powerlaw fit
    powerlaw_fit = Fit(list(embed_avg_indegree_dict.values()))
    infer_alpha = powerlaw_fit.power_law.alpha
    p = powerlaw_fit.power_law.ccdf()
    # draw the first 90% of the fitted CCDF, scaled by 0.1, as a dotted line
    cut = int(0.9 * len(p))
    ins_x_axis = powerlaw_fit.power_law.parent_Fit.data[:cut]
    ins_y_axis = 0.1 * p[:cut]

    ax1.plot(ins_x_axis, ins_y_axis, 'k:')
    ax1.text(0.4,
             0.6,
             r'$x^{{{0:.2f}}}$'.format(-infer_alpha + 1),
             size=12,
             ha='right',
             va='bottom',
             transform=ax1.transAxes)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel('indegree', fontsize=11)
    # raw string: "\g" is not a valid escape sequence in a normal string
    ax1.set_ylabel(r'$P(X \geq x)$', fontsize=11)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.set_title('(a) indegree distribution', fontsize=12)

    ax1.legend(frameon=False, fontsize=11, ncol=1, fancybox=False, shadow=True)

    mean_zero_indegree = sum(zero_indegree_list) / len(zero_indegree_list)

    ax1.axhline(y=1 - mean_zero_indegree, color='k', linestyle='--', zorder=30)
    ax1.text(0.96,
             0.9,
             '{0:.0f}% with 0 indegree'.format(mean_zero_indegree * 100),
             size=11,
             transform=ax1.transAxes,
             ha='right',
             va='top')

    # == == == == == == Part 5: Plot ax2 views distribution == == == == == == #
    x_values = np.arange(100)
    for target_idx, views_list in enumerate(target_day_view_list):
        # all 100 percentiles in a single vectorised call
        y_values = np.percentile(views_list, x_values)
        ax2.plot(x_values,
                 y_values,
                 color=color_cycle_4[target_idx],
                 label=date_labels[target_idx])
    ax2.set_yscale('log')
    ax2.set_xlabel('views percentile', fontsize=11)
    ax2.set_ylabel('num of views', fontsize=11)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b) daily views vs. its percentile', fontsize=12)

    avg_views_list = sorted(list(embed_avg_view_dict.values()), reverse=True)
    gini_coef = gini(avg_views_list)
    print('top 1% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.01 * num_videos)]) / sum(avg_views_list) *
        100))
    print('top 10% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.1 * num_videos)]) / sum(avg_views_list) *
        100))
    print('Gini coef: {0:.3f}'.format(gini_coef))

    spearman_degree = [
        embed_avg_indegree_dict[embed] for embed in range(num_videos)
    ]
    spearman_views = [
        embed_avg_view_dict[embed] for embed in range(num_videos)
    ]

    print(
        'Spearman correlation between views and indegree: {0:.4f}, pvalue: {1:.2f}'
        .format(*spearmanr(spearman_views, spearman_degree)))

    median_views = np.median(avg_views_list)
    top_views_90th = np.percentile(avg_views_list, 90)
    top_views_99th = np.percentile(avg_views_list, 99)
    # axis limits are read once, before any guide line is added
    ax2_xmin = ax2.get_xlim()[0]
    ax2_ymin = ax2.get_ylim()[0]

    # (percentile, views value, text x, text y, text template) per guide
    guides = [
        (50, median_views, 0.49, 0.45, 'median views {0:,.0f}'),
        (90, top_views_90th, 0.88, 0.75, '90th views {0:,.0f}'),
        (99, top_views_99th, 0.91, 0.95, '99th views {0:,.0f}'),
    ]
    for pct, views, text_x, text_y, template in guides:
        # vertical line up to the value, horizontal line out to it, then label
        ax2.plot((pct, pct), (ax2_ymin, views),
                 color='k',
                 linestyle='--',
                 zorder=30)
        ax2.plot((ax2_xmin, pct), (views, views),
                 color='k',
                 linestyle='--',
                 zorder=30)
        ax2.text(text_x,
                 text_y,
                 template.format(views),
                 size=11,
                 transform=ax2.transAxes,
                 ha='right',
                 va='bottom')

    # == == == == == == Part 6: Plot ax3 video uploading trend == == == == == == #
    x_axis = range(2009, 2018)
    x_labels = ["'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16", "'17"]

    target_topics = [
        'Pop_music', 'Rock_music', 'Hip_hop_music', 'Independent_music',
        'Country_music', 'Electronic_music', 'Soul_music', 'Others'
    ]
    topic_labels = [
        'Pop', 'Rock', 'Hip hop', 'Independent', 'Country', 'Electronic',
        'Soul', 'Others'
    ]
    others_idx = target_topics.index('Others')
    # rows: upload year, columns: genre
    upload_mat = np.zeros((len(x_axis), len(target_topics)))

    color_cycle_8 = ColorPalette.CC8

    data_loader.load_embed_content_dict()
    embed_title_dict = data_loader.embed_title_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    embed_genre_dict = data_loader.embed_genre_dict

    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if 2009 <= upload_year <= 2017:
            year_idx = upload_year - 2009

            genres = embed_genre_dict[embed]
            if len(genres) == 0:
                # add one to "Others" genre
                upload_mat[year_idx, others_idx] += 1
            else:
                # a video with several genres is split evenly between them
                for genre in genres:
                    upload_mat[year_idx,
                               target_topics.index(genre)] += 1 / len(genres)

    print()
    print([
        '{0}: {1}'.format(topic, int(num))
        for topic, num in zip(target_topics, np.sum(upload_mat, axis=0))
    ])

    stackedBarPlot(ax=ax3,
                   data=upload_mat,
                   cols=color_cycle_8,
                   edgeCols=['#000000'] * len(target_topics),
                   xlabel='uploaded year',
                   ylabel='num of videos',
                   scale=False,
                   endGaps=True)

    ax3.tick_params(axis='both', which='major', labelsize=9)
    ax3.set_xticks(np.arange(len(x_axis)))
    ax3.set_xticklabels(x_labels)
    ax3.yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    legend_handles = [
        plt.Rectangle((0, 0), 1, 1, fc=c, ec='k', alpha=0.6)
        for c in color_cycle_8
    ]
    ax3.legend(legend_handles,
               topic_labels,
               fontsize=9,
               frameon=False,
               handletextpad=0.2,
               columnspacing=0.3,
               ncol=4,
               bbox_to_anchor=(1, -0.12),
               bbox_transform=ax3.transAxes,
               fancybox=False,
               shadow=True)
    ax3.set_title('(c) VEVO videos uploading trend', fontsize=12)

    # == == == == == == Summary tables of the top videos == == == == == == #
    header = '{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.format(
        'Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views', '-rank')
    # each row ends in "\\" so it can be pasted into a LaTeX table
    row_fmt = ('{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & '
               '{6:>5} \\\\')

    def print_top_table(embeds):
        """Print the header and one row per embed with both of its ranks."""
        print(header)
        for embed in embeds:
            # titles look like "<artist> - <song> (...)"; keep the short forms
            title_parts = embed_title_dict[embed].split(' - ', 1)
            song = title_parts[1].split('(')[0].split('ft')[0].strip()
            artist = title_parts[0].split('&')[0].split(',')[0].strip()
            age_days = (datetime(2018, 11, 2) -
                        str2obj(embed_uploadtime_dict[embed])).days
            print(
                row_fmt.format(
                    song, artist, '{0:,}'.format(age_days),
                    '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                    '{0:,}'.format(
                        indegree_ranked_embed_list.index(embed) + 1),
                    '{0:,}'.format(int(embed_avg_view_dict[embed])),
                    '{0:,}'.format(
                        popular_ranked_embed_list.index(embed) + 1)))

    union_top_set = set(top_indegree_embeds).union(top_popular_embeds)
    print('\n>>> Size of the union set at cutoff {0}:'.format(top_k),
          len(union_top_set))
    print_top_table(top_indegree_embeds)

    print()
    print_top_table(top_popular_embeds)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_basic_statistics.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
    print 'Mu = ', fit.lognormal.mu
    print 'Sigma = ', fit.lognormal.sigma

# Keep only the step lengths strictly above the fitted power-law xmin
# (values exactly equal to xmin are dropped by the strict comparison).
step_data_xmin = [i for i in step_data if i > fit.power_law.xmin]

# Figure 1: PDF of the truncated data with the three fitted distributions.
figure()
powerlaw.plot_pdf(step_data_xmin, color = 'b', linewidth = 2) # PDF of data
fit.power_law.plot_pdf(color = 'b', linestyle = '--') # PL theoretical fit
fit.exponential.plot_pdf(color = 'r', linestyle = '--') # EXP theoretical fit
fit.lognormal.plot_pdf(color = 'g', linestyle = '--') # LN theoretical fit
xlabel('Step Length, x [cm]')
ylabel('P(x)')
# Legend entries are listed in the order the curves were drawn above.
plt.legend(('Data', 'Power Law Fit', 'Exponential Fit', 'Lognormal Fit'))

# Figure 2: the same comparison using CCDFs instead of PDFs.
# NOTE(review): the y label below is still 'P(x)' although a CCDF is plotted.
figure()
powerlaw.plot_ccdf(step_data_xmin, color = 'b', linewidth = 2) # CCDF of data
fit.power_law.plot_ccdf(color = 'b', linestyle = '--') # PL theoretical fit
fit.exponential.plot_ccdf(color = 'r', linestyle = '--') # EXP theoretical fit
fit.lognormal.plot_ccdf(color = 'g', linestyle = '--') # LN theoretical fit
xlabel('Step Length, x [cm]')
ylabel('P(x)')
plt.legend(('Data', 'Power Law Fit', 'Exponential Fit', 'Lognormal Fit'))

##print '\nCompare PL with TRUNCATED PL:'
##
##R1, p1 = fit.distribution_compare('power_law', 'truncated_power_law')
##
##if R1 > 0:
##    print 'Power law more likely for data.  R = ', R1, ' and p = ', p1
##else:
##    print 'Truncated PL more likely for data. R = ', R1, 'and p = ', p1
                                    bins=np.linspace(np.log10(np.min(grado)),
                                                     np.log10(np.max(grado)),
                                                     15))

# Fit a power law to the degree sequence `grado` and plot it.
# See https://github.com/jeffalstott/powerlaw for the fitting library.
# `grado`, `bines_log` and `datos_log` are defined earlier in the script
# (outside this fragment).
ajuste = powerlaw.Fit(grado, xmin=1.0)
print(ajuste.power_law.alpha)
print(ajuste.power_law.xmin)
# R and p are the two values returned by the power-law vs. lognormal
# comparison; they are not used further in this fragment.
R, p = ajuste.distribution_compare('power_law', 'lognormal')

# The power-law exponent is available as ajuste.power_law.alpha

# Plotting: everything below is drawn on the single figure created here.
fig = plt.figure(figsize=(15, 10))
#plt.suptitle('Histogramas Datos Newman', fontsize=22)
#plt.subplot(2, 2, 1)
powerlaw.plot_pdf(grado, color='b')
ajuste.power_law.plot_pdf(color='b', linestyle='--')
#plt.subplot(2, 2, 2)
#powerlaw.plot_cdf(grado, color='b')
#plt.subplot(2, 2, 3)
powerlaw.plot_ccdf(grado, color='r')
#plt.subplot(2, 2, 4)
# Differences between consecutive entries of bines_log, computed once and
# reused as the x-values of the scatter below.
# NOTE(review): presumably bines_log holds histogram bin edges, in which case
# these are bin widths rather than bin centres -- confirm this is intended.
A = np.diff(bines_log)
# Normalise the counts by their maximum; adding 0.0 forces float division.
ydata = np.divide(datos_log + 0.0, np.amax(datos_log))
# Function-call form prints the same text on Python 2 and parses on Python 3,
# matching the other print calls in this snippet.
print(ydata)
plt.loglog(A, ydata, 'o')
#plt.plot(np.diff(bines_log)*ajuste.power_law.alpha,datos_log,'-')
plt.show()
#plt.savefig('maps.png', dpi=300)
Esempio n. 19
0
# Finish the validation figure: the legend lists the handles and their alpha
# labels in reversed order, is anchored in figure coordinates, and the figure
# is saved under a name that encodes the trial and data counts.
legend( legend_refs[::-1], theoretical_alphas[::-1], loc = 'center right', bbox_to_anchor = (.1,0,1,1),
            bbox_transform = plt.gcf().transFigure, title=r'$\alpha$ of Data' )
savefig('Fig_powerlaw_validation_%itrials_%idata.pdf'%(int(n_trials),int(n_data)), bbox_inches='tight')

# <markdowncell>

# # Validation of Simulated Data Generators for Other Distributions

# <codecell>

# Parameters handed to the truncated power law below.
# NOTE(review): presumably [alpha, lambda] -- confirm against the powerlaw docs.
param = [2.5, .5]
dist = powerlaw.Truncated_Power_Law
# Discrete variant of the distribution, starting at xmin = 2.
theoretical_dist = dist(xmin=2.0, parameters=param,discrete=True)

# Draw 1000 samples and overlay the empirical CCDF (dashed) with the
# distribution's own CCDF, passing the simulated sample as its data argument.
# NOTE(review): no figure() call precedes this first plot in the cell, unlike
# the three plots that follow -- confirm it is meant to reuse the current figure.
simulated_data = theoretical_dist.generate_random(1000)
powerlaw.plot_ccdf(simulated_data, linewidth=2, linestyle='--')
theoretical_dist.plot_ccdf(simulated_data)
# Same comparison for the PDF, on a new figure.
figure()
powerlaw.plot_pdf(simulated_data, linewidth=2, linestyle='--')
theoretical_dist.plot_pdf(simulated_data)

# Repeat the whole check with the continuous variant of the distribution.
theoretical_dist = dist(xmin=2.0, parameters=param,discrete=False)

figure()
simulated_data = theoretical_dist.generate_random(1000)
powerlaw.plot_ccdf(simulated_data, linewidth=2, linestyle='--')
theoretical_dist.plot_ccdf(simulated_data)
figure()
powerlaw.plot_pdf(simulated_data, linewidth=2, linestyle='--')
theoretical_dist.plot_pdf(simulated_data)
Esempio n. 20
0
# Histogram of stars given, then PDF/CCDF overlays for the four user groups.
# Every overlay uses one fixed colour per group: russian=black, chinese=red,
# american=blue, indian=green.

# NOTE(review): the bin size is derived from indian_all and the title says
# "All Stars", but the histogram is drawn from indian_6 -- confirm which
# data set is intended.
binsize = int(np.max(indian_all) / multiplier)
histplot(
    indian_6,
    binsize,
    'Users',
    'Star Count',
    'green',
    'All Stars Given by Count, LogLog Scale Plot',
)
plt.savefig('logscale_all_stars_given.png')
plt.close()

# January-June 2019 data: PDF overlay.
for samples, colour in (
        (russian_6, 'black'),
        (chinese_6, 'red'),
        (american_6, 'blue'),
        (indian_6, 'green'),
):
    powerlaw.plot_pdf(samples, color=colour)
plt.ylabel('Users')
plt.xlabel('Star Count')
plt.title('Jan-Jun 2019 Stars Given by Count, PDF')
plt.savefig('pdf_jan-jun_stars_given.png')
plt.close()

# January-June 2019 data: CCDF overlay.
for samples, colour in (
        (russian_6, 'black'),
        (chinese_6, 'red'),
        (american_6, 'blue'),
        (indian_6, 'green'),
):
    powerlaw.plot_ccdf(samples, color=colour)
plt.ylabel('Users')
plt.xlabel('Star Count')
plt.title('Jan-Jun 2019 Stars Given by Count, CCDF')
plt.savefig('ccdf_jan-jun_stars_given.png')
plt.close()

# Full data set: PDF overlay (the snippet ends before a title or savefig).
for samples, colour in (
        (russian_all, 'black'),
        (chinese_all, 'red'),
        (american_all, 'blue'),
        (indian_all, 'green'),
):
    powerlaw.plot_pdf(samples, color=colour)
plt.ylabel('Users')
plt.xlabel('Star Count')
def main():
    """Plot entity-frequency CCDFs for complete, sampled and uniform streams.

    For each entity type ('user', 'hashtag') this reads the sample file and
    the complete file from ``../data/cyberbullying_out``, counts how often
    each entity occurs, and builds a third count table by keeping every
    line of the complete file with probability ``rho``. For all three
    count lists it fits a power law, prints alpha and xmin, and draws the
    CCDF on the entity's subplot. The sample list is padded with zeros for
    the number of entities inferred to be missing, and the uniform list with
    zeros for the entities it actually missed. Two-sample KS test results
    for each pair of lists are printed as well.

    The figure is written to ``../images/entity_freq_dist.pdf`` and shown
    interactively unless running on Linux.
    """
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    archive_dir = '../data/{0}_out'.format(app_name)
    entities = ['user', 'hashtag']
    rho = 0.5272

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    for ax_idx, entity in enumerate(entities):
        sample_path = os.path.join(
            archive_dir, '{0}_{1}_all.txt'.format(entity, app_name))
        complete_path = os.path.join(
            archive_dir, 'complete_{0}_{1}.txt'.format(entity, app_name))

        sample_entity_freq_dict = defaultdict(int)
        complete_entity_freq_dict = defaultdict(int)
        uni_random_entity_freq_dict = defaultdict(int)

        # Context managers close both files even if counting raises.
        with open(sample_path, 'r', encoding='utf-8') as sample_datefile, \
                open(complete_path, 'r', encoding='utf-8') as complete_datefile:
            if entity == 'user':
                # The entity is the second comma-separated field of a line.
                for line in sample_datefile:
                    sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1
                for line in complete_datefile:
                    user = line.rstrip().split(',')[1]
                    complete_entity_freq_dict[user] += 1
                    # One random draw per line: keep it with probability rho.
                    toss = np.random.random_sample()
                    if toss <= rho:
                        uni_random_entity_freq_dict[user] += 1
            else:
                # Every field after the first is an entity; case is ignored.
                for line in sample_datefile:
                    for item in line.rstrip().split(',')[1:]:
                        sample_entity_freq_dict[item.lower()] += 1
                for line in complete_datefile:
                    items = [
                        item.lower()
                        for item in line.rstrip().split(',')[1:]
                    ]
                    for item in items:
                        complete_entity_freq_dict[item] += 1
                    # One random draw per line: a kept line contributes all
                    # of its entities to the uniform sample.
                    toss = np.random.random_sample()
                    if toss <= rho:
                        for item in items:
                            uni_random_entity_freq_dict[item] += 1

        # compute the powerlaw fit in the complete set
        complete_freq_list = list(complete_entity_freq_dict.values())
        complete_powerlaw_fit = Fit(complete_freq_list)
        complete_alpha = complete_powerlaw_fit.power_law.alpha
        complete_xmin = complete_powerlaw_fit.power_law.xmin
        print('{0} complete set alpha {1}, xmin {2}'.format(
            entity, complete_alpha, complete_xmin))
        plot_ccdf(complete_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='-',
                  label='complete')

        # compute the powerlaw fit in the sample set
        # infer the number of missing entities
        sample_freq_list = list(sample_entity_freq_dict.values())
        sample_freq_counter = Counter(sample_freq_list)

        # number of entities observed exactly 1, 2, ..., num_interest times
        # (a Counter yields 0 for frequencies that never occur)
        num_interest = 100
        sample_freq_list_top100 = [
            sample_freq_counter[freq] for freq in range(1, num_interest + 1)
        ]

        inferred_num_missing = infer_missing_num(sample_freq_list_top100,
                                                 rho=rho,
                                                 m=num_interest)
        # pad with one zero-frequency entry per entity inferred to be missing
        corrected_sample_freq_list = (
            sample_freq_list + [0] * inferred_num_missing)
        sample_powerlaw_fit = Fit(corrected_sample_freq_list)
        sample_alpha = sample_powerlaw_fit.power_law.alpha
        sample_xmin = sample_powerlaw_fit.power_law.xmin
        print('{0} sample set alpha {1}, xmin {2}'.format(
            entity, sample_alpha, sample_xmin))
        plot_ccdf(corrected_sample_freq_list,
                  ax=axes[ax_idx],
                  color=blue,
                  ls='-',
                  label='sample')

        # compute the powerlaw fit in uniform random sample
        # entities of the complete set that the uniform sample never saw
        uni_random_num_missing = len(complete_entity_freq_dict) - len(
            uni_random_entity_freq_dict)
        uni_random_freq_list = list(uni_random_entity_freq_dict.values())
        uni_random_freq_list = (
            uni_random_freq_list + [0] * uni_random_num_missing)
        uni_random_powerlaw_fit = Fit(uni_random_freq_list)
        uni_random_alpha = uni_random_powerlaw_fit.power_law.alpha
        uni_random_xmin = uni_random_powerlaw_fit.power_law.xmin
        print('{0} uniform random sampling alpha {1}, xmin {2}'.format(
            entity, uni_random_alpha, uni_random_xmin))
        plot_ccdf(uni_random_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='--',
                  label='uniform random')

        print('inferred missing', inferred_num_missing)
        print('empirical missing',
              len(complete_entity_freq_dict) - len(sample_entity_freq_dict))
        print('uniform random missing', uni_random_num_missing)

        print('KS test (sample, uniform)')
        print(stats.ks_2samp(corrected_sample_freq_list, uni_random_freq_list))

        print('KS test (sample, complete)')
        print(stats.ks_2samp(corrected_sample_freq_list, complete_freq_list))

        print('KS test (uniform, complete)')
        print(stats.ks_2samp(uni_random_freq_list, complete_freq_list))

        # symlog on x so the zero-frequency padding can be placed on the axis
        axes[ax_idx].set_xscale('symlog')
        axes[ax_idx].set_yscale('log')
        axes[ax_idx].set_xlabel('frequency', fontsize=16)
        axes[ax_idx].tick_params(axis='both', which='major', labelsize=16)

    axes[0].set_xticks([0, 1, 100, 10000])
    axes[0].set_yticks([1, 0.01, 0.0001, 0.000001])
    # Raw string: '\g' is not a valid escape sequence in a normal literal.
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='lower left')
    axes[0].set_title('(a) user posting', fontsize=18, pad=-3 * 72, y=1.0001)

    axes[1].set_xticks([0, 1, 100, 10000, 1000000])
    axes[1].set_yticks([1, 0.1, 0.001, 0.00001])
    axes[1].set_title('(b) hashtag', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_freq_dist.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
Esempio n. 22
0
# stashfig("degree-synapse-sequences")

# %%
from powerlaw import plot_ccdf, plot_cdf, plot_pdf

# Grid of CCDF plots: one column per graph type, four rows per column
# (axis-0 sums, axis-1 sums, axis-0 nonzero counts, axis-1 nonzero counts).
# The grid has 5 columns, so at most 5 entries of GRAPH_TYPES can be plotted.
fig, axs = plt.subplots(4, 5, figsize=(20, 20), sharex=True)
for i in range(len(GRAPH_TYPES)):
    g_type = GRAPH_TYPES[i]
    # Label is looked up here but not used within the visible part of the loop.
    g_type_label = GRAPH_TYPE_LABELS[i]

    # NOTE(review): assumes load_everything returns a 2-D adjacency array with
    # rows as sources and columns as targets -- confirm against its definition.
    adj = load_everything(g_type, version=BRAIN_VERSION)

    # Sorted totals along each axis of the adjacency data.
    in_sum = np.sort(adj.sum(axis=0))
    out_sum = np.sort(adj.sum(axis=1))

    # Row 0: CCDF of the axis-0 sums; tick labels are blanked.
    ax = plot_ccdf(in_sum, ax=axs[0, i])
    # ax.set(yscale="log")
    ax.set_xticklabels([])

    # Row 1: CCDF of the axis-1 sums.
    ax = plot_ccdf(out_sum, ax=axs[1, i])
    # ax.set(yscale="log")
    ax.set_xticklabels([])

    # Sorted counts of nonzero entries along each axis.
    in_degree = np.sort(np.count_nonzero(adj, axis=0))
    out_degree = np.sort(np.count_nonzero(adj, axis=1))

    # Row 2: CCDF of the axis-0 nonzero counts.
    ax = plot_ccdf(in_degree, ax=axs[2, i])
    # ax.set(yscale="log")
    ax.set_xticklabels([])

    # Row 3: CCDF of the axis-1 nonzero counts.
    ax = plot_ccdf(out_degree, ax=axs[3, i])
import powerlaw
import matplotlib.pyplot as plt 

"""C.Elegan"""
# Load the C. elegans neural network from GML and force a directed graph.
G = nx.read_gml("celegansneural/celegansneural.gml")
G = nx.DiGraph(G)
# Python 2 print statements: report the graph size.
print "C.Elegan"
print "Number of Nodes: ",len(G)
print "Number of Edges: ",len(G.edges())

# Total, incoming and outgoing degree of every node.
# NOTE(review): calling .values() assumes degree()/in_degree()/out_degree()
# return dict-like objects (presumably networkx 1.x) -- confirm the version.
degrees = G.degree().values()
in_degrees = G.in_degree().values()
out_degrees = G.out_degree().values()

# Figure 1: CCDF of the total degree, drawn as unconnected blue markers.
# NOTE(review): the y-axis is labelled P(k) although a CCDF is plotted --
# confirm whether the label or the plot type is the intended one.
plt.figure(1)
powerlaw.plot_ccdf(degrees,marker = 'o',color = 'b',linestyle = "none")
plt.xlabel(r"$k$",fontsize = 16)
plt.ylabel(r"$P(k)$",fontsize = 16)
plt.title("Degree Distribution of C.Elegan")
plt.savefig("C.Elegan - Degree Distribution.png")

# Figure 2: same plot for the in-degree.
plt.figure(2)
powerlaw.plot_ccdf(in_degrees,marker = 'o',color = 'b',linestyle = "none")
plt.xlabel(r"$k$",fontsize = 16)
plt.ylabel(r"$P(k)$",fontsize = 16)
plt.title("In Degree Distribution of C.Elegan")
plt.savefig("C.Elegan - In Degree Distribution.png")

# Figure 3: same plot for the out-degree.
plt.figure(3)
powerlaw.plot_ccdf(out_degrees,marker = 'o',color = 'b',linestyle = "none")
plt.xlabel(r"$k$",fontsize = 16)