Exemple #1
0
def plot_powerlaw_combined(data, data_inst, fig, units):
    """Draw a two-row power-law comparison for one dataset on ``fig``.

    Upper axes: empirical PDF of ``data`` plus two discrete power-law fits,
    one with ``xmin=1`` (dotted) and one with the fitted ``xmin`` (dashed).
    Lower axes: the fitted-``xmin`` power law, the exponential alternative
    and the PDF of the fitted data, drawn with the upper axes' limits.

    The subplot grid comes from the module-level names ``n_graphs`` and
    ``n_data``.

    Args:
        data: Observations handed to ``powerlaw.plot_pdf`` / ``powerlaw.Fit``.
        data_inst: Subplot index of the upper axes; when it equals 1 the
            panel letters and the y-axis label are added as well.
        fig: Figure that receives the two subplots.
        units: Label for the x-axis of the lower axes.
    """
    from powerlaw import Fit, plot_pdf
    from pylab import setp

    label_xy = (-.4, .95)
    is_first = data_inst == 1
    dashed_green = dict(linestyle='--', color='g')

    # --- upper panel: data PDF with both power-law fits ---
    upper = fig.add_subplot(n_graphs, n_data, data_inst)
    plot_pdf(data, ax=upper, color='b', linewidth=2)

    fixed_fit = Fit(data, xmin=1, discrete=True)
    fixed_fit.power_law.plot_pdf(ax=upper, linestyle=':', color='g')
    fixed_fit.power_law.pdf()  # value is not used; call kept as in the original

    best_fit = Fit(data, discrete=True)
    best_fit.power_law.plot_pdf(ax=upper, **dashed_green)

    setp(upper.get_xticklabels(), visible=False)

    if is_first:
        upper.annotate("A", label_xy, xycoords="axes fraction", fontsize=14)
        upper.set_ylabel(r"$p(X)$")

    # --- lower panel: competing distributions above the fitted xmin ---
    lower = fig.add_subplot(n_graphs, n_data, n_data + data_inst)
    best_fit.power_law.plot_pdf(ax=lower, **dashed_green)
    best_fit.exponential.plot_pdf(ax=lower, linestyle='--', color='r')
    best_fit.plot_pdf(ax=lower, color='b', linewidth=2)

    # Reuse the upper panel's limits and keep every second y tick.
    lower.set_ylim(upper.get_ylim())
    every_other_tick = lower.get_yticks()[::2]
    lower.set_yticks(every_other_tick)
    lower.set_xlim(upper.get_xlim())

    if is_first:
        lower.annotate("B", label_xy, xycoords="axes fraction", fontsize=14)

    lower.set_xlabel(units)
Exemple #2
0
def plot_basics(data, data_inst, fig, units):
    '''
    Main plotting function: three stacked power-law diagnostic panels.

    Panel 1 shows the linearly binned PDF as a scatter plus the PDF of the
    positive data.  Panel 2 repeats that PDF with two discrete power-law
    fits: one with the fitted xmin (labelled 'w/o xmin') and one with xmin
    fixed to 3 (labelled 'w xmin').  Panel 3 compares the power-law and
    exponential fits of the xmin=3 fit with the PDF of the fitted data.

    Adapted from the powerlaw package (per the original author's note).

    Side effect: changes global matplotlib rcParams (tick padding and font
    settings).

    Args:
        data: Array supporting boolean-mask indexing (``data[data > 0]``).
        data_inst: Subplot index of the first panel inside the 4x1 grid.
        fig: Figure that receives the subplots.
        units: x-axis label of the third panel.
    '''
    import pylab
    pylab.rcParams['xtick.major.pad'] = '8'
    pylab.rcParams['ytick.major.pad'] = '8'
    pylab.rcParams['font.sans-serif'] = 'Arial'

    from matplotlib import rc
    rc('font', family='sans-serif')
    rc('font', size=10.0)
    rc('text', usetex=False)

    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    n_data = 1
    n_graphs = 4

    # Panel 1: raw linear-bin densities and the PDF of the positive values.
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    # x carries one more entry than y; drop the last one before masking.
    x = x[:-1]
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5, label='data')
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2, label='PDF')
    setp(ax1.get_xticklabels(), visible=False)
    # 'best' is the valid location name; attach the legend to this axes
    # explicitly instead of whatever pyplot considers current.
    ax1.legend(loc='best')

    # Panel 2: PDF with power-law fits using fitted and fixed xmin.
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data[data > 0], ax=ax2, color='b', linewidth=2, label='PDF')
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g', label='w/o xmin')

    ax2.set_xlim(ax1.get_xlim())
    fit = Fit(data, discrete=True, xmin=3)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g', label='w xmin')
    setp(ax2.get_xticklabels(), visible=False)
    ax2.legend(loc='best')

    # Panel 3: candidate distributions of the xmin=3 fit.
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g', label='powerlaw')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r', label='exp')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    ax3.legend(loc='best')
    ax3.set_xlabel(units)
def plot_basics(data, data_inst, fig, units):
    """Draw panels A-C (raw PDF, power-law fits, fit comparison) on ``fig``.

    Panel A: linearly binned PDF scatter, PDF of the positive data and a
    small inset histogram of ``data``.  Panel B: PDF of ``data`` with two
    discrete power-law fits (``xmin=1`` dotted, fitted ``xmin`` dashed).
    Panel C: power-law and exponential fits (fitted ``xmin``) with the PDF
    of the fitted data.

    Relies on the module-level names ``n_graphs``, ``n_data`` and
    ``panel_label_font``.

    Args:
        data: Array supporting boolean-mask indexing (``data[data>0]``).
        data_inst: Subplot index of panel A; when it equals 1 the panel
            letters and the y-axis label are drawn.
        fig: Figure that receives the subplots.
        units: x-axis label of panel C.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp
    # ``mpl_toolkits.axes_grid`` was removed from matplotlib; axes_grid1
    # provides the same ``inset_axes`` helper.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    annotate_coord = (-.4, .95)

    # Panel A
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    # x carries one more entry than y; drop the last one before masking.
    x = x[:-1]
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    setp(ax1.get_xticklabels(), visible=False)

    if data_inst == 1:
        ax1.annotate("A", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)

    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from ``hist``; ``density`` is its replacement.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # Panel B
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')

    ax2.set_xlim(ax1.get_xlim())

    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    setp(ax2.get_xticklabels(), visible=False)

    if data_inst == 1:
        ax2.annotate("B", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)
        ax2.set_ylabel(u"p(X)")

    # Panel C
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())

    if data_inst == 1:
        ax3.annotate("C", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)

    ax3.set_xlabel(units)
Exemple #4
0
    def draw_plots(self):
        """Show the CCDF of the data returned by ``self.on_data()``.

        A ``powerlaw.Fit`` is built with ``xmin`` set to the smallest
        observation and its CCDF is drawn on a new 4x4 inch figure, which
        is then displayed with ``plt.show()``.
        """
        from matplotlib import pyplot as plt

        figure = plt.figure(figsize=(4, 4))
        axes = figure.add_subplot(111)
        observations = self.on_data()

        from powerlaw import Fit
        smallest = min(observations)
        Fit(observations, xmin=smallest).plot_ccdf(ax=axes)

        plt.show()
Exemple #5
0
def _finish_panel(ax, letter, label_y_position, label_size, inset_size):
    """Hide the top/right spines of ``ax`` and add its bold panel letter."""
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.text(-0.025,
            label_y_position,
            letter,
            transform=ax.transAxes,
            fontsize=label_size,
            fontweight='bold',
            va='top',
            ha='right')
    # Empty inset-text placeholder; it is positioned in this axes' own
    # coordinate system (the original reused the previous panel's transform).
    ax.text(0.9,
            0.55,
            '',
            transform=ax.transAxes,
            fontsize=inset_size,
            va='top',
            ha='right')


def main():
    """ Computes various graph statistics

    Loads the reference graph, then computes degree sequences, power-law
    fits of the in/out degree, clustering coefficients, the fraction of
    nodes in the largest weakly connected component and the fraction of
    isolated nodes.  Intermediate arrays are saved under ``../../data/``, a
    LaTeX table is written to ``graph-stats-<date>.tex`` and a three-panel
    histogram figure is saved under ``figures/``.
    """

    #Load graph
    name = '../../data/internal-references-pdftotext.json.gz'
    q = ia.loaddata(fname=name)
    G = ia.makegraph(q)

    #basic stats
    N_nodes, N_edges = G.number_of_nodes(), G.number_of_edges()

    #Degree
    t1 = time.time()
    in_deg = [d for n, d in G.in_degree()]
    out_deg = [d for n, d in G.out_degree()]
    np.savetxt('../../data/in_degree.txt', in_deg)
    np.savetxt('../../data/out_degree.txt', out_deg)
    mean_k = 2 * np.mean(in_deg)
    t2 = time.time()
    print('degree took ' + str((t2 - t1) / 60.0) + ' mins')

    #Find powerlaw fits
    fit_in, fit_out = Fit(in_deg, xmin=0), Fit(out_deg, xmin=0)
    alpha_in = np.round(fit_in.power_law.alpha, 2)
    xmin_in = np.round(fit_in.power_law.xmin, 2)
    alpha_out = np.round(fit_out.power_law.alpha, 2)
    xmin_out = np.round(fit_out.power_law.xmin, 2)
    print('For power law fitting in-degree: x_min = ' + str(xmin_in))
    print('For power law fitting out-degree: x_min = ' + str(xmin_out) + '\n')

    #Clustering coeff
    t1 = time.time()
    cs = list(nx.clustering(G).values())
    np.savetxt('../../data/clustering_c.txt', cs)
    mean_C = np.round(np.mean(cs), 2)
    t2 = time.time()
    print('cluster coeff took ' + str((t2 - t1) / 60.0) + ' mins')

    #Size-biggest
    t1 = time.time()
    # Materialise the components once; they are needed for both the largest
    # component and the isolated-node count.
    comps = list(nx.weakly_connected_components(G))
    biggest = max(comps, key=len)
    G_cc = G.subgraph(biggest)
    size_WCC = 1.0 * G_cc.number_of_nodes()
    fraction_WCC = np.round(size_WCC / N_nodes, 2)

    #Num isolated (components consisting of a single node)
    num_isolated = sum(1 for cc in comps if len(cc) == 1)
    fraction_isolated = np.round((1.0 * num_isolated) / N_nodes, 2)

    t2 = time.time()
    print('cluster size dist ' + str((t2 - t1) / 60.0) + ' mins')

    #results
    stats = [
        N_nodes, N_edges, mean_k, alpha_in, alpha_out, mean_C, fraction_WCC,
        fraction_isolated
    ]
    print(stats)

    #Stuff for tables
    OpenArXiv = ['openArXiv']
    OpenArXiv.extend('{:.3f}'.format(value) for value in stats)

    # Automatically make table!
    datenow = str(datetime.now()).split()[0]
    with open('graph-stats-{}.tex'.format(datenow), 'w') as fout:
        fout.write(make_latex_table([Header, OpenArXiv, WoS, CiteSeer, ArXiv]))

    #### MAKE FIGURE
    tick_size = 20
    axis_size = 30
    label_size = 28
    label_y_position = 1.10
    inset_size = 18
    plt.figure(figsize=(20, 5))

    #Histogram in-degree
    n_bins = 30
    ax1 = plt.subplot(131)
    plt.hist(in_deg, alpha=0.75, bins=n_bins)
    plt.xlabel('$k_{in}$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    _finish_panel(ax1, 'a', label_y_position, label_size, inset_size)

    #Histogram out-degree
    ax2 = plt.subplot(132)
    plt.hist(out_deg, alpha=0.75, bins=n_bins)
    plt.xlabel('$k_{out}$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax2.set_xscale('log')
    ax2.set_yscale('log')
    _finish_panel(ax2, 'b', label_y_position, label_size, inset_size)

    #Histogram clustering coefficients
    ax3 = plt.subplot(133)
    plt.hist(cs, alpha=0.75, bins=n_bins)
    ax3.set_xscale('log')
    ax3.set_yscale('log')
    plt.xlabel('$C$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    _finish_panel(ax3, 'c', label_y_position, label_size, inset_size)

    plt.tight_layout()
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/histograms_onerow-{}.pdf'.format(datenow))
Exemple #6
0
    print "sum :%g, mean :%g" % (np.sum(data), np.mean(data))
    return data


#--------------------------------------------------------------#
# Demo script: draw a synthetic power-law sample, fit it, and plot the
# empirical frequencies, the binned PDF and the fitted power law on one axes.
fig, ax = pl.subplots(1, figsize=(8, 10))

N = 5000
n = -2.6
xmin, xmax = 2.0, 10000.0
seed = 1234785

data = generate_power_law_dist(N, n, xmin, xmax, seed)

# Relative frequency of every distinct value in the sample.  The dict views
# are turned into lists so this also works on Python 3.
counter = collections.Counter(data)
k = list(counter.keys())
pk = np.asarray(list(counter.values()), dtype=float)
pk = pk / pk.sum()

fit = Fit(data)
fit.power_law.plot_pdf(ax=ax, linestyle=':', color='g')
# Single-argument print() behaves the same on Python 2 and Python 3.
print(fit.power_law.alpha)
print(fit.power_law.sigma)

ax.loglog(k, pk, '.')
# Name the target axes explicitly instead of relying on the current axes.
plot_pdf(data, ax=ax, color='r')

pl.show()
def plot_basics(data, data_inst, fig, units):
    """Draw the power-law diagnostic panels, with the first two hidden.

    Three subplots are created on ``fig``.  The first two (raw PDF and the
    power-law fits, letter "B") are created with ``visible=False``; the
    third (letter "C") compares the power-law and exponential fits with the
    PDF of the fitted data and carries its own legend and axis labels.

    Relies on the module-level names ``n_graphs``, ``n_data`` and
    ``panel_label_font``.

    Args:
        data: Array supporting boolean-mask indexing (``data[data > 0]``).
        data_inst: Subplot index of the first panel.
        fig: Figure that receives the subplots.
        units: x-axis label used for the second and third panel.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    letter_xy = (-.1, .95)
    letter_style = dict(xycoords="axes fraction",
                        fontproperties=panel_label_font)

    # ---- first panel (hidden): linear-bin scatter + PDF of positive data
    raw_ax = fig.add_subplot(n_graphs, n_data, data_inst, visible=False)
    edges, density = pdf(data, linear_bins=True)
    occupied = density > 0
    # The last entry of ``edges`` is dropped before applying the mask.
    raw_ax.scatter(edges[:-1][occupied], density[occupied], color='r', s=.5)
    plot_pdf(data[data > 0], ax=raw_ax, color='b', linewidth=2)
    setp(raw_ax.get_xticklabels(), visible=False)

    # ---- second panel (hidden): PDF with fixed-xmin and fitted-xmin fits
    fits_ax = fig.add_subplot(n_graphs,
                              n_data,
                              n_data + data_inst,
                              sharex=raw_ax,
                              visible=False)
    plot_pdf(data, ax=fits_ax, color='b', linewidth=2, label="pdf of data")
    xmin1_fit = Fit(data, xmin=1, discrete=True)
    xmin1_fit.power_law.plot_pdf(ax=fits_ax,
                                 linestyle=':',
                                 color='g',
                                 label="power law fit")
    xmin1_fit.power_law.pdf()  # value is not used; call kept as in the original

    fits_ax.set_xlim(raw_ax.get_xlim())

    opt_fit = Fit(data, discrete=True)
    opt_fit.power_law.plot_pdf(ax=fits_ax,
                               linestyle='--',
                               color='g',
                               label="power law fit--opt xmin")
    setp(fits_ax.get_xticklabels(), visible=True)

    fits_ax.annotate("B", letter_xy, **letter_style)
    fits_ax.set_ylabel(u"p(X)")
    fits_handles, fits_labels = fits_ax.get_legend_handles_labels()
    fits_ax.legend(fits_handles, fits_labels, loc=3)
    fits_ax.set_xlabel(units)

    # ---- third panel: candidate distributions of the fitted-xmin fit
    cmp_ax = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    opt_fit.power_law.plot_pdf(ax=cmp_ax,
                               linestyle='--',
                               color='g',
                               label="power law fit\n(opt-min)")
    opt_fit.exponential.plot_pdf(ax=cmp_ax,
                                 linestyle='--',
                                 color='r',
                                 label="exponential fit\n(opt-min)")
    opt_fit.plot_pdf(ax=cmp_ax, color='b', linewidth=2, label="PDF\n(opt-min)")

    cmp_ax.set_ylim(fits_ax.get_ylim())
    cmp_ax.set_xlim(raw_ax.get_xlim())
    cmp_handles, cmp_labels = cmp_ax.get_legend_handles_labels()
    cmp_ax.legend(cmp_handles, cmp_labels, loc=3, fontsize=12)
    cmp_ax.set_xlabel(units, fontsize=15)

    cmp_ax.annotate("C", letter_xy, **letter_style)
    cmp_ax.set_ylabel(u"p(X)", fontsize=15)
def plot_basics(data, data_inst, fig, units):
    """Draw panels A-C for one dataset on a 4x2 subplot grid of ``fig``.

    Panel A: linear-bin PDF scatter plus the PDF of the positive data.
    Panel B: PDF of ``data`` with a discrete power-law fit at ``xmin=1``.
    Panel C: power-law, exponential and lognormal fits with the PDF of the
    fitted data.

    Side effect: sets the global ``xtick.major.pad`` / ``ytick.major.pad``
    rcParams.

    Args:
        data: Array supporting boolean-mask indexing (``data[data > 0]``).
        data_inst: Subplot index of panel A; when it equals 1 the panel
            letters and the y-axis label are drawn.
        fig: Figure that receives the subplots.
        units: x-axis label of panel C.
    """
    import pylab
    from matplotlib.font_manager import FontProperties
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    # Global tick padding.
    for rc_key in ('xtick.major.pad', 'ytick.major.pad'):
        pylab.rcParams[rc_key] = '8'

    # Bold sans-serif font for the panel letters.
    letter_font = FontProperties().copy()
    letter_font.set_weight("bold")
    letter_font.set_size(30.0)
    letter_font.set_family("sans-serif")

    columns = 2
    rows = 4
    letter_xy = (-.4, .95)
    with_letters = data_inst == 1

    def add_letter(axes, letter):
        axes.annotate(letter,
                      letter_xy,
                      xycoords="axes fraction",
                      fontproperties=letter_font)

    # Panel A
    panel_a = fig.add_subplot(rows, columns, data_inst)
    x, y = pdf(data, linear_bins=True)
    nonempty = y > 0
    y = y[nonempty]
    # The last entry of x is dropped before applying the mask.
    x = x[:-1][nonempty]
    panel_a.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=panel_a, color='b', linewidth=2)
    setp(panel_a.get_xticklabels(), visible=False)

    if with_letters:
        add_letter(panel_a, "A")

    # Panel B
    panel_b = fig.add_subplot(rows, columns, columns + data_inst,
                              sharex=panel_a)

    plot_pdf(data, ax=panel_b, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=panel_b, linestyle='--', color='g')
    fit.power_law.pdf()  # value is not used; call kept as in the original
    panel_b.set_xlim((1, max(x)))

    setp(panel_b.get_xticklabels(), visible=False)

    if with_letters:
        add_letter(panel_b, "B")
        panel_b.set_ylabel(u"p(X)")

    # Panel C
    panel_c = fig.add_subplot(rows, columns, columns * 2 + data_inst)
    fit.power_law.plot_pdf(ax=panel_c, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=panel_c, linestyle='--', color='r')
    fit.lognormal.plot_pdf(ax=panel_c, linestyle=':', color='r')
    fit.plot_pdf(ax=panel_c, color='b', linewidth=2)

    panel_c.set_ylim(panel_b.get_ylim())
    panel_c.set_xlim(panel_a.get_xlim())

    if with_letters:
        add_letter(panel_c, "C")

    panel_c.set_xlabel(units)
Exemple #9
0
 def _get_fit_obj(self, data, xmin=None):
     """Return a ``Fit`` built from the non-zero elements of ``data``.

     Args:
         data: Array-like of observations; it is converted with
             ``np.asarray`` so that non-ndarray inputs are accepted too.
         xmin: Optional value forwarded to ``self.__get_xmin``.

     Returns:
         ``Fit`` of the non-zero values, discrete unless
         ``self.ds.data_nature`` is ``'continuous'``.
     """
     # NOTE: only keep/use non-zero elements.  np.nonzero() returns the
     # *indices* of those elements, so it must be used to index the array.
     data = np.asarray(data)
     data = data[np.nonzero(data)]
     # The original expression evaluated to False in both branches.
     discrete = self.ds.data_nature != 'continuous'
     xmin = self.__get_xmin(xmin=xmin, data=data)
     return Fit(data, discrete=discrete, xmin=xmin)
 def clust_powlaw(self, G):
   """Return the fitted power-law exponent of G's degree sequence.

   Args:
       G: Graph object exposing ``degree()``.

   Returns:
       ``alpha`` of the power law fitted to the sorted node degrees.
   """
   # dict() accepts both the plain dict returned by networkx 1.x and the
   # (node, degree) view returned by networkx >= 2.0.
   degrees = sorted(dict(G.degree()).values())
   fit = Fit(degrees)
   return fit.power_law.alpha
Exemple #11
0
 def _fit_curr_data(self):
     """Fit the non-zero current signed returns and store it in ``curr_fit``.

     Reads ``self.curr_signed_returns``, drops the zero entries, and builds
     a ``Fit`` with the xmin from ``self.__get_xmin()`` and the discreteness
     flag ``self.sa.fit_discretely``.
     """
     returns = self.curr_signed_returns
     # Zero elements are excluded from the fit.
     nonzero_returns = returns[np.nonzero(returns)]
     self.curr_fit = Fit(
         data=nonzero_returns,
         xmin=self.__get_xmin(),
         discrete=self.sa.fit_discretely,
     )
Exemple #12
0
def out_degree(adj):
    """Count the non-zero entries along axis 1 of ``adj`` (one count per row)."""
    nonzero_per_row = np.count_nonzero(adj, axis=1)
    return nonzero_per_row


# One column of the axes grid per node statistic, one row per graph type.
funcs = [in_sum, out_sum, in_degree, out_degree]

for i, g_type in enumerate(GRAPH_TYPES):
    g_type_label = GRAPH_TYPE_LABELS[i]
    adj = load_everything(g_type, version=BRAIN_VERSION)
    for j, stat_func in enumerate(funcs):
        target_ax = axs[i, j]

        # Keep only the strictly positive values of this statistic.
        vals = stat_func(adj)
        vals = vals[vals > 0]

        # Empirical CCDF, then the fitted power-law and lognormal CCDFs.
        plot_ccdf(data=vals, ax=target_ax)
        results = Fit(vals)
        line = results.power_law.plot_ccdf(
            ax=target_ax,
            label="Power law",
            linestyle="--",
            shift_by="original_data",
            c="r",
        )
        results.lognormal.plot_ccdf(
            ax=target_ax,
            label="Lognormal",
            linestyle="--",
            shift_by="original_data",
            c="g",
        )
def _fit_and_plot_ccdf(freq_list, ax, entity, set_name, **plot_kwargs):
    """Fit a power law to ``freq_list``, print alpha/xmin, draw its CCDF."""
    power_law = Fit(freq_list).power_law
    print('{0} {1} alpha {2}, xmin {3}'.format(entity, set_name,
                                               power_law.alpha,
                                               power_law.xmin))
    plot_ccdf(freq_list, ax=ax, **plot_kwargs)


def main():
    """Compare entity frequency distributions of sample vs. complete data.

    For users and hashtags, entity frequencies are counted in the sampled
    file, in the complete file, and in a simulated uniform random sample of
    the complete file (each line kept with probability ``rho``).  For each
    of the three frequency lists a power law is fitted and the CCDF is
    plotted; pairwise two-sample KS tests are printed.  The figure is saved
    to ``../images/entity_freq_dist.pdf`` and shown unless running on Linux.
    """
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    archive_dir = '../data/{0}_out'.format(app_name)
    entities = ['user', 'hashtag']
    rho = 0.5272

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    for ax_idx, entity in enumerate(entities):
        ax = axes[ax_idx]
        sample_path = os.path.join(
            archive_dir, '{0}_{1}_all.txt'.format(entity, app_name))
        complete_path = os.path.join(
            archive_dir, 'complete_{0}_{1}.txt'.format(entity, app_name))

        sample_entity_freq_dict = defaultdict(int)
        complete_entity_freq_dict = defaultdict(int)
        uni_random_entity_freq_dict = defaultdict(int)

        # ``with`` guarantees both files are closed even if parsing fails.
        with open(sample_path, 'r', encoding='utf-8') as sample_datefile, \
                open(complete_path, 'r', encoding='utf-8') as complete_datefile:
            if entity == 'user':
                # The second comma-separated field is the entity.
                for line in sample_datefile:
                    sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1
                for line in complete_datefile:
                    user = line.rstrip().split(',')[1]
                    complete_entity_freq_dict[user] += 1
                    # One random draw per line of the complete file.
                    toss = np.random.random_sample()
                    if toss <= rho:
                        uni_random_entity_freq_dict[user] += 1
            else:
                # Every field after the first is an entity (lower-cased).
                for line in sample_datefile:
                    for item in line.rstrip().split(',')[1:]:
                        sample_entity_freq_dict[item.lower()] += 1
                for line in complete_datefile:
                    items = [
                        item.lower() for item in line.rstrip().split(',')[1:]
                    ]
                    for item in items:
                        complete_entity_freq_dict[item] += 1
                    # One random draw per line of the complete file.
                    toss = np.random.random_sample()
                    if toss <= rho:
                        for item in items:
                            uni_random_entity_freq_dict[item] += 1

        # compute the powerlaw fit in the complete set
        complete_freq_list = list(complete_entity_freq_dict.values())
        _fit_and_plot_ccdf(complete_freq_list,
                           ax,
                           entity,
                           'complete set',
                           color='k',
                           ls='-',
                           label='complete')

        # compute the powerlaw fit in the sample set
        # infer the number of missing entities
        sample_freq_list = list(sample_entity_freq_dict.values())
        sample_freq_counter = Counter(sample_freq_list)

        # number of entities observed exactly 1, 2, ..., num_interest times
        num_interest = 100
        sample_freq_list_top100 = [
            sample_freq_counter[freq] for freq in range(1, num_interest + 1)
        ]

        inferred_num_missing = infer_missing_num(sample_freq_list_top100,
                                                 rho=rho,
                                                 m=num_interest)
        # Missing entities enter the list with frequency 0.
        corrected_sample_freq_list = sample_freq_list + [
            0
        ] * inferred_num_missing
        _fit_and_plot_ccdf(corrected_sample_freq_list,
                           ax,
                           entity,
                           'sample set',
                           color=blue,
                           ls='-',
                           label='sample')

        # compute the powerlaw fit in uniform random sample
        uni_random_num_missing = len(complete_entity_freq_dict) - len(
            uni_random_entity_freq_dict)
        uni_random_freq_list = list(uni_random_entity_freq_dict.values())
        uni_random_freq_list = uni_random_freq_list + [
            0
        ] * uni_random_num_missing
        _fit_and_plot_ccdf(uni_random_freq_list,
                           ax,
                           entity,
                           'uniform random sampling',
                           color='k',
                           ls='--',
                           label='uniform random')

        print('inferred missing', inferred_num_missing)
        print('empirical missing',
              len(complete_entity_freq_dict) - len(sample_entity_freq_dict))
        print('uniform random missing', uni_random_num_missing)

        print('KS test (sample, uniform)')
        print(stats.ks_2samp(corrected_sample_freq_list, uni_random_freq_list))

        print('KS test (sample, complete)')
        print(stats.ks_2samp(corrected_sample_freq_list, complete_freq_list))

        print('KS test (uniform, complete)')
        print(stats.ks_2samp(uni_random_freq_list, complete_freq_list))

        ax.set_xscale('symlog')
        ax.set_yscale('log')
        ax.set_xlabel('frequency', fontsize=16)
        ax.tick_params(axis='both', which='major', labelsize=16)

    axes[0].set_xticks([0, 1, 100, 10000])
    axes[0].set_yticks([1, 0.01, 0.0001, 0.000001])
    # Raw string: "\g" is not a valid escape sequence in a normal literal.
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='lower left')
    axes[0].set_title('(a) user posting', fontsize=18, pad=-3 * 72, y=1.0001)

    axes[1].set_xticks([0, 1, 100, 10000, 1000000])
    axes[1].set_yticks([1, 0.1, 0.001, 0.00001])
    axes[1].set_title('(b) hashtag', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_freq_dist.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    target_day_indices = [0, 15, 30, 45]
    color_cycle_4 = ColorPalette.CC4
    date_labels = [
        'Sep 01, 2018', 'Sep 16, 2018', 'Oct 01, 2018', 'Oct 16, 2018'
    ]

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    target_day_view_list = [[], [], [], []]
    for embed in range(num_videos):
        for target_idx, target_day in enumerate(target_day_indices):
            target_day_view_list[target_idx].append(
                embed_view_dict[embed][target_day])

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }  # daily indegree for each embed
    zero_indegree_list = []  # percentage of zero indegree for each day
    num_edges_list = []  # number of total edges for each day
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        indegree_list = []
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for tar_embed in range(num_videos):
                indegree_value = len(
                    [1 for x in network_dict[tar_embed] if x[1] < NUM_REL])
                embed_indegree_dict[tar_embed][t] = indegree_value
                indegree_list.append(indegree_value)
        indegree_counter = Counter(indegree_list)
        zero_indegree_list.append(indegree_counter[0] / num_videos)
        num_edges_list.append(sum(indegree_list))
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')
    print('\n>>> Average number of edges: {0:.0f}, max: {1:.0f}, min: {2:.0f}'.
          format(
              sum(num_edges_list) / len(num_edges_list), max(num_edges_list),
              min(num_edges_list)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4.5))
    ax1, ax2, ax3 = axes.ravel()

    # == == == == == == Part 4: Plot ax1 indegree CCDF == == == == == == #
    embed_avg_indegree_dict = defaultdict(float)
    for t in range(T):
        for embed in range(num_videos):
            embed_avg_indegree_dict[embed] += embed_indegree_dict[embed][t] / T

    indegree_ranked_embed_list = [
        x[0] for x in sorted(embed_avg_indegree_dict.items(),
                             key=lambda kv: kv[1],
                             reverse=True)
    ]
    top_20_indegree_embeds = indegree_ranked_embed_list[:20]
    popular_ranked_embed_list = [
        x[0] for x in sorted(
            embed_avg_view_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    top_20_popular_embeds = popular_ranked_embed_list[:20]

    for target_idx, target_day in enumerate(target_day_indices):
        indegree_list = []
        for embed in range(num_videos):
            indegree_list.append(embed_indegree_dict[embed][target_day])

        print(
            'video with 10 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 10),
                    date_labels[target_idx]))
        print(
            'video with 20 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 20),
                    date_labels[target_idx]))

        plot_ccdf(indegree_list,
                  ax=ax1,
                  color=color_cycle_4[target_idx],
                  label=date_labels[target_idx])

    # compute the powerlaw fit
    powerlaw_fit = Fit(list(embed_avg_indegree_dict.values()))
    infer_alpha = powerlaw_fit.power_law.alpha
    p = powerlaw_fit.power_law.ccdf()
    ins_x_axis = powerlaw_fit.power_law.__dict__['parent_Fit'].__dict__[
        'data'][:int(0.9 * len(p))]
    ins_y_axis = 0.1 * p[:int(0.9 * len(p))]

    ax1.plot(ins_x_axis, ins_y_axis, 'k:')
    ax1.text(0.4,
             0.6,
             r'$x^{{{0:.2f}}}$'.format(-infer_alpha + 1),
             size=12,
             ha='right',
             va='bottom',
             transform=ax1.transAxes)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel('indegree', fontsize=11)
    ax1.set_ylabel('$P(X) \geq x$', fontsize=11)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.set_title('(a) indegree distribution', fontsize=12)

    ax1.legend(frameon=False, fontsize=11, ncol=1, fancybox=False, shadow=True)

    mean_zero_indegree = sum(zero_indegree_list) / len(zero_indegree_list)

    ax1.axhline(y=1 - mean_zero_indegree, color='k', linestyle='--', zorder=30)
    ax1.text(0.96,
             0.9,
             '{0:.0f}% with 0 indegree'.format(mean_zero_indegree * 100),
             size=11,
             transform=ax1.transAxes,
             ha='right',
             va='top')

    # == == == == == == Part 5: Plot ax2 views distribution == == == == == == #
    for target_idx, views_list in enumerate(target_day_view_list):
        x_values = range(100)
        y_values = [np.percentile(views_list, x) for x in x_values]
        ax2.plot(x_values,
                 y_values,
                 color=color_cycle_4[target_idx],
                 label=date_labels[target_idx])
    ax2.set_yscale('log')
    ax2.set_xlabel('views percentile', fontsize=11)
    ax2.set_ylabel('num of views', fontsize=11)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b) daily views vs. its percentile', fontsize=12)

    avg_views_list = sorted(list(embed_avg_view_dict.values()), reverse=True)
    gini_coef = gini(avg_views_list)
    print('top 1% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.01 * num_videos)]) / sum(avg_views_list) *
        100))
    print('top 10% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.1 * num_videos)]) / sum(avg_views_list) *
        100))
    print('Gini coef: {0:.3f}'.format(gini_coef))

    spearman_degree = [
        embed_avg_indegree_dict[embed] for embed in range(num_videos)
    ]
    spearman_views = [
        embed_avg_view_dict[embed] for embed in range(num_videos)
    ]

    print(
        'Spearman correlation between views and indegree: {0:.4f}, pvalue: {1:.2f}'
        .format(*spearmanr(spearman_views, spearman_degree)))

    median_views = np.median(avg_views_list)
    top_views_90th = np.percentile(avg_views_list, 90)
    top_views_99th = np.percentile(avg_views_list, 99)
    ax2_xmin = ax2.get_xlim()[0]
    ax2_ymin = ax2.get_ylim()[0]

    ax2.plot((50, 50), (ax2_ymin, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 50), (median_views, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.49,
             0.45,
             'median views {0:,.0f}'.format(median_views),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    ax2.plot((90, 90), (ax2_ymin, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 90), (top_views_90th, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.88,
             0.75,
             '90th views {0:,.0f}'.format(top_views_90th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    ax2.plot((99, 99), (ax2_ymin, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 99), (top_views_99th, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.91,
             0.95,
             '99th views {0:,.0f}'.format(top_views_99th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    # == == == == == == Part 7: Plot ax3 video uploading trend == == == == == == #
    x_axis = range(2009, 2018)
    x_labels = ["'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16", "'17"]
    upload_mat = np.zeros((len(x_axis), 8))

    target_topics = [
        'Pop_music', 'Rock_music', 'Hip_hop_music', 'Independent_music',
        'Country_music', 'Electronic_music', 'Soul_music', 'Others'
    ]
    topic_labels = [
        'Pop', 'Rock', 'Hip hop', 'Independent', 'Country', 'Electronic',
        'Soul', 'Others'
    ]

    color_cycle_8 = ColorPalette.CC8

    data_loader.load_embed_content_dict()
    embed_title_dict = data_loader.embed_title_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    embed_genre_dict = data_loader.embed_genre_dict

    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if 2009 <= upload_year <= 2017:
            year_idx = upload_year - 2009

            genres = embed_genre_dict[embed]
            if len(genres) == 0:
                # add one to "Others" genre
                upload_mat[year_idx, 7] += 1
            else:
                for genre in genres:
                    upload_mat[year_idx,
                               target_topics.index(genre)] += 1 / len(genres)

    print()
    print([
        '{0}: {1}'.format(topic, int(num))
        for topic, num in zip(target_topics, np.sum(upload_mat, axis=0))
    ])

    stackedBarPlot(ax=ax3,
                   data=upload_mat,
                   cols=color_cycle_8,
                   edgeCols=['#000000'] * 8,
                   xlabel='uploaded year',
                   ylabel='num of videos',
                   scale=False,
                   endGaps=True)

    ax3.tick_params(axis='both', which='major', labelsize=9)
    ax3.set_xticks(np.arange(len(x_axis)))
    ax3.set_xticklabels(x_labels)
    ax3.yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    ax3.legend([
        plt.Rectangle((0, 0), 1, 1, fc=c, ec='k', alpha=0.6)
        for c in color_cycle_8
    ],
               topic_labels,
               fontsize=9,
               frameon=False,
               handletextpad=0.2,
               columnspacing=0.3,
               ncol=4,
               bbox_to_anchor=(1, -0.12),
               bbox_transform=ax3.transAxes,
               fancybox=False,
               shadow=True)
    ax3.set_title('(c) VEVO videos uploading trend', fontsize=12)

    union_top_set = set(top_20_indegree_embeds).union(top_20_popular_embeds)
    print('\n>>> Size of the union set at cutoff 15:', len(union_top_set))
    print('{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_indegree_embeds:
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ',
                    1)[0].split('&')[0].split(',')[0].strip(), '{0:,}'.format(
                        (datetime(2018, 11, 2) -
                         str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(top_20_indegree_embeds.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(popular_ranked_embed_list.index(embed) + 1)))

    print('\n{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_popular_embeds:
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ',
                    1)[0].split('&')[0].split(',')[0].strip(), '{0:,}'.format(
                        (datetime(2018, 11, 2) -
                         str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(indegree_ranked_embed_list.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(top_20_popular_embeds.index(embed) + 1)))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_basic_statistics.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
Exemple #15
0
def plplot(data, title, save=False, save_path=None):
    """Draw a three-panel power-law diagnostic figure for ``data``.

    Panel A shows the linearly binned PDF as a scatter, the PDF curve of the
    positive values, and a small inset histogram of the raw data.
    Panel B shows the PDF of the data with two power-law fits: one with
    ``xmin`` fixed to 1 (dotted) and one where ``xmin`` is left for ``Fit``
    to choose (dashed). Both parameter sets are annotated on the panel.
    Panel C shows the PDF of the positive values together with the power-law
    and exponential fits taken from the second ``Fit`` object.

    Args:
        data: Sequence of observations; converted with ``np.array``.
        title: Figure super-title; also printed to stdout.
        save: When true the figure is written to ``save_path``; otherwise it
            is shown interactively.
        save_path: Destination passed to ``plt.savefig``. Required when
            ``save`` is true.

    Returns:
        ``(params1, params2)``, each a tuple ``(alpha, xmin, sigma)`` read from
        the ``power_law`` attribute of the fixed-``xmin`` fit and of the
        free-``xmin`` fit respectively.

    Raises:
        ValueError: If ``save`` is true but ``save_path`` is ``None``.
    """
    # Fail before doing any fitting instead of deep inside ``savefig``.
    if save and save_path is None:
        raise ValueError('save_path must be provided when save=True')

    data = np.array(data)

    fig = plt.figure(figsize=(18, 6))
    fig.suptitle(title)

    # === A ===
    ax1 = fig.add_subplot(1, 3, 1)

    # Linearly binned PDF: keep only non-empty bins. ``x`` has one more
    # entry than ``y`` (presumably bin edges), so the last entry is dropped
    # before applying the mask.
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)

    # Probability density curve of the strictly positive values.
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)

    ax1.set_xlabel('A')

    # Small inset histogram in the lower-left corner (loc=3).
    # ``mpl_toolkits.axes_grid`` was removed from matplotlib; ``axes_grid1``
    # provides the same ``inset_axes``.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes
    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from ``hist``; ``density`` is its replacement.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # === A ===

    # === B ===

    annotation = ''
    ax2 = fig.add_subplot(1, 3, 2, sharey=ax1)

    # Probability density curve of the full data set.
    print(title)
    print(pdf(data))
    print()
    plot_pdf(data, ax=ax2, color='b', linewidth=2)

    # Fit a power law with xmin pinned to 1 and draw it (dotted line).
    fit = Fit(data, xmin=1, discrete=True,
              parameter_range={'alpha': [None, None]})
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    params1 = (fit.power_law.alpha, fit.power_law.xmin, fit.power_law.sigma)

    # alpha is the fitted exponent.
    # xmin is the smallest x value used by the fit (kept non-zero); it is
    # fixed to 1 here.
    # sigma is the standard deviation reported for alpha.
    annotation += "':' - alpha={:.2f}, xmin= {}, sigma={:.2f}".format(*params1)

    # Unlike the first fit, xmin is not specified here, so ``Fit`` picks its
    # own optimal value (dashed line).
    fit = Fit(data, discrete=True, parameter_range={'alpha': [-5, 10]})
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    params2 = (fit.power_law.alpha, fit.power_law.xmin, fit.power_law.sigma)
    annotation += "\n'--' - alpha={:.2f}, xmin= {}, sigma={:.2f}".format(
        *params2)

    ax2.set_xlabel('B')
    ax2.set_ylabel(u"p(X)")
    ax2.set_xlim(ax1.get_xlim())
    annotate_coord = (0.05, 0.88)
    ax2.annotate(annotation, annotate_coord, xycoords="axes fraction")

    # === B ===

    # === C ===

    # Compare the power-law and exponential candidates from the second fit.
    ax3 = fig.add_subplot(1, 3, 3, sharey=ax1)
    plot_pdf(data[data > 0], ax=ax3, color='b', linewidth=2)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')

    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())

    ax3.set_xlabel('C')

    # === C ===

    if save:
        plt.savefig(save_path)
    else:
        plt.show()

    return params1, params2
Exemple #16
0
def plot_basics(data, data_inst, fig, units):
    """Add three stacked power-law diagnostic panels for ``data`` to ``fig``.

    The panels are placed in column ``data_inst`` of an
    ``n_graphs`` x ``n_data`` subplot grid; ``n_graphs`` and ``n_data`` are
    read from the enclosing (module) scope.

    * Panel A: linearly binned PDF (line and scatter), the PDF curve of the
      positive values, and an inset histogram of the raw data.
    * Panel B: PDF of the data with a power-law fit using ``xmin=1`` (dotted)
      and a power-law fit where ``xmin`` is left to ``Fit`` (dashed).
    * Panel C: the PDF plotted by the second fit together with its power-law
      and exponential candidates.

    Args:
        data: Observations. Indexed with a boolean mask (``data[data > 0]``),
            so presumably a NumPy array -- confirm against callers.
        data_inst: 1-based column index of this data set in the subplot grid;
            the "A"/"B"/"C" panel letters and the y-label are only drawn for
            column 1.
        fig: Matplotlib figure that receives the subplots.
        units: Label for the x-axis of the bottom panel.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp
    # ``mpl_toolkits.axes_grid`` was removed from matplotlib; ``axes_grid1``
    # provides the same ``inset_axes``.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    annotate_coord = (-.4, .95)

    # --- Panel A ---
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    plot_pdf(data[data > 0], ax=ax1, linear_bins=True, color='r',
             linewidth=.5)
    # Keep only non-empty bins. ``x`` has one more entry than ``y``
    # (presumably bin edges), so the last entry is dropped before masking.
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    setp(ax1.get_xticklabels(), visible=False)
    # Thin out the y ticks to every second one.
    ax1.set_yticks(ax1.get_yticks()[::2])
    if data_inst == 1:
        ax1.annotate("A", annotate_coord, xycoords="axes fraction",
                     fontsize=14)

    # Inset histogram in the lower-left corner (loc=3).
    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from ``hist``; ``density`` is its replacement.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # --- Panel B ---
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    # First fit: xmin pinned to 1 (dotted line).
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    ax2.set_xlim(ax1.get_xlim())

    # Second fit: xmin not specified, left to ``Fit`` (dashed line).
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    setp(ax2.get_xticklabels(), visible=False)
    # Cap the upper y limit at 1.
    if ax2.get_ylim()[1] > 1:
        ax2.set_ylim(ax2.get_ylim()[0], 1)

    ax2.set_yticks(ax2.get_yticks()[::2])
    if data_inst == 1:
        ax2.annotate("B", annotate_coord, xycoords="axes fraction",
                     fontsize=14)
        ax2.set_ylabel(r"$p(X)$")

    # --- Panel C ---
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    # Reuse panel B's y range and panel A's x range so panels line up.
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_yticks(ax3.get_yticks()[::2])
    ax3.set_xlim(ax1.get_xlim())

    if data_inst == 1:
        ax3.annotate("C", annotate_coord, xycoords="axes fraction",
                     fontsize=14)

    ax3.set_xlabel(units)