def plot_basics(data, data_inst, fig, units):
    """Draw a three-panel power-law diagnostic column for ``data`` on ``fig``.

    This function is the main plotting function. Adapted from Newman's
    powerlaw package.

    The panels are stacked in a 4-row, 1-column subplot grid (the last row is
    left unused):

    1. linearly binned PDF of ``data`` (red dots) with the default-binned
       PDF of the positive values (blue line);
    2. the same blue PDF with two discrete power-law fits: ``xmin`` chosen by
       ``powerlaw`` (dotted) and ``xmin=3`` (dashed);
    3. the PDF of the ``xmin=3`` fit with its power-law and exponential fits.

    Args:
        data: numeric array supporting boolean-mask indexing
            (``data[data > 0]``), e.g. a NumPy array.
        data_inst: subplot index of the first panel inside the grid.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label of the bottom panel.

    Side effects:
        Changes global matplotlib rcParams (tick padding and font settings).
    """
    import pylab
    from matplotlib import rc
    from powerlaw import plot_pdf, Fit, pdf

    pylab.rcParams['xtick.major.pad'] = '8'
    pylab.rcParams['ytick.major.pad'] = '8'
    pylab.rcParams['font.sans-serif'] = 'Arial'
    rc('font', family='sans-serif')
    rc('font', size=10.0)
    rc('text', usetex=False)

    n_data = 1
    n_graphs = 4
    positive = data[data > 0]

    # --- Panel 1: linearly binned PDF vs. default-binned PDF ---------------
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]  # drop the trailing entry so x lines up with y before masking
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5, label='data')
    plot_pdf(positive, ax=ax1, color='b', linewidth=2, label='PDF')
    pylab.setp(ax1.get_xticklabels(), visible=False)
    # Attach the legend to this axes explicitly: 'bestloc' is not a valid
    # location and pyplot's "current axes" need not belong to ``fig``.
    ax1.legend(loc='best')

    # --- Panel 2: power-law fits with automatic and fixed xmin -------------
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(positive, ax=ax2, color='b', linewidth=2, label='PDF')
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g', label='w/o xmin')
    ax2.set_xlim(ax1.get_xlim())
    fit = Fit(data, discrete=True, xmin=3)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g', label='w xmin')
    pylab.setp(ax2.get_xticklabels(), visible=False)
    ax2.legend(loc='best')

    # --- Panel 3: power law vs. exponential for the xmin=3 fit -------------
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g', label='powerlaw')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r', label='exp')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    ax3.legend(loc='best')
    ax3.set_xlabel(units)
def draw_plots(self):
    """Plot the CCDF of ``self.on_data()`` in a new 4x4-inch figure and show it.

    The data is wrapped in a ``powerlaw.Fit`` whose ``xmin`` is set to the
    smallest observation, and that fit's ``plot_ccdf`` draws onto the single
    subplot before ``plt.show()`` is called.
    """
    from matplotlib import pyplot as plt
    from powerlaw import Fit

    figure = plt.figure(figsize=(4, 4))
    axes = figure.add_subplot(111)

    observations = self.on_data()
    lower_bound = min(observations)
    Fit(observations, xmin=lower_bound).plot_ccdf(ax=axes)

    plt.show()
def plot_powerlaw_combined(data, data_inst, fig, units):
    """Draw a two-panel power-law comparison for ``data`` on ``fig``.

    Panel A shows the PDF of ``data`` with two discrete power-law fits
    (``xmin=1`` dotted, ``xmin`` chosen by ``powerlaw`` dashed). Panel B shows
    the second fit's PDF with its power-law and exponential fits.

    The grid shape comes from the module-level ``n_graphs`` and ``n_data``.
    Panel letters and the y-label are only drawn when ``data_inst == 1``.

    Args:
        data: observations passed to ``powerlaw``.
        data_inst: subplot index of the upper panel inside the grid.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label of the lower panel.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    label_xy = (-.4, .95)
    first_instance = data_inst == 1

    def tag(axes, letter):
        # Panel letter in axes-fraction coordinates, left of the plot area.
        axes.annotate(letter, label_xy, xycoords="axes fraction", fontsize=14)

    # ----- Panel A: data PDF with fixed-xmin and optimal-xmin fits -----
    upper = fig.add_subplot(n_graphs, n_data, data_inst)
    plot_pdf(data, ax=upper, color='b', linewidth=2)

    fixed_fit = Fit(data, xmin=1, discrete=True)
    fixed_fit.power_law.plot_pdf(ax=upper, linestyle=':', color='g')
    _ = fixed_fit.power_law.pdf()  # result is not used; call kept as in the original

    best_fit = Fit(data, discrete=True)
    best_fit.power_law.plot_pdf(ax=upper, linestyle='--', color='g')

    setp(upper.get_xticklabels(), visible=False)
    if first_instance:
        tag(upper, "A")
        upper.set_ylabel(r"$p(X)$")

    # ----- Panel B: power law vs. exponential for the optimal fit -----
    lower = fig.add_subplot(n_graphs, n_data, n_data + data_inst)
    best_fit.power_law.plot_pdf(ax=lower, linestyle='--', color='g')
    best_fit.exponential.plot_pdf(ax=lower, linestyle='--', color='r')
    best_fit.plot_pdf(ax=lower, color='b', linewidth=2)

    lower.set_ylim(upper.get_ylim())
    every_other_tick = lower.get_yticks()[::2]
    lower.set_yticks(every_other_tick)
    lower.set_xlim(upper.get_xlim())
    if first_instance:
        tag(lower, "B")
    lower.set_xlabel(units)
def plot_basics(data, data_inst, fig, units):
    """Draw a three-panel power-law diagnostic column for ``data`` on ``fig``.

    This function is the main plotting function. Adapted from Newman's
    powerlaw package.

    The panels are stacked in a 4-row, 1-column subplot grid (the last row is
    left unused):

    1. linearly binned PDF of ``data`` (red dots) with the default-binned
       PDF of the positive values (blue line);
    2. the same blue PDF with two discrete power-law fits: ``xmin`` chosen by
       ``powerlaw`` (dotted) and ``xmin=3`` (dashed);
    3. the PDF of the ``xmin=3`` fit with its power-law and exponential fits.

    Args:
        data: numeric array supporting boolean-mask indexing
            (``data[data > 0]``), e.g. a NumPy array.
        data_inst: subplot index of the first panel inside the grid.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label of the bottom panel.

    Side effects:
        Changes global matplotlib rcParams (tick padding and font settings).
    """
    import pylab
    from matplotlib import rc
    from powerlaw import plot_pdf, Fit, pdf

    pylab.rcParams['xtick.major.pad'] = '8'
    pylab.rcParams['ytick.major.pad'] = '8'
    pylab.rcParams['font.sans-serif'] = 'Arial'
    rc('font', family='sans-serif')
    rc('font', size=10.0)
    rc('text', usetex=False)

    n_data = 1
    n_graphs = 4
    positive = data[data > 0]

    # --- Panel 1: linearly binned PDF vs. default-binned PDF ---------------
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]  # drop the trailing entry so x lines up with y before masking
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5, label='data')
    plot_pdf(positive, ax=ax1, color='b', linewidth=2, label='PDF')
    pylab.setp(ax1.get_xticklabels(), visible=False)
    # Attach the legend to this axes explicitly: 'bestloc' is not a valid
    # location and pyplot's "current axes" need not belong to ``fig``.
    ax1.legend(loc='best')

    # --- Panel 2: power-law fits with automatic and fixed xmin -------------
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(positive, ax=ax2, color='b', linewidth=2, label='PDF')
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g', label='w/o xmin')
    ax2.set_xlim(ax1.get_xlim())
    fit = Fit(data, discrete=True, xmin=3)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g', label='w xmin')
    pylab.setp(ax2.get_xticklabels(), visible=False)
    ax2.legend(loc='best')

    # --- Panel 3: power law vs. exponential for the xmin=3 fit -------------
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g', label='powerlaw')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r', label='exp')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    ax3.legend(loc='best')
    ax3.set_xlabel(units)
def distribution_compare_dict(fit: powerlaw.Fit) -> Dict[str, float]:
    """
    Collect pairwise distribution comparisons of a fit into a flat dict.

    For every compared pair the result holds two entries,
    ``"<first> vs. <second> R"`` and ``"<first> vs. <second> p"``, filled with
    the two values returned by ``fit.distribution_compare`` (called with
    ``normalized_ratio=True``).
    """
    pairings = (
        (Dist.POWERLAW, Dist.LOGNORMAL),
        (Dist.POWERLAW, Dist.EXPONENTIAL),
        (Dist.LOGNORMAL, Dist.EXPONENTIAL),
        (Dist.POWERLAW, Dist.TRUNCATED_POWERLAW),
    )

    comparisons = {}
    for left_enum, right_enum in pairings:
        left = left_enum.value
        right = right_enum.value
        ratio, p_value = fit.distribution_compare(
            left, right, normalized_ratio=True
        )
        comparisons.update(
            {
                f"{left} vs. {right} R": ratio,
                f"{left} vs. {right} p": p_value,
            }
        )
    return comparisons
def plot_basics(data, data_inst, fig, units):
    """Draw panels A-C of the power-law diagnostic figure for one data set.

    * A: linearly binned PDF (red dots), default-binned PDF of the positive
      values (blue line) and a small histogram inset in the lower-left corner.
    * B: PDF of ``data`` with discrete power-law fits for ``xmin=1`` (dotted)
      and for the ``xmin`` chosen by ``powerlaw`` (dashed).
    * C: PDF of the second fit with its power-law and exponential fits.

    The grid shape and the panel-letter font come from the module-level names
    ``n_graphs``, ``n_data`` and ``panel_label_font``. Panel letters and the
    y-label are only drawn when ``data_inst == 1``.

    Args:
        data: numeric array supporting boolean-mask indexing
            (``data[data > 0]``), e.g. a NumPy array.
        data_inst: subplot index of panel A inside the grid.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label of panel C.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp
    # ``mpl_toolkits.axes_grid`` is no longer shipped with matplotlib;
    # ``axes_grid1`` provides the same ``inset_axes`` helper.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    annotate_coord = (-.4, .95)
    first_instance = data_inst == 1

    # --- Panel A -----------------------------------------------------------
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]  # drop the trailing entry so x lines up with y before masking
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    setp(ax1.get_xticklabels(), visible=False)
    if first_instance:
        ax1.annotate("A", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)

    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from ``hist``; ``density`` is its replacement.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # --- Panel B -----------------------------------------------------------
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    ax2.set_xlim(ax1.get_xlim())
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    setp(ax2.get_xticklabels(), visible=False)
    if first_instance:
        ax2.annotate("B", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)
        ax2.set_ylabel(u"p(X)")

    # --- Panel C -----------------------------------------------------------
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    if first_instance:
        ax3.annotate("C", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)
    ax3.set_xlabel(units)
def normalize_fit_to_area(
    fit: powerlaw.Fit, length_distribution: LengthDistribution
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Divide the ccdf of the fit's original data by the distribution's area.

    Only the entries whose length is strictly greater than ``fit.xmin`` are
    kept. Returns the kept lengths and the matching ccdf values divided by
    ``length_distribution.area_value``.
    """
    # ccdf over the original (unfitted) data rather than the data kept by the fit
    all_lengths, all_ccm = fit.ccdf(original_data=True)

    # Boolean mask of the lengths above the cut-off
    above_xmin = all_lengths > fit.xmin
    assert isinstance(above_xmin, np.ndarray)
    assert sum(above_xmin) > 0

    kept_lengths = all_lengths[above_xmin]
    kept_ccm = all_ccm[above_xmin]

    area = length_distribution.area_value
    assert area > 0

    logging.info(
        "Normalizing ccm with area_value.",
        extra={
            "area_value": area,
            "ccm_array_description": pd.Series(kept_ccm).describe().to_dict(),
        },
    )
    normed_ccm = kept_ccm / area

    logging.info(
        "Normalized fit ccm.",
        extra={
            "sum_are_over_cut_off": sum(above_xmin),
            "fit_xmin": fit.xmin,
            "amount_filtered": len(all_lengths) - len(kept_lengths),
            "length_distribution_area_value": area,
        },
    )
    return kept_lengths, normed_ccm
def clust_powlaw(self, G):
    """Return the fitted power-law exponent of the degree distribution of ``G``.

    The sorted node degrees are passed to ``Fit`` and the ``alpha`` of its
    power-law model is returned. No goodness-of-fit check is performed, so the
    value says nothing about whether a power law is a good description.

    Args:
        G: graph whose ``degree()`` returns either a node->degree mapping or
            an iterable of ``(node, degree)`` pairs (presumably a networkx
            graph -- confirm against the callers).

    Returns:
        The ``alpha`` attribute of the fitted power law.
    """
    # dict() accepts both a plain mapping and a view of (node, degree) pairs,
    # whereas calling ``.values()`` directly only works on the mapping form.
    degrees = sorted(dict(G.degree()).values())
    return Fit(degrees).power_law.alpha
def plplot(data, title, save=False, save_path=None):
    """Plot a three-panel power-law diagnostic figure and return fit parameters.

    * A: linearly binned PDF (red dots), default-binned PDF of the positive
      values (blue line) and a small histogram inset.
    * B: PDF of the data with two discrete power-law fits, ``xmin=1`` (dotted)
      and automatically selected ``xmin`` (dashed), annotated with their
      parameters.
    * C: PDF of the positive values with the power-law and exponential fits
      of the automatically selected ``xmin``.

    Args:
        data: sequence of observations; converted with ``np.array``.
        title: figure title (also printed to stdout).
        save: when true the figure is written to ``save_path``, otherwise it
            is shown with ``plt.show()``.
        save_path: destination passed to ``plt.savefig``; required when
            ``save`` is true.

    Returns:
        ``(params1, params2)``: two ``(alpha, xmin, sigma)`` tuples, for the
        ``xmin=1`` fit and for the automatically selected ``xmin`` fit.

    Raises:
        ValueError: if ``save`` is true but no ``save_path`` is given.
    """
    if save and save_path is None:
        raise ValueError("save_path is required when save=True")

    # ``mpl_toolkits.axes_grid`` is no longer shipped with matplotlib;
    # ``axes_grid1`` provides the same ``inset_axes`` helper.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    data = np.array(data)
    fig = plt.figure(figsize=(18, 6))
    fig.suptitle(title)

    # === A ===
    ax1 = fig.add_subplot(1, 3, 1)
    # Linear x bins
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]  # drop the trailing entry so x lines up with y before masking
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    # Log-log: draw the probability density curve
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    ax1.set_xlabel('A')
    # Small histogram inset
    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from ``hist``; ``density`` is its replacement.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])
    # === A ===

    # === B ===
    annotation = ''
    ax2 = fig.add_subplot(1, 3, 2, sharey=ax1)
    # Log-log: draw the probability density curve
    print(title)
    print(pdf(data))
    print()
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    # Fit a power law and plot it
    fit = Fit(data, xmin=1, discrete=True,
              parameter_range={'alpha': [None, None]})
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    # alpha is the fitted exponent
    # xmin is the smallest x considered (kept non-zero); fixed to 1 here
    # sigma is the standard deviation reported for alpha
    params1 = (fit.power_law.alpha, fit.power_law.xmin, fit.power_law.sigma)
    annotation += '\':\' - alpha={:.2f}, xmin= {}, sigma={:.2f}'.format(*params1)

    # Unlike the first fitted line in ax2, xmin is not given here: the
    # optimal value is computed automatically.
    fit = Fit(data, discrete=True, parameter_range={'alpha': [-5, 10]})
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    params2 = (fit.power_law.alpha, fit.power_law.xmin, fit.power_law.sigma)
    annotation += '\n\'--\' - alpha={:.2f}, xmin= {}, sigma={:.2f}'.format(*params2)

    ax2.set_xlabel('B')
    ax2.set_ylabel(u"p(X)")
    ax2.set_xlim(ax1.get_xlim())
    annotate_coord = (0.05, 0.88)
    ax2.annotate(annotation, annotate_coord, xycoords="axes fraction")
    # === B ===

    # === C ===
    ax3 = fig.add_subplot(1, 3, 3, sharey=ax1)
    plot_pdf(data[data > 0], ax=ax3, color='b', linewidth=2)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    ax3.set_xlabel('C')
    # === C ===

    if save:
        plt.savefig(save_path)
    else:
        plt.show()

    return params1, params2
print "sum :%g, mean :%g" % (np.sum(data), np.mean(data)) return data #--------------------------------------------------------------# fig, ax = pl.subplots(1, figsize=(8, 10)) N = 5000 n = -2.6 xmin, xmax = 2.0, 10000.0 seed = 1234785 data = generate_power_law_dist(N, n, xmin, xmax, seed) counter = collections.Counter(data) pk = counter.values() k = counter.keys() pk = np.asarray(pk) / float(np.sum(pk)) fit = Fit(data) fit.power_law.plot_pdf(ax=ax, linestyle=':', color='g') # fit = Fit(data) print fit.power_law.alpha print fit.power_law.sigma ax.loglog(k, pk, '.') plot_pdf(data, color='r') pl.show()
def plot_basics(data, data_inst, fig, units):
    """Draw the power-law diagnostic panels for one data set on ``fig``.

    Three subplots are created in the grid given by the module-level
    ``n_graphs`` and ``n_data``. The first two (A and B) are created with
    ``visible=False``; they are still populated because panel C copies its
    axis limits from them. Panel C shows the PDF of the automatically
    selected-``xmin`` fit with its power-law and exponential fits and a legend.

    The panel-letter font comes from the module-level ``panel_label_font``.

    Args:
        data: numeric array supporting boolean-mask indexing
            (``data[data > 0]``), e.g. a NumPy array.
        data_inst: subplot index of panel A inside the grid.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label used for panels B and C.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    letter_xy = (-.1, .95)

    # ----- Panel A (hidden): linearly binned PDF and default-binned PDF -----
    hidden_a = fig.add_subplot(n_graphs, n_data, data_inst, visible=False)
    edges, density = pdf(data, linear_bins=True)
    nonzero = density > 0
    density = density[nonzero]
    edges = edges[:-1][nonzero]
    hidden_a.scatter(edges, density, color='r', s=.5)
    plot_pdf(data[data > 0], ax=hidden_a, color='b', linewidth=2)
    setp(hidden_a.get_xticklabels(), visible=False)

    # ----- Panel B (hidden): fits with xmin=1 and with optimal xmin -----
    hidden_b = fig.add_subplot(
        n_graphs, n_data, n_data + data_inst, sharex=hidden_a, visible=False
    )
    plot_pdf(data, ax=hidden_b, color='b', linewidth=2, label="pdf of data")
    fixed_fit = Fit(data, xmin=1, discrete=True)
    fixed_fit.power_law.plot_pdf(
        ax=hidden_b, linestyle=':', color='g', label="power law fit"
    )
    _ = fixed_fit.power_law.pdf()  # result is not used; call kept as in the original
    hidden_b.set_xlim(hidden_a.get_xlim())

    best_fit = Fit(data, discrete=True)
    best_fit.power_law.plot_pdf(
        ax=hidden_b, linestyle='--', color='g', label="power law fit--opt xmin"
    )
    setp(hidden_b.get_xticklabels(), visible=True)
    hidden_b.annotate("B", letter_xy, xycoords="axes fraction",
                      fontproperties=panel_label_font)
    hidden_b.set_ylabel(u"p(X)")
    hidden_b.legend(*hidden_b.get_legend_handles_labels(), loc=3)
    hidden_b.set_xlabel(units)

    # ----- Panel C: power law vs. exponential for the optimal fit -----
    shown_c = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    best_fit.power_law.plot_pdf(
        ax=shown_c, linestyle='--', color='g', label="power law fit\n(opt-min)"
    )
    best_fit.exponential.plot_pdf(
        ax=shown_c, linestyle='--', color='r', label="exponential fit\n(opt-min)"
    )
    best_fit.plot_pdf(ax=shown_c, color='b', linewidth=2, label="PDF\n(opt-min)")
    shown_c.set_ylim(hidden_b.get_ylim())
    shown_c.set_xlim(hidden_a.get_xlim())
    shown_c.legend(*shown_c.get_legend_handles_labels(), loc=3, fontsize=12)
    shown_c.set_xlabel(units, fontsize=15)
    shown_c.annotate("C", letter_xy, xycoords="axes fraction",
                     fontproperties=panel_label_font)
    shown_c.set_ylabel(u"p(X)", fontsize=15)
def plot_basics(data, data_inst, fig, units):
    """Draw panels A-C of the power-law diagnostic figure in a 4x2 grid.

    * A: linearly binned PDF (red dots) and default-binned PDF of the
      positive values (blue line).
    * B: PDF of ``data`` with the discrete power-law fit for ``xmin=1``.
    * C: PDF of that fit with its power-law, exponential and lognormal fits.

    Panel letters and the y-label are only drawn when ``data_inst == 1``.

    Args:
        data: numeric array supporting boolean-mask indexing
            (``data[data > 0]``), e.g. a NumPy array.
        data_inst: subplot index of panel A inside the grid.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label of panel C.

    Side effects:
        Changes the global matplotlib tick-padding rcParams.
    """
    # ----- setup -----
    from powerlaw import plot_pdf, Fit, pdf
    import pylab
    from pylab import setp
    from matplotlib.font_manager import FontProperties

    for rc_key in ('xtick.major.pad', 'ytick.major.pad'):
        pylab.rcParams[rc_key] = '8'

    letter_font = FontProperties().copy()
    letter_font.set_weight("bold")
    letter_font.set_size(30.0)
    letter_font.set_family("sans-serif")

    columns, rows = 2, 4
    letter_xy = (-.4, .95)
    first_instance = data_inst == 1

    def tag(axes, letter):
        # Bold panel letter in axes-fraction coordinates, left of the plot area.
        axes.annotate(letter, letter_xy, xycoords="axes fraction",
                      fontproperties=letter_font)

    # ----- Panel A -----
    panel_a = fig.add_subplot(rows, columns, data_inst)
    edges, density = pdf(data, linear_bins=True)
    nonzero = density > 0
    density = density[nonzero]
    edges = edges[:-1][nonzero]
    panel_a.scatter(edges, density, color='r', s=.5)
    plot_pdf(data[data > 0], ax=panel_a, color='b', linewidth=2)
    setp(panel_a.get_xticklabels(), visible=False)
    if first_instance:
        tag(panel_a, "A")

    # ----- Panel B -----
    panel_b = fig.add_subplot(rows, columns, columns + data_inst, sharex=panel_a)
    plot_pdf(data, ax=panel_b, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=panel_b, linestyle='--', color='g')
    _ = fit.power_law.pdf()
    panel_b.set_xlim((1, max(edges)))
    setp(panel_b.get_xticklabels(), visible=False)
    if first_instance:
        tag(panel_b, "B")
        panel_b.set_ylabel(u"p(X)")

    # ----- Panel C -----
    panel_c = fig.add_subplot(rows, columns, columns * 2 + data_inst)
    candidates = (
        (fit.power_law, '--', 'g'),
        (fit.exponential, '--', 'r'),
        (fit.lognormal, ':', 'r'),
    )
    for model, dash, colour in candidates:
        model.plot_pdf(ax=panel_c, linestyle=dash, color=colour)
    fit.plot_pdf(ax=panel_c, color='b', linewidth=2)
    panel_c.set_ylim(panel_b.get_ylim())
    panel_c.set_xlim(panel_a.get_xlim())
    if first_instance:
        tag(panel_c, "C")
    panel_c.set_xlabel(units)
def main():
    """Compute basic statistics of the video network and plot them.

    Loads daily view counts and the daily network snapshots, prints summary
    statistics, and writes a three-panel figure to
    ``../images/measure_basic_statistics.pdf``:

    (a) indegree CCDF on four target days plus a power-law reference line,
    (b) daily views against their percentile,
    (c) number of uploaded videos per year, stacked by genre.

    Finally two LaTeX-style tables are printed for the top videos by average
    indegree and by average views. The figure is also shown interactively on
    non-Linux platforms.

    Relies on module-level names such as ``T``, ``NUM_REL``, ``DataLoader``,
    ``Timer`` and ``ColorPalette``.
    """
    top_k = 20  # number of videos in each printed "top" table

    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    target_day_indices = [0, 15, 30, 45]
    color_cycle_4 = ColorPalette.CC4
    date_labels = [
        'Sep 01, 2018', 'Sep 16, 2018', 'Oct 01, 2018', 'Oct 16, 2018'
    ]

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()

    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # One list of per-video views for every target day
    target_day_view_list = [
        [embed_view_dict[embed][target_day] for embed in range(num_videos)]
        for target_day in target_day_indices
    ]

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    # daily indegree for each embed
    embed_indegree_dict = {
        embed: np.zeros((T, )) for embed in np.arange(num_videos)
    }
    zero_indegree_list = []  # fraction of zero-indegree videos for each day
    num_edges_list = []  # number of total edges for each day
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        indegree_list = []
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # snapshot files produced by this project.
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for tar_embed in range(num_videos):
                indegree_value = sum(
                    1 for x in network_dict[tar_embed] if x[1] < NUM_REL)
                embed_indegree_dict[tar_embed][t] = indegree_value
                indegree_list.append(indegree_value)
        zero_indegree_list.append(indegree_list.count(0) / num_videos)
        num_edges_list.append(sum(indegree_list))
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')
    print('\n>>> Average number of edges: {0:.0f}, max: {1:.0f}, min: {2:.0f}'.
          format(
              sum(num_edges_list) / len(num_edges_list), max(num_edges_list),
              min(num_edges_list)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4.5))
    ax1, ax2, ax3 = axes.ravel()

    # == == == == == == Part 4: Plot ax1 indegree CCDF == == == == == == #
    embed_avg_indegree_dict = defaultdict(float)
    for t in range(T):
        for embed in range(num_videos):
            embed_avg_indegree_dict[embed] += embed_indegree_dict[embed][t] / T

    indegree_ranked_embed_list = [
        x[0] for x in sorted(embed_avg_indegree_dict.items(),
                             key=lambda kv: kv[1],
                             reverse=True)
    ]
    top_indegree_embeds = indegree_ranked_embed_list[:top_k]
    popular_ranked_embed_list = [
        x[0] for x in sorted(
            embed_avg_view_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    top_popular_embeds = popular_ranked_embed_list[:top_k]

    for target_idx, target_day in enumerate(target_day_indices):
        indegree_list = [
            embed_indegree_dict[embed][target_day]
            for embed in range(num_videos)
        ]
        for threshold in (10, 20):
            print(
                'video with {0} indegree has more in-links than {1:.2f}% videos on date {2}'
                .format(threshold,
                        percentileofscore(indegree_list, threshold),
                        date_labels[target_idx]))

        plot_ccdf(indegree_list,
                  ax=ax1,
                  color=color_cycle_4[target_idx],
                  label=date_labels[target_idx])

    # compute the powerlaw fit
    powerlaw_fit = Fit(list(embed_avg_indegree_dict.values()))
    infer_alpha = powerlaw_fit.power_law.alpha
    p = powerlaw_fit.power_law.ccdf()
    # Reference line: first 90% of the fitted ccdf, shifted down by a factor
    # of ten so it does not overlap the empirical curves.
    n_ref = int(0.9 * len(p))
    ins_x_axis = powerlaw_fit.power_law.parent_Fit.data[:n_ref]
    ins_y_axis = 0.1 * p[:n_ref]

    ax1.plot(ins_x_axis, ins_y_axis, 'k:')
    ax1.text(0.4,
             0.6,
             r'$x^{{{0:.2f}}}$'.format(-infer_alpha + 1),
             size=12,
             ha='right',
             va='bottom',
             transform=ax1.transAxes)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel('indegree', fontsize=11)
    # Raw string: '\g' is not a valid escape sequence; the CCDF is P(X >= x).
    ax1.set_ylabel(r'$P(X \geq x)$', fontsize=11)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.set_title('(a) indegree distribution', fontsize=12)
    ax1.legend(frameon=False,
               fontsize=11,
               ncol=1,
               fancybox=False,
               shadow=True)

    mean_zero_indegree = sum(zero_indegree_list) / len(zero_indegree_list)

    ax1.axhline(y=1 - mean_zero_indegree,
                color='k',
                linestyle='--',
                zorder=30)
    ax1.text(0.96,
             0.9,
             '{0:.0f}% with 0 indegree'.format(mean_zero_indegree * 100),
             size=11,
             transform=ax1.transAxes,
             ha='right',
             va='top')

    # == == == == == == Part 5: Plot ax2 views distribution == == == == == == #
    percentiles = np.arange(100)
    for target_idx, views_list in enumerate(target_day_view_list):
        ax2.plot(percentiles,
                 np.percentile(views_list, percentiles),
                 color=color_cycle_4[target_idx],
                 label=date_labels[target_idx])
    ax2.set_yscale('log')
    ax2.set_xlabel('views percentile', fontsize=11)
    ax2.set_ylabel('num of views', fontsize=11)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b) daily views vs. its percentile', fontsize=12)

    avg_views_list = sorted(embed_avg_view_dict.values(), reverse=True)
    gini_coef = gini(avg_views_list)
    total_views = sum(avg_views_list)
    print('top 1% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.01 * num_videos)]) / total_views * 100))
    print('top 10% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.1 * num_videos)]) / total_views * 100))
    print('Gini coef: {0:.3f}'.format(gini_coef))

    spearman_degree = [
        embed_avg_indegree_dict[embed] for embed in range(num_videos)
    ]
    spearman_views = [
        embed_avg_view_dict[embed] for embed in range(num_videos)
    ]

    print(
        'Spearman correlation between views and indegree: {0:.4f}, pvalue: {1:.2f}'
        .format(*spearmanr(spearman_views, spearman_degree)))

    median_views = np.median(avg_views_list)
    top_views_90th = np.percentile(avg_views_list, 90)
    top_views_99th = np.percentile(avg_views_list, 99)
    ax2_xmin = ax2.get_xlim()[0]
    ax2_ymin = ax2.get_ylim()[0]

    # (percentile, value, text x, text y, text template) of each guide line
    guides = [
        (50, median_views, 0.49, 0.45, 'median views {0:,.0f}'),
        (90, top_views_90th, 0.88, 0.75, '90th views {0:,.0f}'),
        (99, top_views_99th, 0.91, 0.95, '99th views {0:,.0f}'),
    ]
    for pct, value, text_x, text_y, template in guides:
        ax2.plot((pct, pct), (ax2_ymin, value),
                 color='k',
                 linestyle='--',
                 zorder=30)
        ax2.plot((ax2_xmin, pct), (value, value),
                 color='k',
                 linestyle='--',
                 zorder=30)
        ax2.text(text_x,
                 text_y,
                 template.format(value),
                 size=11,
                 transform=ax2.transAxes,
                 ha='right',
                 va='bottom')

    # == == == == == == Part 6: Plot ax3 video uploading trend == == == == == == #
    first_year, last_year = 2009, 2017
    x_axis = range(first_year, last_year + 1)
    x_labels = ["'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16", "'17"]
    upload_mat = np.zeros((len(x_axis), 8))

    target_topics = [
        'Pop_music', 'Rock_music', 'Hip_hop_music', 'Independent_music',
        'Country_music', 'Electronic_music', 'Soul_music', 'Others'
    ]
    topic_labels = [
        'Pop', 'Rock', 'Hip hop', 'Independent', 'Country', 'Electronic',
        'Soul', 'Others'
    ]

    color_cycle_8 = ColorPalette.CC8

    data_loader.load_embed_content_dict()
    embed_title_dict = data_loader.embed_title_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    embed_genre_dict = data_loader.embed_genre_dict

    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if not first_year <= upload_year <= last_year:
            continue
        year_idx = upload_year - first_year
        genres = embed_genre_dict[embed]
        if len(genres) == 0:
            # add one to "Others" genre
            upload_mat[year_idx, 7] += 1
        else:
            # a video with several genres is split evenly between them
            for genre in genres:
                upload_mat[year_idx,
                           target_topics.index(genre)] += 1 / len(genres)

    print()
    print([
        '{0}: {1}'.format(topic, int(num))
        for topic, num in zip(target_topics, np.sum(upload_mat, axis=0))
    ])

    stackedBarPlot(ax=ax3,
                   data=upload_mat,
                   cols=color_cycle_8,
                   edgeCols=['#000000'] * 8,
                   xlabel='uploaded year',
                   ylabel='num of videos',
                   scale=False,
                   endGaps=True)

    ax3.tick_params(axis='both', which='major', labelsize=9)
    ax3.set_xticks(np.arange(len(x_axis)))
    ax3.set_xticklabels(x_labels)
    ax3.yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    ax3.legend([
        plt.Rectangle((0, 0), 1, 1, fc=c, ec='k', alpha=0.6)
        for c in color_cycle_8
    ],
               topic_labels,
               fontsize=9,
               frameon=False,
               handletextpad=0.2,
               columnspacing=0.3,
               ncol=4,
               bbox_to_anchor=(1, -0.12),
               bbox_transform=ax3.transAxes,
               fancybox=False,
               shadow=True)
    ax3.set_title('(c) VEVO videos uploading trend', fontsize=12)

    # == == == == == == Part 7: Print the top-video tables == == == == == == #
    union_top_set = set(top_indegree_embeds).union(top_popular_embeds)
    print('\n>>> Size of the union set at cutoff {0}:'.format(top_k),
          len(union_top_set))

    # 1-based rank of every video in both rankings (one pass instead of a
    # linear ``list.index`` scan per table cell)
    indegree_rank = {
        embed: rank
        for rank, embed in enumerate(indegree_ranked_embed_list, start=1)
    }
    popular_rank = {
        embed: rank
        for rank, embed in enumerate(popular_ranked_embed_list, start=1)
    }
    header = '{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.format(
        'Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views', '-rank')

    def print_rank_rows(embeds):
        # One LaTeX table row per video: title, artist, age in days,
        # average indegree and its rank, average views and their rank.
        for embed in embeds:
            artist_part, title_part = embed_title_dict[embed].split(' - ',
                                                                    1)[:2]
            print(
                '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
                .format(
                    title_part.split('(')[0].split('ft')[0].strip(),
                    artist_part.split('&')[0].split(',')[0].strip(),
                    '{0:,}'.format(
                        (datetime(2018, 11, 2) -
                         str2obj(embed_uploadtime_dict[embed])).days),
                    '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                    '{0:,}'.format(indegree_rank[embed]),
                    '{0:,}'.format(int(embed_avg_view_dict[embed])),
                    '{0:,}'.format(popular_rank[embed])))

    print(header)
    print_rank_rows(top_indegree_embeds)

    print('\n' + header)
    print_rank_rows(top_popular_embeds)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_basic_statistics.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    """Compare entity frequency distributions of sampled and complete data.

    For users and hashtags of the ``cyberbullying`` data set the function

    * counts entity frequencies in the sample file and in the complete file,
    * simulates a uniform random sample of the complete file by keeping each
      line with probability ``rho``,
    * fits a power law to each frequency list and prints ``alpha``/``xmin``,
    * pads the sample with the inferred number of missing (zero-frequency)
      entities and the uniform sample with the actual number,
    * prints pairwise two-sample KS tests, and
    * plots the three CCDFs per entity.

    The figure is saved to ``../images/entity_freq_dist.pdf`` and also shown
    on non-Linux platforms. The random sample is not seeded, so its numbers
    vary between runs.
    """
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    archive_dir = '../data/{0}_out'.format(app_name)
    entities = ['user', 'hashtag']
    rho = 0.5272

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    for ax_idx, entity in enumerate(entities):
        if entity == 'user':

            def entities_in(line):
                # the user is the second comma-separated field
                return [line.rstrip().split(',')[1]]
        else:

            def entities_in(line):
                # every field after the first is an entity; case-insensitive
                return [item.lower() for item in line.rstrip().split(',')[1:]]

        sample_entity_freq_dict = defaultdict(int)
        complete_entity_freq_dict = defaultdict(int)
        uni_random_entity_freq_dict = defaultdict(int)

        sample_path = os.path.join(
            archive_dir, '{0}_{1}_all.txt'.format(entity, app_name))
        complete_path = os.path.join(
            archive_dir, 'complete_{0}_{1}.txt'.format(entity, app_name))
        # ``with`` closes both files even if parsing raises
        with open(sample_path, 'r', encoding='utf-8') as sample_datefile, \
                open(complete_path, 'r', encoding='utf-8') as complete_datefile:
            for line in sample_datefile:
                for item in entities_in(line):
                    sample_entity_freq_dict[item] += 1
            for line in complete_datefile:
                items = entities_in(line)
                for item in items:
                    complete_entity_freq_dict[item] += 1
                # one Bernoulli(rho) draw per line of the complete set
                if np.random.random_sample() <= rho:
                    for item in items:
                        uni_random_entity_freq_dict[item] += 1

        # compute the powerlaw fit in the complete set
        complete_freq_list = list(complete_entity_freq_dict.values())
        complete_powerlaw_fit = Fit(complete_freq_list)
        complete_alpha = complete_powerlaw_fit.power_law.alpha
        complete_xmin = complete_powerlaw_fit.power_law.xmin
        print('{0} complete set alpha {1}, xmin {2}'.format(
            entity, complete_alpha, complete_xmin))
        plot_ccdf(complete_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='-',
                  label='complete')

        # compute the powerlaw fit in the sample set
        # infer the number of missing entities
        sample_freq_list = list(sample_entity_freq_dict.values())
        sample_freq_counter = Counter(sample_freq_list)

        # number of entities observed exactly 1, 2, ..., num_interest times
        num_interest = 100
        sample_freq_list_top100 = [
            sample_freq_counter[freq] for freq in range(1, num_interest + 1)
        ]

        inferred_num_missing = infer_missing_num(sample_freq_list_top100,
                                                 rho=rho,
                                                 m=num_interest)
        corrected_sample_freq_list = sample_freq_list + [
            0
        ] * inferred_num_missing
        sample_powerlaw_fit = Fit(corrected_sample_freq_list)
        sample_alpha = sample_powerlaw_fit.power_law.alpha
        sample_xmin = sample_powerlaw_fit.power_law.xmin
        print('{0} sample set alpha {1}, xmin {2}'.format(
            entity, sample_alpha, sample_xmin))
        plot_ccdf(corrected_sample_freq_list,
                  ax=axes[ax_idx],
                  color=blue,
                  ls='-',
                  label='sample')

        # compute the powerlaw fit in uniform random sample
        uni_random_num_missing = len(complete_entity_freq_dict) - len(
            uni_random_entity_freq_dict)
        uni_random_freq_list = list(uni_random_entity_freq_dict.values())
        uni_random_freq_list = uni_random_freq_list + [
            0
        ] * uni_random_num_missing
        uni_random_powerlaw_fit = Fit(uni_random_freq_list)
        uni_random_alpha = uni_random_powerlaw_fit.power_law.alpha
        uni_random_xmin = uni_random_powerlaw_fit.power_law.xmin
        print('{0} uniform random sampling alpha {1}, xmin {2}'.format(
            entity, uni_random_alpha, uni_random_xmin))
        plot_ccdf(uni_random_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='--',
                  label='uniform random')

        print('inferred missing', inferred_num_missing)
        print('empirical missing',
              len(complete_entity_freq_dict) - len(sample_entity_freq_dict))
        print('uniform random missing', uni_random_num_missing)

        print('KS test (sample, uniform)')
        print(stats.ks_2samp(corrected_sample_freq_list,
                             uni_random_freq_list))

        print('KS test (sample, complete)')
        print(stats.ks_2samp(corrected_sample_freq_list, complete_freq_list))

        print('KS test (uniform, complete)')
        print(stats.ks_2samp(uni_random_freq_list, complete_freq_list))

        # symlog keeps the padded zero frequencies on the x axis
        axes[ax_idx].set_xscale('symlog')
        axes[ax_idx].set_yscale('log')
        axes[ax_idx].set_xlabel('frequency', fontsize=16)
        axes[ax_idx].tick_params(axis='both', which='major', labelsize=16)

    axes[0].set_xticks([0, 1, 100, 10000])
    axes[0].set_yticks([1, 0.01, 0.0001, 0.000001])
    # Raw string: '\g' is not a valid escape sequence in a normal literal.
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='lower left')
    axes[0].set_title('(a) user posting', fontsize=18, pad=-3 * 72, y=1.0001)

    axes[1].set_xticks([0, 1, 100, 10000, 1000000])
    axes[1].set_yticks([1, 0.1, 0.001, 0.00001])
    axes[1].set_title('(b) hashtag', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_freq_dist.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def plot_basics(data, data_inst, fig, units):
    """Draw one column of three stacked PDF panels for ``data`` on ``fig``.

    The column is placed at position ``data_inst`` of a subplot grid with
    ``n_graphs`` rows and ``n_data`` columns; both are read from module scope.

    * Top panel: linearly binned PDF (thin red line and scatter) together
      with the ``plot_pdf`` curve at its default binning (blue), plus an
      inset histogram of the raw data.
    * Middle panel: PDF of the data with two discrete power-law fits, one
      with ``xmin=1`` (dotted) and one with the ``xmin`` picked by ``Fit``
      (dashed).
    * Bottom panel: PDF of the fitted region with the power-law (green) and
      exponential (red) fits.

    Args:
        data: values to plot. It is indexed with a boolean mask
            (``data[data > 0]``), so presumably a NumPy array.
        data_inst: 1-based column index. The first column additionally gets
            the panel letters "A", "B" and "C".
        fig: matplotlib figure the subplots are added to.
        units: label for the x axis of the bottom panel.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp
    # ``mpl_toolkits.axes_grid`` was removed from matplotlib; ``axes_grid1``
    # provides the same ``inset_axes`` helper.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    annotate_coord = (-.4, .95)

    # --- top panel: linear vs. default binning ---------------------------
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    plot_pdf(data[data > 0], ax=ax1, linear_bins=True, color='r',
             linewidth=.5)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    x = x[:-1]  # drop the last bin edge so x lines up with y
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    setp(ax1.get_xticklabels(), visible=False)
    ax1.set_yticks(ax1.get_yticks()[::2])  # keep every other tick
    if data_inst == 1:
        ax1.annotate("A", annotate_coord, xycoords="axes fraction",
                     fontsize=14)

    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from ``hist``; ``density`` is its replacement.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # --- middle panel: power-law fits with fixed and fitted xmin ---------
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    ax2.set_xlim(ax1.get_xlim())

    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    setp(ax2.get_xticklabels(), visible=False)
    if ax2.get_ylim()[1] > 1:
        # cap the upper y limit at 1
        ax2.set_ylim(ax2.get_ylim()[0], 1)
    ax2.set_yticks(ax2.get_yticks()[::2])
    if data_inst == 1:
        ax2.annotate("B", annotate_coord, xycoords="axes fraction",
                     fontsize=14)
        # NOTE(review): the original indentation was lost; the y label is
        # assumed to belong to the first column only -- confirm.
        ax2.set_ylabel(r"$p(X)$")

    # --- bottom panel: fitted region, power law vs. exponential ----------
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_yticks(ax3.get_yticks()[::2])
    ax3.set_xlim(ax1.get_xlim())
    if data_inst == 1:
        ax3.annotate("C", annotate_coord, xycoords="axes fraction",
                     fontsize=14)

    ax3.set_xlabel(units)
def out_degree(adj):
    """Return the number of non-zero entries of ``adj`` counted along axis 1."""
    return np.count_nonzero(adj, axis=1)


funcs = [in_sum, out_sum, in_degree, out_degree]

# One row of axes per graph type, one column per statistic in ``funcs``.
for i, g_type in enumerate(GRAPH_TYPES):
    g_type_label = GRAPH_TYPE_LABELS[i]
    adj = load_everything(g_type, version=BRAIN_VERSION)
    for j, func in enumerate(funcs):
        raw_vals = func(adj)
        vals = raw_vals[raw_vals > 0]  # only strictly positive values are fitted
        panel = axs[i, j]
        plot_ccdf(data=vals, ax=panel)
        results = Fit(vals)
        # Both fitted curves share everything except colour and label.
        fit_style = dict(ax=panel, shift_by="original_data", linestyle="--")
        line = results.power_law.plot_ccdf(
            c="r", label="Power law", **fit_style
        )
        results.lognormal.plot_ccdf(
            c="g", label="Lognormal", **fit_style
        )
class _Analyzer(ABC):
    """Abstract driver that fits a power law per iteration and records stats.

    For each iteration id the analyzer logs the iteration, lets the subclass
    select the input array (``_set_curr_input_array``), fits the non-zero
    signed returns with ``Fit`` and writes the computed statistics into
    ``self.res.df``. Subclasses must implement ``_set_curr_input_array`` and
    provide ``self.iter_id_keys`` (an iterator over iteration ids).
    """

    def __init__(self, settings):
        """Store the settings groups and create the returns/results holders."""
        self.sc = settings.ctrl
        self.sd = settings.data
        self.sa = settings.anal

        # TODO: factor setting of these boolean flags into own method
        # Always define the flag so that later reads cannot fail with
        # AttributeError when no txmin_map is configured.
        txmin_map = self.sa.txmin_map
        self._use_pct_file = bool(txmin_map) and any(
            'PCT' in col_hdr for col_hdr in txmin_map.values())

        self.rtn = Returns(settings)
        self.res = Results(settings)

        # short key -> distribution name passed to Fit.distribution_compare
        self._distros_to_compare = {'tpl': 'truncated_power_law',
                                    'exp': 'exponential',
                                    'lgn': 'lognormal'}

    # # # iteration state DEPENDENT (or aware) methods # # #

    def _log_curr_iter(self):
        """Print which group, tail and date range is being analyzed."""
        # TODO: factor out repetitive log? (static: date, dynamic: group_label)
        gtyp, *date, tail = self.curr_iter_id
        grp_tail_log = (f"Analyzing {tail.name.upper()} tail of time series "
                        f"for {self.sd.grouping_type.title()} '{gtyp}' ")
        if date:  # dynamic approach: the iteration id carries a date
            df = date[0]
            di = self.sa.get_dyn_lbd(df)
            # NOTE: di above is 1st date w/ price, not 1st date w/ return
        else:  # static approach
            di, df = self.sd.date_i, self.sd.date_f
        date_log = f"b/w [{di}, {df}]"
        print(grp_tail_log + date_log)

    @abstractmethod
    def _set_curr_input_array(self):
        # NOTE: storage posn into results_df (curr_df_pos) also set here
        pass

    def __get_xmin(self):
        """Return the xmin to pass to ``Fit`` according to ``sa.xmin_rule``.

        Raises:
            TypeError: a fractional percentile read from file is not in [0, 1].
            AttributeError: ``sa.xmin_rule`` is not a known rule.
        """
        rule, qnty = self.sa.xmin_rule, self.sa.xmin_qnty
        if rule in {"clauset", "manual"}:
            xmin = qnty  # ie. {None, user-input-ℝ} respectively
        elif rule == "percent":
            xmin = np.percentile(self.curr_signed_returns, qnty)
        elif rule == "std-dev":
            xmin = self.__calc_stdv_xmin(qnty)
        elif rule in {"file", "average"}:
            assert self.sa.use_dynamic,\
                ("static approach does NOT currently support passing "
                 "xmin data by file")  # TODO: add file support for -a static?
            grp, date, tail = self.curr_iter_id
            txmin = self.sa.txmin_map[tail]
            xmin = qnty.loc[date, f"{txmin} {grp}"]
            # ``percent`` stays None when the file value is a plain number
            # that should be used as xmin directly.
            percent = None
            if isinstance(xmin, str) and xmin.endswith("%"):
                # b/c values containing '%' in xmins_df must be str
                percent = float(xmin[:-1])
            elif isinstance(xmin, (int, float)) and self._use_pct_file:
                if not (0 <= xmin <= 1):
                    raise TypeError("xmin percentile threshold value for "
                                    f"{self.curr_iter_id} is outside of "
                                    "[0, 1] (i.e. 0-100%)")
                percent = xmin * 100
            if percent is None:
                xmin = float(xmin)
            else:
                xmin = np.percentile(self.curr_signed_returns, percent)
        else:
            raise AttributeError("this should never be reached!")
        return xmin

    def __calc_stdv_xmin(self, factor):
        """Return ``|mean + tail.value * factor * stdev|`` of the returns."""
        mean = st.fmean(self.curr_returns_array)
        stdv = st.stdev(self.curr_returns_array)
        *_, tail = self.curr_iter_id
        assert mean < factor * stdv
        return abs(mean + tail.value * factor * stdv)  # tail.value ∈ {1, -1}

    def _fit_curr_data(self):
        """Fit the non-zero signed returns and store it in ``self.curr_fit``."""
        data = self.curr_signed_returns
        data = data[np.nonzero(data)]  # only use non-zero elements to do Fit
        xmin = self.__get_xmin()
        self.curr_fit = Fit(data=data, xmin=xmin,
                            discrete=self.sa.fit_discretely)

    @staticmethod
    def gen_rmsf(mmt_func):  # rmsf: Returns Moments Statistics Functions
        """Return ``mmt_func`` plus variants for positive / negative values.

        The two variants apply ``mmt_func`` to ``rv[rv > 0]`` and
        ``rv[rv < 0]`` and yield ``np.nan`` when it raises
        ``statistics.StatisticsError``. The first element of the returned
        tuple is ``mmt_func`` itself, unwrapped.
        """
        def mf_wrapped(mmt_func, rtrn_vec):
            try:
                return mmt_func(rtrn_vec)
            except st.StatisticsError:
                return np.nan
        return (mmt_func,
                lambda rv: mf_wrapped(mmt_func, rv[rv > 0]),
                lambda rv: mf_wrapped(mmt_func, rv[rv < 0]))

    def __get_curr_rtrn_stats(self):
        """Map each label in ``sd.rstats_collabs`` to its statistic value."""
        # NOTE: functions in below list must match order in output_columns.yaml
        rs_fns = (len, lambda r: np.count_nonzero(r == 0), np.count_nonzero,
                  *_Analyzer.gen_rmsf(st.fmean),
                  *_Analyzer.gen_rmsf(st.stdev),
                  *_Analyzer.gen_rmsf(scipy.stats.skew),
                  *_Analyzer.gen_rmsf(scipy.stats.kurtosis),)
        rstats_fmap = {self.sd.rstats_collabs[i]: rs_fn
                       for i, rs_fn in enumerate(rs_fns)}
        return {rstat: rstats_fmap[rstat](self.curr_returns_array)
                for rstat in self.sd.rstats_collabs}

    def __get_curr_tail_stats(self):
        """Collect the tail statistics requested in ``sd.tstats_collabs``."""
        alpha, xmin, sigma = (getattr(self.curr_fit.power_law, prop)
                              for prop in ('alpha', 'xmin', 'sigma'))
        elm_in_fit = self.curr_signed_returns >= xmin
        fitted_vec = self.curr_signed_returns[elm_in_fit]
        xmax = max(fitted_vec)
        xmean = fitted_vec.mean()
        xstdv = fitted_vec.std()
        abs_len = len(fitted_vec)
        if self.sa.run_ks_test is True:
            # TODO: try compute ks_pv using MATLAB engine & module, and time
            ks_pv, _ = plpva(self.curr_signed_returns, xmin, 'reps',
                             self.sa.ks_iter, 'silent')
        # The requested stat names are looked up among the local variables
        # above, so a stat that was not computed (e.g. ks_pv when the KS test
        # is disabled) is simply left out of the result.
        locs = locals()
        return {('tail-statistics', stat): locs.get(stat)
                for st_type, stat in self.sd.tstats_collabs if stat in locs}

    def __get_curr_logl_stats(self):
        """Compare the power law against each alternative distribution."""
        # compute (R, p)-pairs (x3) using powerlaw.Fit.distribution_compare
        logl_stats = {key: dict(zip(('R', 'p'),
                                    self.curr_fit.distribution_compare(
                                        'power_law', distro,
                                        normalized_ratio=True)))
                      for key, distro in self._distros_to_compare.items()}
        return {('log-likelihoods', f"{dist}_{stat}"): val
                for dist, stats in logl_stats.items()
                for stat, val in stats.items()}

    def __get_curr_plfit_stats(self):
        """Merge tail stats with the (optional) log-likelihood stats."""
        tail_stats = self.__get_curr_tail_stats()
        logl_stats = (self.__get_curr_logl_stats()
                      if self.sa.compare_distros else {})
        return {**tail_stats, **logl_stats}

    def __get_calcd_substats_map(self, sstype):
        """Return ``{column key: value}`` for the 'plfit' or 'returns' stats.

        An empty dict is returned when the settings say that this group of
        statistics is not needed (or, for returns, already filled in).
        """
        idx, col = self.curr_df_pos  # type(idx)==str; type(col)==tuple
        if sstype == 'plfit':
            stcalc_fn = self.__get_curr_plfit_stats
            top_grp = col if self.sa.use_dynamic else (col,)
            need_ss = self.sa.analyze_tails
        elif sstype == 'returns':
            stcalc_fn = self.__get_curr_rtrn_stats
            top_grp = ((col,) if not self.sa.analyze_tails else
                       (col[0],) if self.sa.use_dynamic else ())
            # NOTE: hasnans check below on (<col>, 'rtrn-stats') Rm's redundant
            # calc only works for 1-proc b/c multiproc only updts res_df at end
            rstat_uncalcd = self.res.df.loc[
                idx, top_grp + ('returns-statistics',)].hasnans
            need_ss = self.sa.calc_rtrn_stats and rstat_uncalcd
        return ({top_grp + tuple(ss_key): ss_val
                 for ss_key, ss_val in stcalc_fn().items()}
                if need_ss else {})

    def _gset_curr_partial_results(self, action):
        """Compute the current iteration's stats, then store or return them.

        With ``action == 'store'`` the results row is updated in place; with
        ``action == 'return'`` the pair ``(idx, series)`` is returned.
        """
        fstats_map = self.__get_calcd_substats_map('plfit')
        rstats_map = self.__get_calcd_substats_map('returns')
        # TODO: use np.ndarray instead of pd.Series (wasteful) --> order later
        curr_part_res_series = pd.Series({**fstats_map, **rstats_map})

        idx, _ = self.curr_df_pos
        if action == 'store':
            # NOTE(review): this updates the object returned by .loc[idx];
            # whether that writes through to res.df depends on pandas
            # view/copy behaviour -- confirm.
            self.res.df.loc[idx].update(curr_part_res_series)
            # TODO: consider using pd.DataFrame.replace(, inplace=True) instead
            # TODO: can also order stats results first, then assign to DF row
        elif action == 'return':
            return idx, curr_part_res_series

    # # # orchestration / driver methods # # #

    # convenience wrapper to keep things tidy
    def _run_curr_iter_fitting(self):
        self._log_curr_iter()
        self._set_curr_input_array()
        self._fit_curr_data()

    # runs analysis on data ID'd by the next iteration of the stateful iterator
    def _analyze_next(self):  # TODO: combine _analyze_next & _analyze_iter??
        self.curr_iter_id = next(self.iter_id_keys)  # set in subclasses
        self._run_curr_iter_fitting()
        self._gset_curr_partial_results('store')

    # runs analysis from start to finish, in 1-process + single-threaded mode
    def analyze_sequential(self):
        while True:
            try:
                self._analyze_next()
            except StopIteration:
                break

    # runs analysis for one iteration of analysis given arbitrary iter_id
    def _analyze_iter(self, iter_id):  # NOTE: use this to resume computation
        print(f"### DEBUG: PID {getpid()} analyzing iter {iter_id}",
              file=sys.stderr)

        self.curr_iter_id = iter_id
        self._run_curr_iter_fitting()
        return self._gset_curr_partial_results('return')

    # runs analysis in multiprocessing mode
    def analyze_multiproc(self):
        # TODO: https://stackoverflow.com/a/52596590/5437918 (use shared DBDFs)
        iter_id_keys = tuple(self.iter_id_keys)

        # TODO: look into Queue & Pipe for sharing data
        with Pool(processes=self.sc.nproc) as pool:
            # TODO checkout .map alternatives: .imap, .map_async, etc.
            # TODO: optimize chunksize below
            # pool.map already returns a list of (idx, series) tuples
            restup_ls = pool.map(self._analyze_iter, iter_id_keys)

        # TODO: update res_df more efficiently, ex. pd.df.replace(), np.ndarray
        for idx, res in restup_ls:
            # if use '+' NOTE that DFs init'd w/ NaNs
            self.res.df.loc[idx].update(res)

    # top-level convenience method that autodetects how to run tail analysis
    def analyze(self):
        """Run sequentially for one process, via a pool for more than one.

        Raises:
            TypeError: ``sc.nproc`` is zero or negative.
        """
        nproc = self.sc.nproc
        # TODO: add other conditions for analyze_sequential (ex. -a static)
        if nproc == 1:
            self.analyze_sequential()
        elif nproc > 1:
            self.analyze_multiproc()
        else:  # if 0 or negative number of processors got through to here
            raise TypeError(f'Cannot perform analysis with {nproc} processes')

    def get_resdf(self):
        """Prettify the results DataFrame and return it."""
        # TODO: final clean ups of DF for presentation:
        #       - use .title() on all index labels, then write to file
        self.res.prettify_df()
        return self.res.df
def _fit_curr_data(self):
    """Fit the non-zero entries of the current signed returns.

    The threshold comes from ``self.__get_xmin()`` and the resulting ``Fit``
    object is stored on ``self.curr_fit``.
    """
    signed_returns = self.curr_signed_returns
    # zeros are dropped before fitting
    nonzero_returns = signed_returns[np.nonzero(signed_returns)]
    threshold = self.__get_xmin()
    self.curr_fit = Fit(
        data=nonzero_returns,
        xmin=threshold,
        discrete=self.sa.fit_discretely,
    )
def main():
    """Compute graph statistics, write a LaTeX table and a histogram figure.

    Steps:
      1. Load the reference data and build the graph.
      2. Save in/out degree sequences and clustering coefficients to text
         files under ``../../data``.
      3. Fit power laws to the degree sequences.
      4. Measure the largest weakly connected component and the share of
         single-node components.
      5. Write ``graph-stats-<date>.tex`` and
         ``figures/histograms_onerow-<date>.pdf``.
    """
    # Load graph
    name = '../../data/internal-references-pdftotext.json.gz'
    q = ia.loaddata(fname=name)
    G = ia.makegraph(q)

    # basic stats
    N_nodes, N_edges = G.number_of_nodes(), G.number_of_edges()

    # Degree
    t1 = time.time()
    in_deg = [d for n, d in G.in_degree()]
    out_deg = [d for n, d in G.out_degree()]
    np.savetxt('../../data/in_degree.txt', in_deg)
    np.savetxt('../../data/out_degree.txt', out_deg)
    mean_k = 2 * np.mean(in_deg)
    t2 = time.time()
    print('degree took ' + str((t2 - t1) / 60.0) + ' mins')

    # Find powerlaw fits
    # NOTE(review): xmin is pinned to 0 here, so the printed x_min values are
    # not fitted -- confirm this is intended.
    fit_in, fit_out = Fit(in_deg, xmin=0), Fit(out_deg, xmin=0)
    alpha_in = np.round(fit_in.power_law.alpha, 2)
    xmin_in = np.round(fit_in.power_law.xmin, 2)
    alpha_out = np.round(fit_out.power_law.alpha, 2)
    xmin_out = np.round(fit_out.power_law.xmin, 2)
    print('For power law fitting in-degree: x_min = ' + str(xmin_in))
    print('For power law fitting out-degree: x_min = ' + str(xmin_out) + '\n')

    # Clustering coeff
    t1 = time.time()
    cs = list(nx.clustering(G).values())
    np.savetxt('../../data/clustering_c.txt', cs)
    mean_C = np.round(np.mean(cs), 2)
    t2 = time.time()
    print('cluster coeff took ' + str((t2 - t1) / 60.0) + ' mins')

    # Size of the biggest component and number of isolated nodes.
    # The components are materialised once and reused for both numbers.
    t1 = time.time()
    comps = list(nx.weakly_connected_components(G))
    biggest = max(comps, key=len)
    size_WCC = float(len(biggest))
    fraction_WCC = np.round(size_WCC / N_nodes, 2)

    num_isolated = sum(1 for cc in comps if len(cc) == 1)
    fraction_isolated = np.round((1.0 * num_isolated) / N_nodes, 2)
    t2 = time.time()
    print('cluster size dist ' + str((t2 - t1) / 60.0) + ' mins')

    # results
    stats = [
        N_nodes, N_edges, mean_k, alpha_in, alpha_out, mean_C, fraction_WCC,
        fraction_isolated
    ]
    print(stats)

    # Stuff for tables
    OpenArXiv = ['openArXiv']
    OpenArXiv.extend('{:.3f}'.format(n) for n in stats)

    # Automatically make table!
    datenow = str(datetime.now()).split()[0]
    with open('graph-stats-{}.tex'.format(datenow), 'w') as fout:
        fout.write(make_latex_table([Header, OpenArXiv, WoS, CiteSeer, ArXiv]))

    #### MAKE FIGURE
    tick_size = 20
    axis_size = 30
    label_size = 28
    label_y_position = 1.10
    inset_size = 18
    plt.figure(figsize=(20, 5))

    # Histogram in-degree
    n_bins = 30
    ax1 = plt.subplot(131)
    plt.hist(in_deg, alpha=0.75, bins=n_bins)
    plt.xlabel('$k_{in}$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.spines["top"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax1.text(-0.025, label_y_position, 'a', transform=ax1.transAxes,
             fontsize=label_size, fontweight='bold', va='top', ha='right')
    ax1.text(0.9, 0.55, '', transform=ax1.transAxes, fontsize=inset_size,
             va='top', ha='right')

    # Histogram out-degree
    ax2 = plt.subplot(132)
    plt.hist(out_deg, alpha=0.75, bins=n_bins)
    plt.xlabel('$k_{out}$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax2.spines["top"].set_visible(False)
    ax2.spines["right"].set_visible(False)
    ax2.text(-0.025, label_y_position, 'b', transform=ax2.transAxes,
             fontsize=label_size, fontweight='bold', va='top', ha='right')
    # Placeholder text is positioned in this panel's own axes coordinates
    # (it previously used the first panel's transform).
    ax2.text(0.9, 0.55, '', transform=ax2.transAxes, fontsize=inset_size,
             va='top', ha='right')

    # Histogram clustering coefficients
    ax3 = plt.subplot(133)
    plt.hist(cs, alpha=0.75, bins=n_bins)
    ax3.set_xscale('log')
    ax3.set_yscale('log')
    plt.xlabel('$C$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax3.spines["top"].set_visible(False)
    ax3.spines["right"].set_visible(False)
    ax3.text(-0.025, label_y_position, 'c', transform=ax3.transAxes,
             fontsize=label_size, fontweight='bold', va='top', ha='right')
    # Same fix as above: use this panel's own transform.
    ax3.text(0.9, 0.55, '', transform=ax3.transAxes, fontsize=inset_size,
             va='top', ha='right')

    plt.tight_layout()
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/histograms_onerow-{}.pdf'.format(datenow))
def _get_fit_obj(self, data, xmin=None):
    """Build a ``Fit`` object from the non-zero elements of ``data``.

    Args:
        data: values to fit; indexed with the result of ``np.nonzero``.
        xmin: optional threshold, forwarded to ``self.__get_xmin`` which
            returns the value actually passed to ``Fit``.

    Returns:
        The ``Fit`` instance for the filtered data.
    """
    # NOTE: only keep/use non-zero elements. np.nonzero returns *indices*,
    # so it has to be used to index the data rather than replace it.
    # TODO: confirm data is always of np.ndarray
    data = data[np.nonzero(data)]
    # Fit discretely for anything that is not declared continuous (the flag
    # was previously False on both branches).
    discrete = self.ds.data_nature != 'continuous'
    xmin = self.__get_xmin(xmin=xmin, data=data)
    return Fit(data, discrete=discrete, xmin=xmin)