def close(self):
    """Plot the 10/50/90 percent score CDFs and write their stats and points.

    Reads ``self.scores10``, ``self.scores50`` and ``self.scores90``, runs each
    through ``ecdf`` and draws the three resulting curves on a single figure.
    Afterwards one ``.stats`` file (raw scores) and one ``.cdf`` file (curve
    points) is written per series. Every output path is ``self.outf`` joined
    with ``self.cname`` plus a fixed suffix.
    """
    def out_path(suffix):
        # All artefacts share the "<outf>/<cname><suffix>" naming scheme.
        return os.path.join(self.outf, self.cname + suffix)

    # (file tag, raw scores, line style) for each curve, in plotting order.
    series = (
        ('10', self.scores10, 'k:'),
        ('50', self.scores50, 'r'),
        ('90', self.scores90, 'k-.'),
    )

    # ecdf results are indexed as [0] -> x values, [1] -> y values.
    cdfs = [ecdf(scores) for _, scores, _ in series]
    curves = [
        XYPoints(cdf[0], cdf[1], '', fmt)
        for cdf, (_, _, fmt) in zip(cdfs, series)
    ]

    xy_plot(
        *curves,
        xlabel=r'Fraction of days since upload - $f$',
        ylabel=r'Prob. (Fraction of days since upload $\leq$ f)',
        legloc='lower center',
        legborder=False,
        legend=True,
        xmin=0.0,
        xmax=1.0,
        ymin=0.0,
        ymax=1.0,
        outputf=out_path('-105090-cdf.png'),
    )

    for tag, scores, _ in series:
        write_stats_to_file(scores, out_path('-' + tag + 'percent.stats'))

    for cdf, (tag, _, _) in zip(cdfs, series):
        write_xy_to_file(cdf[0], cdf[1], out_path('-' + tag + 'percent.cdf'))
def real_close(self, tdata, append):
    """Plot CDFs of the first/second/third peak fractions and dump them.

    Args:
        tdata: iterable of 3-item rows ``(tf, ts, tt)``; each position is
            collected into its own series.
        append: string appended to the legend labels and embedded in the
            axis labels and output file names.

    Writes, under ``self.outf`` with ``self.cname`` as file prefix, one CDF
    figure plus a ``.stats`` and a ``.cdf`` file for each of the three series.
    """
    # Materialise once so that one-shot iterables can be traversed per column.
    rows = list(tdata)
    frac_tf = [tf for tf, _, _ in rows]
    frac_ts = [ts for _, ts, _ in rows]
    frac_tt = [tt for _, _, tt in rows]

    tf_cdf = ecdf(frac_tf)
    ts_cdf = ecdf(frac_ts)
    tt_cdf = ecdf(frac_tt)

    # The original style here was 'reducer-', which does not follow the
    # colour/line format used by every other curve ('k--', 'k:', 'r', ...).
    # NOTE(review): 'r-' assumes XYPoints forwards the style to matplotlib;
    # confirm against XYPoints/xy_plot.
    line_tf = XYPoints(tf_cdf[0], tf_cdf[1], r'Peak ' + append, 'r-')
    line_ts = XYPoints(ts_cdf[0], ts_cdf[1], r'$2^{nd}$ peak ' + append, 'k--')
    line_tt = XYPoints(tt_cdf[0], tt_cdf[1], r'$3^{rd}$ peak ' + append, 'k:')

    prefix = self.cname + '-' + append

    xy_plot(
        line_tt, line_ts, line_tf,
        xlabel=r'Fraction of ' + self.gname + ' on peak ' + append + ' - $f$',
        # Raw literal: "\l" is not a valid escape in a normal string.
        ylabel=r'Prob. (Fraction of ' + self.gname + r' $\leq$ f)',
        legloc='lower right',
        legborder=False,
        xmin=0.0,
        xmax=1.0,
        ymin=0.0,
        ymax=1.0,
        outputf=os.path.join(self.outf, prefix + '-cdf.png'),
    )

    write_stats_to_file(frac_tf, os.path.join(self.outf, prefix + '-first.stats'))
    write_stats_to_file(frac_ts, os.path.join(self.outf, prefix + '-second.stats'))
    write_stats_to_file(frac_tt, os.path.join(self.outf, prefix + '-third.stats'))

    write_xy_to_file(tf_cdf[0], tf_cdf[1],
                     os.path.join(self.outf, prefix + '-first.cdf'))
    write_xy_to_file(ts_cdf[0], ts_cdf[1],
                     os.path.join(self.outf, prefix + '-second.cdf'))
    write_xy_to_file(tt_cdf[0], tt_cdf[1],
                     os.path.join(self.outf, prefix + '-third.cdf'))
def main(tcu_fpath, col_name):
    """Show the distribution of one column of a CSV file.

    Args:
        tcu_fpath: path handed to ``tcu_io.load_untreated_csv_to_numpy``.
        col_name: key of the column to plot in the loaded data.

    Integer and float columns are shown as an empirical CDF scatter; any other
    column is treated as categorical and shown as a bar chart, with entries
    equal to ``'N/A'`` dropped first.
    """
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    column = data[col_name]

    # Compare on dtype.kind rather than on the dtype itself: `dtype == 'i'`
    # only matches the C int type and `dtype == 'f'` only float32, which sent
    # int64/float64 columns down the string branch.
    if column.dtype.kind in ('i', 'f'):
        masked = ma.masked_invalid(column)
        x, y = ecdf(masked)
        plt.plot(x, y, 'bo')
        plt.show()
    else:
        # Simple hack for the string case:
        # creates a copy with the 'N/A' values removed.
        masked = column[column != 'N/A']
        cat, y = categorical_hist(masked)
        x = range(1, len(cat) + 1)
        plt.bar(x, y, width=0.5)
        plt.xticks(x, cat)
        plt.show()
def four_plots(num_occur, fraction_repeat):
    """Show a 2x2 figure describing the fraction-of-repeated values.

    Args:
        num_occur: array-like, x values of the bottom-right scatter.
        fraction_repeat: array-like, values whose empirical CDF is plotted
            and y values of the bottom-right scatter.

    Panels: CDF, complementary CDF, odds ratio (CDF / CCDF, log y axis) and
    a scatter of ``fraction_repeat`` against ``num_occur`` (log x axis).
    The figure is shown interactively and then closed.
    """
    num_occur = np.asanyarray(num_occur)
    fraction_repeat = np.asanyarray(fraction_repeat)

    cdf_x, cdf_y = ecdf(fraction_repeat)
    ccdf_y = 1 - cdf_y

    # The odds ratio is only defined where the CCDF is non-zero.
    defined = ccdf_y != 0
    odds_ratio = cdf_y[defined] / ccdf_y[defined]

    # Top-left: CDF.
    plt.subplot(221)
    plt.plot(cdf_x, cdf_y)
    plt.xlabel('Fraction of Repeated')
    plt.ylabel('P(X < x)')

    # Top-right: complementary CDF.
    plt.subplot(222)
    plt.plot(cdf_x, ccdf_y)
    plt.xlabel('Fraction of Repeated')
    plt.ylabel('P(X > x)')

    # Bottom-left: odds ratio.
    plt.subplot(223)
    plt.semilogy(cdf_x[defined], odds_ratio)
    plt.xlabel('Fraction of Repeated')
    plt.ylabel('Odds Ratio: P(X < x) / P(X > x)')

    # Bottom-right: raw scatter.
    plt.subplot(224)
    plt.semilogx(num_occur, fraction_repeat, 'wo')
    plt.xlabel('# Occurrences')
    plt.ylabel('Fraction of Repeated')

    plt.tight_layout(pad=0)
    plt.show()
    plt.close()
def close(self):
    """Plot the CDF of ``self.vals`` and write its stats and curve points.

    Outputs, all under ``self.outf``: ``evtime-cdf.png`` (figure),
    ``evtime.stats`` (from the raw values) and ``evtime-percent.cdf``
    (the x/y points returned by ``ecdf``).
    """
    ecdf_points = ecdf(self.vals)
    line = XYPoints(ecdf_points[0], ecdf_points[1], 'All Events', 'k-')

    xy_plot(
        line,
        xlabel='Fraction of days since upload',
        # Raw literal: "\l" is an invalid escape sequence in a normal string
        # and triggers a warning on recent Python versions.
        ylabel=r'Prob. (Fraction of days since upload $\leq$ x)',
        outputf=os.path.join(self.outf, 'evtime-cdf.png'),
        legloc='lower right',
        legborder=False,
        xmin=0.0,
        xmax=1.0,
        ymin=0.0,
        ymax=1.0,
    )

    write_stats_to_file(self.vals, os.path.join(self.outf, 'evtime.stats'))
    write_xy_to_file(ecdf_points[0], ecdf_points[1],
                     os.path.join(self.outf, 'evtime-percent.cdf'))
def close(self):
    """Plot per-group CCDFs of the Kendall tau and Spearman rho samples.

    For each of ``self.to_plot_ktau`` and ``self.to_plot_srho`` (mappings from
    group to values) the groups are visited by decreasing mean, a
    complementary CDF line is built per group, the group's raw values are
    written to a ``.dat`` file, and all lines are drawn on one figure.
    Everything is written under ``self.out_folder_path``.
    """
    style = VideoDAO.GROUP2STYLE

    def plot_grouped_ccdf(per_group, dat_template, axis_name, ylabel, png_name):
        # Groups with the largest mean come first.
        ordered = sorted(per_group,
                         key=lambda g: np.mean(per_group[g]),
                         reverse=True)
        curves = []
        for group in ordered:
            values = per_group[group]
            cdf = ecdf(values)
            # 1 minus the CDF gives the CCDF.
            curves.append(XYPoints(cdf[0], 1 - cdf[1], group, style[group]))
            dat_path = os.path.join(self.out_folder_path, dat_template % group)
            write_stats_to_file(values, dat_path)

        xy_plot(
            *curves,
            xlabel=axis_name,
            ylabel=ylabel,
            grid=False,
            legborder=False,
            outputf=os.path.join(self.out_folder_path, png_name),
        )

    plot_grouped_ccdf(self.to_plot_ktau, 'dups-ktau-%s.dat', 'Kendall Tau',
                      'Prob. (Kendall Tau > x)', 'dups-ktau.png')
    plot_grouped_ccdf(self.to_plot_srho, 'dups-srho-%s.dat', 'Spearman Rho',
                      'Prob. (Spearman Rho > x)', 'dups-srho.png')
def close(self):
    """Plot the CDF of ``self.data`` and write its stats and curve points.

    The curve and the x axis are labelled with ``self.curve_label``; axis
    scaling follows ``self.logx`` / ``self.logy``. Outputs are written under
    ``self.outf`` using ``self.cname`` as the file stem: ``-cdf.png``,
    ``.dat`` (from the raw values) and ``.cdf`` (the x/y points from ``ecdf``).
    """
    dat_cdf = ecdf(self.data)
    cdf_line = XYPoints(dat_cdf[0], dat_cdf[1], name=self.curve_label)

    xy_plot(
        cdf_line,
        xlabel=self.curve_label,
        # Raw literal for the suffix: "\l" is an invalid escape sequence in
        # a normal string and triggers a warning on recent Python versions.
        ylabel='Prob. (' + self.curve_label + r' $\leq$ x)',
        outputf=os.path.join(self.outf, self.cname + '-cdf.png'),
        logx=self.logx,
        logy=self.logy,
    )

    write_stats_to_file(self.data, os.path.join(self.outf, self.cname + '.dat'))
    write_xy_to_file(dat_cdf[0], dat_cdf[1],
                     os.path.join(self.outf, self.cname + '.cdf'))
def close(self):
    """Plot CDFs of the three pairwise correlation samples and dump stats.

    Uses ``self.view_comm``, ``self.view_favs`` and ``self.comm_favs``. One
    combined figure with all three curves is written first, then one figure
    per curve, then one ``.stats`` file per sample; all go under ``self.outf``.
    """
    # (file slug, raw values, legend label), in plotting order.
    pairs = (
        ('view-comm', self.view_comm, 'Views x Comments'),
        ('view-favs', self.view_favs, 'Views x Favorites'),
        ('comm-favs', self.comm_favs, 'Comments x Favorites'),
    )

    cdfs = [ecdf(values) for _, values, _ in pairs]
    curves = [
        XYPoints(cdf[0], cdf[1], label)
        for cdf, (_, _, label) in zip(cdfs, pairs)
    ]

    # Every figure shares the same axis labels.
    axis_labels = {
        'xlabel': r'Pearson Correlation ($\rho$)',
        'ylabel': r'Prob. ($\rho \leq x$)',
    }

    xy_plot(*curves,
            outputf=os.path.join(self.outf, 'corr-vc-vf-vr-cdf.png'),
            **axis_labels)

    for curve, (slug, _, _) in zip(curves, pairs):
        xy_plot(curve,
                outputf=os.path.join(self.outf, 'corr-' + slug + '-cdf.png'),
                **axis_labels)

    for slug, values, _ in pairs:
        write_stats_to_file(
            values, os.path.join(self.outf, 'corr-' + slug + '.stats'))
def close(self):
    """Plot per-referrer-group view fractions as box plots and CDFs.

    ``self.vals`` maps an event type to a list of values; the lists are merged
    per group through ``VideoDAO.EV2GROUP``. For every group in
    ``VideoDAO.EV_GROUPS`` except ``'NOT_CAPTURED'`` (visited in sorted order)
    a ``<group>-views.stats`` and a ``<group>-views.cdf`` file are written,
    followed by one box-plot figure and one CDF figure, all under
    ``self.outf``.

    Raises:
        KeyError: if ``'NOT_CAPTURED'`` is not in ``VideoDAO.EV_GROUPS`` or an
            event type in ``self.vals`` is missing from ``VideoDAO.EV2GROUP``.
    """
    ev2group = VideoDAO.EV2GROUP
    # Work on a copy: deleting from VideoDAO.EV_GROUPS itself would alter the
    # mapping for every other user and make a second call raise KeyError.
    groups = VideoDAO.EV_GROUPS.copy()
    del groups['NOT_CAPTURED']

    vals_per_group = defaultdict(list)
    for ev_type, values in self.vals.items():
        vals_per_group[ev2group[ev_type]].extend(values)

    group_names = sorted(groups)
    # One style for every box, however many groups there are (the previous
    # fixed-length list of 7 left later groups without a style).
    stylemap = dict.fromkeys(group_names, 'lightgrey')

    boxes = []
    lines = []
    for group in group_names:
        group_vals = vals_per_group[group]
        group_cdf = ecdf(group_vals)

        lines.append(XYPoints(group_cdf[0], group_cdf[1], group))
        boxes.append(Box(group_vals, group, stylemap[group]))

        write_stats_to_file(group_vals,
                            os.path.join(self.outf, group + '-views.stats'))
        write_xy_to_file(group_cdf[0], group_cdf[1],
                         os.path.join(self.outf, group + '-views.cdf'))

    box_plot(
        *boxes,
        xlabel='Referrer Category',
        ylabel='Fraction of Views',
        ymin=-0.05,
        ymax=1.05,
        grid=False,
        legborder=False,
        xmin=0.5,
        xmax=len(boxes) + 0.5,
        xrotation=20,
        outputf=os.path.join(self.outf, 'ev-grouped-views-box.png'),
    )

    xy_plot(
        *lines,
        xlabel='Fraction of Views',
        # Raw literal: "\l" is an invalid escape sequence in a normal string.
        ylabel=r'Prob. (Fraction of Views $\leq$ x)',
        grid=False,
        legborder=False,
        xmin=0.0,
        xmax=1.0,
        ymin=0.0,
        ymax=1.0,
        logx=False,
        logy=True,
        outputf=os.path.join(self.outf, 'ev-grouped-views-cdf.png'),
    )
def main(fpath):
    """Show the log-log CCDF of the number of non-zero entries per row.

    Args:
        fpath: path handed to ``vectorize_songs``; the first item of its
            result is used as the matrix (labelled "tags per song" on the
            plot).

    Also prints a low percentile score of the per-row counts and the matrix
    shape.
    """
    doc_mat = vectorize_songs(fpath)[0]
    rows = doc_mat.nonzero()[0]

    # Each row index appears once per non-zero entry, so counting them gives
    # the number of non-zero entries per row. list() is required because on
    # Python 3 `.values()` is a view that does not convert to a 1-D array.
    to_plot = list(Counter(rows).values())

    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): scipy's `per` argument is on a 0-100 scale, so 0.1 is the
    # 0.1th percentile - confirm this is intended rather than 10.
    print(stats.scoreatpercentile(to_plot, 0.1))
    print(doc_mat.shape)

    ax = plt.gca()
    ax.set_yscale("log")
    ax.set_xscale("log")
    plt.plot(x, ccdf_y, "bo")
    plt.xlabel("Number Tags per Song (x)")
    plt.ylabel("Prob(Num. Tags per Song > x)")
    plt.title("CCDF of Tags per Song")
    plt.show()
def main(fpath):
    """Show the log-log CCDF of the number of non-zero entries per column.

    Args:
        fpath: path handed to ``vectorize_songs``; the first item of its
            result is used as the matrix (labelled "songs with tag" on the
            plot).

    Also prints a low percentile score of the per-column counts and the
    matrix shape.
    """
    doc_mat = vectorize_songs(fpath)[0]
    cols = doc_mat.nonzero()[1]

    # Each column index appears once per non-zero entry, so counting them
    # gives the number of non-zero entries per column. list() is required
    # because on Python 3 `.values()` is a view that does not convert to a
    # 1-D array.
    to_plot = list(Counter(cols).values())

    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): scipy's `per` argument is on a 0-100 scale, so 0.5 is the
    # 0.5th percentile - confirm this is intended rather than the median (50).
    print(stats.scoreatpercentile(to_plot, 0.5))
    print(doc_mat.shape)

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xscale('log')
    plt.plot(x, ccdf_y, 'bo')
    plt.xlabel('Number of songs with tag (x)')
    plt.ylabel('Prob(Num. Songs with Tag > x)')
    plt.title('CCDF of Tag Popularity')
    plt.show()
def close(self):
    """Plot per-referrer-group timing values as CDFs and box plots.

    ``self.vals`` is indexed by group name. For every group in a copy of
    ``VideoDAO.EV_GROUPS`` except ``'NOT_CAPTURED'`` (visited in sorted order)
    a ``<group>-time.stats`` and a ``<group>-time.cdf`` file are written,
    followed by one CDF figure and one box-plot figure, all under
    ``self.outf``.

    Raises:
        KeyError: if ``'NOT_CAPTURED'`` is not in ``VideoDAO.EV_GROUPS``.
    """
    groups = VideoDAO.EV_GROUPS.copy()
    del groups['NOT_CAPTURED']

    group_names = sorted(groups)
    # One style for every box, however many groups there are (the previous
    # fixed-length list of 7 left later groups without a style).
    stylemap = dict.fromkeys(group_names, 'lightgrey')

    boxes = []
    lines = []
    for group in group_names:
        group_vals = self.vals[group]
        group_cdf = ecdf(group_vals)

        lines.append(XYPoints(group_cdf[0], group_cdf[1], group))
        boxes.append(Box(group_vals, group, stylemap[group]))

        write_stats_to_file(group_vals,
                            os.path.join(self.outf, group + '-time.stats'))
        write_xy_to_file(group_cdf[0], group_cdf[1],
                         os.path.join(self.outf, group + '-time.cdf'))

    xy_plot(
        *lines,
        xlabel='Fraction of days since upload',
        # Raw literal: "\l" is an invalid escape sequence in a normal string.
        ylabel=r'Prob. (Fraction of days since upload $\leq$ x)',
        legborder=False,
        legloc='lower right',
        xmin=0.0,
        xmax=1.0,
        ymin=0.0,
        ymax=1.0,
        outputf=os.path.join(self.outf, 'evtime-grouped-cdf.png'),
    )

    box_plot(
        *boxes,
        xlabel='Referrer Category',
        # Raw literal: "\%" is an invalid escape sequence in a normal string.
        ylabel=r'Time Until First Referral (\% lifetime)',
        ymin=-0.05,
        ymax=1.05,
        grid=False,
        legborder=False,
        xmin=0.5,
        xmax=len(boxes) + 0.5,
        outputf=os.path.join(self.outf, 'evtime-grouped-box.png'),
    )
def four_plots(data, data_name, fname):
    """Fit heavy-tailed models to ``data``, save a 3-panel figure, print a report.

    Args:
        data: array-like sample; fitted with ``powerlaw.Fit(discrete=True,
            xmin=[1, 100])``.
        data_name: x-axis label used on every panel.
        fname: path the figure is saved to (also printed).

    Only values ``>= fit.xmin`` are plotted. Panels, left to right: binned
    density, complementary CDF and odds ratio (CDF / CCDF), each overlaid with
    the fitted power law, lognormal and truncated power law. After saving, the
    pairwise ``distribution_compare`` results and the fitted parameters of the
    three models are printed.
    """
    data = np.asanyarray(data)
    fit = powerlaw.Fit(data, discrete=True, xmin=[1, 100])
    xmin = fit.xmin
    data_cut = data[data >= xmin]

    cdf_x, cdf_y = ecdf(data_cut)
    ccdf_y = 1 - cdf_y
    # The odds ratio is only defined where the CCDF is non-zero.
    defined = ccdf_y != 0
    odds_ratio = cdf_y[defined] / ccdf_y[defined]

    # Logarithmically spaced histogram bins: about 10 per decade.
    log_min_size = np.log10(data_cut.min())
    log_max_size = np.log10(data_cut.max())
    # np.ceil returns a float, but np.logspace needs an integer sample count
    # (recent NumPy raises TypeError when given a float).
    nbins = int(np.ceil((log_max_size - log_min_size) * 10))
    bins = np.unique(np.floor(np.logspace(log_min_size, log_max_size, nbins)))
    hist, edges = np.histogram(data_cut, bins, density=True)
    bin_centers = (edges[1:] + edges[:-1]) / 2.0

    # Panel 1: binned density with the fitted PDFs.
    plt.subplot(131)
    plt.xlabel(data_name, labelpad=0)
    plt.ylabel(r'$p(X = x)$', labelpad=0)
    plt.loglog(bin_centers, hist, 'wo', ms=5)
    fit.power_law.plot_pdf(ax=plt.gca(), color='g', linestyle='-')
    fit.lognormal.plot_pdf(ax=plt.gca(), color='b', linestyle='--')
    fit.truncated_power_law.plot_pdf(ax=plt.gca(), color='r', linestyle=':')

    # Panel 2: empirical CCDF with the fitted CCDFs.
    plt.subplot(132)
    plt.xlabel(data_name, labelpad=0)
    plt.ylabel(r'$P(X > x)$', labelpad=0)
    plt.loglog(cdf_x, ccdf_y, 'wo', ms=5, label='data', markevery=10)
    fit.power_law.plot_ccdf(ax=plt.gca(), color='g', linestyle='-.',
                            label='powerlaw')
    fit.lognormal.plot_ccdf(ax=plt.gca(), color='b', linestyle='--',
                            label='lognormal')
    fit.truncated_power_law.plot_ccdf(ax=plt.gca(), color='r', linestyle=':',
                                      label='powerlaw+cutoff')
    plt.legend(loc='lower left', frameon=False)

    # Panel 3: empirical and model odds ratios.
    plt.subplot(133)
    plt.xlabel(data_name, labelpad=0)
    plt.ylabel(r'Odds Ratio', labelpad=0)
    xvals = cdf_x[defined]
    odds_plaw = fit.power_law.cdf(xvals) / fit.power_law.ccdf(xvals)
    odds_lognorm = fit.lognormal.cdf(xvals) / fit.lognormal.ccdf(xvals)
    odds_trunc = (fit.truncated_power_law.cdf(xvals)
                  / fit.truncated_power_law.ccdf(xvals))
    plt.loglog(xvals, odds_ratio, 'wo', ms=5, markevery=10)
    plt.loglog(xvals, odds_plaw, 'g-')
    plt.loglog(xvals, odds_lognorm, 'b--')
    plt.loglog(xvals, odds_trunc, 'r:')

    plt.tight_layout(pad=0)
    plt.savefig(fname)
    plt.close()

    # Text report: every ordered pair of distinct models, then the parameters.
    print(fname)
    for i in ['power_law', 'lognormal', 'truncated_power_law']:
        for j in ['power_law', 'lognormal', 'truncated_power_law']:
            if i != j:
                print(i, j)
                print(fit.distribution_compare(i, j))
                print()
    print()

    print('xmin', fit.xmin)
    d = fit.power_law
    print('Plaw - parameters D=', d.D)
    print('alpha', d.alpha)
    print()

    d = fit.lognormal
    print('Lognorm - parameters D=', d.D)
    print(d.parameter1_name, d.parameter1)
    print(d.parameter2_name, d.parameter2)
    print(d.parameter3_name, d.parameter3)
    print()

    d = fit.truncated_power_law
    print('ExpTrunc - parameters D=', d.D)
    print(d.parameter1_name, d.parameter1)
    print(d.parameter2_name, d.parameter2)
    print(d.parameter3_name, d.parameter3)
    print()
def close(self):
    """Write every duplicate-analysis figure and the per-group stats files.

    In order, under ``self.outf``:
      * six log-log scatters of the mean/median series against
        ``self.avgmedian_x`` (total difference, 1 - cosine, event difference);
      * three log-log scatters of the pairwise series against
        ``self.pairwise_x``;
      * one figure with a CCDF line per group of ``self.group_pairwise``
        (groups ordered by decreasing mean) plus one ``.dat`` file per group;
      * CCDF figures for ``self.avg_y_cos`` and ``self.avg_y_event_diff``.
    """
    def loglog_scatter(xs, ys, ylabel, png_name):
        # One 'bo' scatter on log-log axes, saved as <outf>/<png_name>.
        xy_plot(
            XYPoints(xs, ys, style='bo'),
            xlabel='Number of Final Views',
            ylabel=ylabel,
            outputf=os.path.join(self.outf, png_name),
            logx=True,
            logy=True,
        )

    def ccdf_figure(curves, xlabel, ylabel, png_name):
        # CCDF figures share log x / linear y axes, no grid, no legend border.
        xy_plot(
            *curves,
            xlabel=xlabel,
            ylabel=ylabel,
            grid=False,
            legborder=False,
            logx=True,
            logy=False,
            outputf=os.path.join(self.outf, png_name),
        )

    # Mean/median plots: total differences, cosines, event differences.
    summary_plots = (
        (self.avg_y_tot_diff, 'Mean Total Difference',
         'dups-mean-tot-diff.png'),
        (self.median_y_tot_diff, 'Median Total Difference',
         'dups-median-tot-diff.png'),
        (self.avg_y_cos, 'Mean (1 - Cosine)',
         'dups-mean-cos.png'),
        (self.median_y_cos, 'Median (1 - Cosine)',
         'dups-median-cos.png'),
        (self.avg_y_event_diff, 'Mean Aggregate Event Difference',
         'dups-mean-event-diff.png'),
        (self.median_y_event_diff, 'Median Aggregate Event Difference',
         'dups-median-event-diff.png'),
    )
    for ys, ylabel, png_name in summary_plots:
        loglog_scatter(self.avgmedian_x, ys, ylabel, png_name)

    # Pairwise plots.
    pairwise_plots = (
        (self.pairwise_y_tot_diff, 'Total Difference',
         'dups-pairwise-tot-diff.png'),
        (self.pairwise_y_event_diff, 'Aggregate Event Difference',
         'dups-pairwise-event-diff.png'),
        # NOTE(review): "paiwise" looks like a typo, but the file name is
        # kept as-is in case something downstream reads it.
        (self.pairwise_y_cos, '1 - Cosine',
         'dups-paiwise-cos.png'),
    )
    for ys, ylabel, png_name in pairwise_plots:
        loglog_scatter(self.pairwise_x, ys, ylabel, png_name)

    # Group plot: one CCDF line per group, largest mean first.
    group_curves = []
    style = VideoDAO.GROUP2STYLE
    by_mean_desc = sorted(self.group_pairwise,
                          key=lambda g: np.mean(self.group_pairwise[g]),
                          reverse=True)
    for group in by_mean_desc:
        values = self.group_pairwise[group]
        cdf = ecdf(values)
        # 1 minus the CDF gives the CCDF.
        curve = XYPoints(cdf[0], 1 - cdf[1], group, style[group])
        write_stats_to_file(
            values, os.path.join(self.outf, 'dups-diff-%s.dat' % group))
        group_curves.append(curve)
    ccdf_figure(group_curves, 'Views Difference',
                'Prob. (Views Difference > x)', 'dups-diff-grouped.png')

    # CCDF of the mean cosine.
    cos_cdf = ecdf(self.avg_y_cos)
    cos_curve = XYPoints(cos_cdf[0], 1 - cos_cdf[1], 'CCDF Avg. Cosine', 'bo')
    ccdf_figure([cos_curve], 'Avg. Cosine',
                'Prob. (Avg. Cosine > x)', 'dups-ccdf-cos.png')

    # CCDF of the aggregated event difference.
    agg_cdf = ecdf(self.avg_y_event_diff)
    agg_curve = XYPoints(agg_cdf[0], 1 - agg_cdf[1],
                         'Aggregate Event Difference', 'bo')
    ccdf_figure([agg_curve], 'CCDF Aggregate Event Difference',
                'Prob. (Avg. Agg. Diff > x)', 'dups-ccdf-agg.png')