Exemple #1
0
 def close(self):
     """Plot and persist the ECDFs of the three score series.

     Runs ``ecdf`` over ``self.scores10``, ``self.scores50`` and
     ``self.scores90``, draws the three resulting curves on one figure and
     then hands, for each series, the raw scores to ``write_stats_to_file``
     and the ECDF x/y pairs to ``write_xy_to_file``. Every output path is
     built from ``self.outf`` and ``self.cname``.
     """
     # One row per series:
     # (raw scores, line style, stats file suffix, cdf file suffix)
     series = (
         (self.scores10, 'k:', '-10percent.stats', '-10percent.cdf'),
         (self.scores50, 'r', '-50percent.stats', '-50percent.cdf'),
         (self.scores90, 'k-.', '-90percent.stats', '-90percent.cdf'),
     )

     cdfs = [ecdf(row[0]) for row in series]
     curves = [XYPoints(cdf[0], cdf[1], '', row[1])
               for cdf, row in zip(cdfs, series)]

     figure_path = os.path.join(self.outf, self.cname+'-105090-cdf.png')
     xy_plot(*curves,
             xlabel=r'Fraction of days since upload - $f$',
             ylabel=r'Prob. (Fraction of days since upload $\leq$ f)',
             legloc='lower center', legborder=False, legend=True,
             xmin=0.0, xmax=1.0,
             ymin=0.0, ymax=1.0,
             outputf=figure_path)

     # Raw-score statistics first, ECDF points second (same order as the
     # figure: 10, 50, 90).
     for scores, _, stats_suffix, _ in series:
         write_stats_to_file(scores,
                             os.path.join(self.outf, self.cname+stats_suffix))

     for cdf, row in zip(cdfs, series):
         write_xy_to_file(cdf[0], cdf[1],
                          os.path.join(self.outf, self.cname+row[3]))
Exemple #2
0
    def real_close(self, tdata, append):
        """Plot and persist the ECDFs of the first/second/third peak fractions.

        Args:
            tdata: iterable of ``(first, second, third)`` triples; each
                component is collected into its own series.
            append: text appended to the legend entries and axis label, and
                embedded in every output file name.

        One figure, three ``.stats`` files and three ``.cdf`` files are
        written under ``self.outf``, all prefixed with ``self.cname``.
        """
        frac_tf, frac_ts, frac_tt = [], [], []

        # A single pass keeps this working when tdata is a one-shot iterator.
        for tf, ts, tt in tdata:
            frac_tf.append(tf)
            frac_ts.append(ts)
            frac_tt.append(tt)

        tf_cdf = ecdf(frac_tf)
        ts_cdf = ecdf(frac_ts)
        tt_cdf = ecdf(frac_tt)

        # The first-peak style used to be 'reducer-', which is not a line
        # format like the ones used for the other curves ('k--', 'k:');
        # a solid red line ('r-') is assumed to be what was meant.
        line_tf = XYPoints(tf_cdf[0], tf_cdf[1],
                           r'Peak '+append, 'r-')
        line_ts = XYPoints(ts_cdf[0], ts_cdf[1],
                           r'$2^{nd}$ peak '+append, 'k--')
        line_tt = XYPoints(tt_cdf[0], tt_cdf[1],
                           r'$3^{rd}$ peak '+append, 'k:')

        # Every fragment containing a TeX backslash is a raw string so that
        # '\l' is never parsed as a (invalid) Python escape sequence.
        xy_plot(line_tt, line_ts, line_tf,
                xlabel=r'Fraction of '+self.gname+' on peak '+append+' - $f$',
                ylabel=r'Prob. (Fraction of '+self.gname+r' $\leq$ f)',
                legloc='lower right',
                legborder=False,
                xmin=0.0, xmax=1.0,
                ymin=0.0, ymax=1.0,
                outputf=os.path.join(self.outf,
                                     self.cname+'-'+append+'-cdf.png'))

        write_stats_to_file(frac_tf,
                            os.path.join(self.outf,
                                         self.cname+'-'+append+'-first.stats'))
        write_stats_to_file(frac_ts,
                            os.path.join(self.outf,
                                         self.cname+'-'+append+'-second.stats'))
        write_stats_to_file(frac_tt,
                            os.path.join(self.outf,
                                         self.cname+'-'+append+'-third.stats'))

        write_xy_to_file(tf_cdf[0], tf_cdf[1],
                         os.path.join(self.outf,
                                      self.cname+'-'+append+'-first.cdf'))
        write_xy_to_file(ts_cdf[0], ts_cdf[1],
                         os.path.join(self.outf,
                                      self.cname+'-'+append+'-second.cdf'))
        write_xy_to_file(tt_cdf[0], tt_cdf[1],
                         os.path.join(self.outf,
                                      self.cname+'-'+append+'-third.cdf'))
def main(tcu_fpath, col_name):
    """Show the distribution of one column of a TCU csv file.

    Integer and floating point columns are drawn as an empirical CDF
    (after masking invalid entries); any other column is treated as
    categorical and drawn as a bar chart of ``categorical_hist``.

    Args:
        tcu_fpath: path handed to ``tcu_io.load_untreated_csv_to_numpy``.
        col_name: name of the column to select from the loaded data.
    """
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    column = data[col_name]

    # Comparing the dtype itself with 'i' / 'f' only matches the C int and
    # single-precision float types, so int64 / float64 columns used to fall
    # through to the string branch. ``kind`` matches every width.
    if column.dtype.kind in ('i', 'f'):
        masked = ma.masked_invalid(column)

        x, y = ecdf(masked)
        plt.plot(x, y, 'bo')
        plt.show()
    else:
        # Simple hack for the string case:
        # creates a copy with the 'N/A' placeholder entries removed.
        masked = column[column != 'N/A']

        cat, y = categorical_hist(masked)

        x = range(1, len(cat) + 1)
        plt.bar(x, y, width=0.5)
        plt.xticks(x, cat)
        plt.show()
Exemple #4
0
def four_plots(num_occur, fraction_repeat):
    """Display a 2x2 summary figure for the repeat fractions.

    Panels, in order: the ECDF of ``fraction_repeat``, its complement,
    the ratio ECDF / complement on a log y axis (restricted to points
    where the complement is non-zero), and ``fraction_repeat`` against
    ``num_occur`` on a log x axis. The figure is shown and then closed.
    """
    occurrences = np.asanyarray(num_occur)
    fractions = np.asanyarray(fraction_repeat)

    xs, cdf = ecdf(fractions)
    ccdf = 1 - cdf
    # Points with an empty tail are dropped to avoid dividing by zero.
    has_tail = ccdf != 0
    odds = cdf[has_tail] / ccdf[has_tail]

    def label_axes(x_text, y_text):
        plt.xlabel(x_text)
        plt.ylabel(y_text)

    plt.subplot(221)
    plt.plot(xs, cdf)
    label_axes('Fraction of Repeated', 'P(X < x)')

    plt.subplot(222)
    plt.plot(xs, ccdf)
    label_axes('Fraction of Repeated', 'P(X > x)')

    plt.subplot(223)
    plt.semilogy(xs[has_tail], odds)
    label_axes('Fraction of Repeated', 'Odds Ratio: P(X < x) / P(X > x)')

    plt.subplot(224)
    plt.semilogx(occurrences, fractions, 'wo')
    label_axes('# Occurrences', 'Fraction of Repeated')

    plt.tight_layout(pad=0)
    plt.show()
    plt.close()
Exemple #5
0
 def close(self):
     """Plot and persist the ECDF of ``self.vals``.

     Draws a single curve labelled 'All Events' into 'evtime-cdf.png',
     then passes the raw values to ``write_stats_to_file`` and the ECDF
     x/y pairs to ``write_xy_to_file``. All files go under ``self.outf``.
     """
     ecdf_points = ecdf(self.vals)
     line = XYPoints(ecdf_points[0], ecdf_points[1], 'All Events', 'k-')

     # The y label is a raw string: it carries a TeX '\leq', and '\l' is
     # not a valid Python escape sequence in a normal string literal.
     xy_plot(line, xlabel='Fraction of days since upload',
             ylabel=r'Prob. (Fraction of days since upload $\leq$ x)',
             outputf=os.path.join(self.outf, 'evtime-cdf.png'),
             legloc='lower right', legborder=False,
             xmin=0.0, xmax=1.0, ymin=0.0, ymax=1.0)

     write_stats_to_file(self.vals, os.path.join(self.outf, 'evtime.stats'))
     write_xy_to_file(ecdf_points[0], ecdf_points[1],
                      os.path.join(self.outf, 'evtime-percent.cdf'))
    def close(self):
        """Plot per-group CCDFs of the Kendall Tau and Spearman Rho samples.

        For each of ``self.to_plot_ktau`` and ``self.to_plot_srho`` the
        groups are visited from highest to lowest sample mean; every group
        contributes one complementary-ECDF curve and one file written by
        ``write_stats_to_file``. Each family of curves is then drawn into
        its own figure under ``self.out_folder_path``.
        """
        style = VideoDAO.GROUP2STYLE

        def build_ccdf_curves(samples, stats_template):
            # One curve per group, ordered by decreasing mean; the raw
            # samples of each group are saved as a side effect.
            curves = []
            by_mean_desc = sorted(samples,
                                  key=lambda g: np.mean(samples[g]),
                                  reverse=True)
            for group in by_mean_desc:
                group_cdf = ecdf(samples[group])
                # 1 minus the ECDF gives the CCDF.
                curves.append(XYPoints(group_cdf[0], 1 - group_cdf[1],
                                       group, style[group]))
                fpath = os.path.join(self.out_folder_path,
                                     stats_template % group)
                write_stats_to_file(samples[group], fpath)
            return curves

        lines_ktau = build_ccdf_curves(self.to_plot_ktau, 'dups-ktau-%s.dat')
        xy_plot(*lines_ktau,
                xlabel='Kendall Tau',
                ylabel='Prob. (Kendall Tau > x)',
                grid=False,
                legborder=False,
                outputf=os.path.join(self.out_folder_path, 'dups-ktau.png'))

        lines_srho = build_ccdf_curves(self.to_plot_srho, 'dups-srho-%s.dat')
        xy_plot(*lines_srho,
                xlabel='Spearman Rho',
                ylabel='Prob. (Spearman Rho > x)',
                grid=False,
                legborder=False,
                outputf=os.path.join(self.out_folder_path, 'dups-srho.png'))
Exemple #7
0
 def close(self):
     """Plot and persist the ECDF of ``self.data``.

     The curve and the x axis are both labelled with ``self.curve_label``;
     axis scaling follows ``self.logx`` / ``self.logy``. The figure, the
     raw data (via ``write_stats_to_file``) and the ECDF points (via
     ``write_xy_to_file``) are written under ``self.outf`` using
     ``self.cname`` as the file name stem.
     """
     dat_cdf = ecdf(self.data)

     cdf_line = XYPoints(dat_cdf[0], dat_cdf[1], name=self.curve_label)
     # The trailing fragment is raw because it contains a TeX '\leq';
     # '\l' is not a valid escape sequence in a normal string literal.
     xy_plot(cdf_line,
             xlabel=self.curve_label,
             ylabel='Prob. ('+self.curve_label+r' $\leq$ x)',
             outputf=os.path.join(self.outf, self.cname+'-cdf.png'),
             logx=self.logx, logy=self.logy)

     write_stats_to_file(self.data,
                         os.path.join(self.outf, self.cname+'.dat'))
     write_xy_to_file(dat_cdf[0], dat_cdf[1],
                      os.path.join(self.outf, self.cname+'.cdf'))
Exemple #8
0
 def close(self):
     """Plot and persist the ECDFs of the three correlation samples.

     Builds one ECDF curve for each of ``self.view_comm``,
     ``self.view_favs`` and ``self.comm_favs``; draws all three on a
     combined figure, then each one on its own figure, and finally passes
     each raw sample to ``write_stats_to_file``. Files go under
     ``self.outf``.
     """
     x_text = r'Pearson Correlation ($\rho$)'
     y_text = r'Prob. ($\rho \leq x$)'

     # (raw sample, legend name, single-curve figure, stats file)
     pairs = (
         (self.view_comm, 'Views x Comments',
          'corr-view-comm-cdf.png', 'corr-view-comm.stats'),
         (self.view_favs, 'Views x Favorites',
          'corr-view-favs-cdf.png', 'corr-view-favs.stats'),
         (self.comm_favs, 'Comments x Favorites',
          'corr-comm-favs-cdf.png', 'corr-comm-favs.stats'),
     )

     cdfs = [ecdf(row[0]) for row in pairs]
     curves = [XYPoints(cdf[0], cdf[1], row[1])
               for cdf, row in zip(cdfs, pairs)]

     def draw(figure_name, *to_draw):
         xy_plot(*to_draw,
                 xlabel=x_text,
                 ylabel=y_text,
                 outputf=os.path.join(self.outf, figure_name))

     # Combined figure first, then one figure per curve.
     draw('corr-vc-vf-vr-cdf.png', *curves)
     for curve, row in zip(curves, pairs):
         draw(row[2], curve)

     for sample, _, _, stats_name in pairs:
         write_stats_to_file(sample, os.path.join(self.outf, stats_name))
    def close(self):
        """Plot and persist the view fractions aggregated per referrer group.

        Values in ``self.vals`` (keyed by event type) are pooled into the
        group given by ``VideoDAO.EV2GROUP``. For every group except
        'NOT_CAPTURED' an ECDF curve and a box are built, the raw values go
        to ``write_stats_to_file`` and the ECDF points to
        ``write_xy_to_file``. A box plot and a CDF plot of all groups are
        then written under ``self.outf``.
        """
        ev2group = VideoDAO.EV2GROUP
        # Work on a copy: deleting from VideoDAO.EV_GROUPS itself would
        # change the shared mapping for every other user and make a second
        # call of this method fail with KeyError.
        groups = VideoDAO.EV_GROUPS.copy()
        del groups['NOT_CAPTURED']

        vals_per_group = defaultdict(list)

        for t in self.vals:
            values = self.vals[t]
            vals_per_group[ev2group[t]].extend(values)

        # All boxes share one colour. Using the constant directly (instead
        # of zipping against a fixed list of 7 styles) keeps this working
        # for any number of groups.
        box_style = 'lightgrey'

        boxes = []
        lines = []
        for group in sorted(groups):
            group_vals = vals_per_group[group]
            group_cdf = ecdf(group_vals)
            line = XYPoints(group_cdf[0], group_cdf[1], group)
            box = Box(group_vals, group, box_style)

            lines.append(line)
            boxes.append(box)

            write_stats_to_file(group_vals,
                                os.path.join(self.outf, group+'-views.stats'))
            write_xy_to_file(group_cdf[0], group_cdf[1],
                             os.path.join(self.outf, group+'-views.cdf'))

        box_plot(*boxes,
                 xlabel='Referrer Category', ylabel='Fraction of Views',
                 ymin=-0.05, ymax=1.05, grid=False,
                 legborder=False, xmin=0.5, xmax=len(boxes) + 0.5,
                 xrotation=20,
                 outputf=os.path.join(self.outf, 'ev-grouped-views-box.png'))

        # Raw string: the label carries a TeX '\leq' and '\l' is not a
        # valid escape sequence in a normal string literal.
        xy_plot(*lines,
                xlabel='Fraction of Views',
                ylabel=r'Prob. (Fraction of Views $\leq$ x)',
                grid=False,
                legborder=False,
                xmin=0.0, xmax=1.0,
                ymin=0.0, ymax=1.0,
                logx=False, logy=True,
                outputf=os.path.join(self.outf, 'ev-grouped-views-cdf.png'))
def main(fpath):
    """Show the CCDF of the number of tags per song on log-log axes.

    Args:
        fpath: path handed to ``vectorize_songs``; the first element of its
            result is used as the document matrix.

    Each non-zero entry of the matrix contributes one count to its row, so
    the per-row counts are the sample whose CCDF is plotted.
    """
    doc_mat = vectorize_songs(fpath)[0]
    rows = doc_mat.nonzero()[0]

    # Materialise the counts: on Python 3 ``dict.values()`` is a view,
    # which numpy/scipy do not treat as an array-like sequence.
    to_plot = list(Counter(rows).values())
    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): scoreatpercentile takes the percentile in [0, 100], so
    # this is the 0.1th percentile -- confirm that 10 was not intended.
    print(stats.scoreatpercentile(to_plot, 0.1))
    print(doc_mat.shape)
    ax = plt.gca()
    ax.set_yscale("log")
    ax.set_xscale("log")

    plt.plot(x, ccdf_y, "bo")
    plt.xlabel("Number Tags per Song (x)")
    plt.ylabel("Prob(Num. Tags per Song > x)")
    plt.title("CCDF of Tags per Song")
    plt.show()
def main(fpath):
    """Show the CCDF of tag popularity (songs per tag) on log-log axes.

    Args:
        fpath: path handed to ``vectorize_songs``; the first element of its
            result is used as the document matrix.

    Each non-zero entry of the matrix contributes one count to its column,
    so the per-column counts are the sample whose CCDF is plotted.
    """
    doc_mat = vectorize_songs(fpath)[0]
    cols = doc_mat.nonzero()[1]

    # Materialise the counts: on Python 3 ``dict.values()`` is a view,
    # which numpy/scipy do not treat as an array-like sequence.
    to_plot = list(Counter(cols).values())
    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): scoreatpercentile takes the percentile in [0, 100], so
    # this is the 0.5th percentile -- confirm that 50 was not intended.
    print(stats.scoreatpercentile(to_plot, 0.5))
    print(doc_mat.shape)
    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xscale('log')

    plt.plot(x, ccdf_y, 'bo')
    plt.xlabel('Number of songs with tag (x)')
    plt.ylabel('Prob(Num. Songs with Tag > x)')
    plt.title('CCDF of Tag Popularity')
    plt.show()
    def close(self):
        """Plot and persist the per-group values held in ``self.vals``.

        For every group of ``VideoDAO.EV_GROUPS`` except 'NOT_CAPTURED' an
        ECDF curve and a box are built from ``self.vals[group]``; the raw
        values go to ``write_stats_to_file`` and the ECDF points to
        ``write_xy_to_file``. A CDF plot and a box plot of all groups are
        then written under ``self.outf``.
        """
        # Copy before deleting so the shared class-level mapping is intact.
        groups = VideoDAO.EV_GROUPS.copy()
        del groups['NOT_CAPTURED']

        # All boxes share one colour. Using the constant directly (instead
        # of zipping against a fixed list of 7 styles) keeps this working
        # for any number of groups.
        box_style = 'lightgrey'

        boxes = []
        lines = []
        for group in sorted(groups):
            group_vals = self.vals[group]
            group_cdf = ecdf(group_vals)
            line = XYPoints(group_cdf[0], group_cdf[1], group)
            box = Box(group_vals, group, box_style)

            lines.append(line)
            boxes.append(box)

            write_stats_to_file(group_vals,
                                os.path.join(self.outf, group+'-time.stats'))
            write_xy_to_file(group_cdf[0], group_cdf[1],
                             os.path.join(self.outf, group+'-time.cdf'))

        # Labels with TeX backslashes ('\leq', '\%') are raw strings: those
        # are not valid escape sequences in a normal string literal.
        xy_plot(*lines,
                xlabel='Fraction of days since upload',
                ylabel=r'Prob. (Fraction of days since upload $\leq$ x)',
                legborder=False,
                legloc='lower right',
                xmin=0.0, xmax=1.0,
                ymin=0.0, ymax=1.0,
                outputf=os.path.join(self.outf, 'evtime-grouped-cdf.png'))

        box_plot(*boxes,
                 xlabel='Referrer Category',
                 ylabel=r'Time Until First Referral (\% lifetime)',
                 ymin=-0.05, ymax=1.05, grid=False,
                 legborder=False, xmin=0.5, xmax=len(boxes) + 0.5,
                 outputf=os.path.join(self.outf, 'evtime-grouped-box.png'))
Exemple #13
0
def four_plots(data, data_name, fname):
    """Fit heavy-tailed models to ``data`` and save a three-panel figure.

    Args:
        data: sample to fit; converted with ``np.asanyarray``.
        data_name: x-axis label used on every panel.
        fname: path the figure is saved to.

    A discrete ``powerlaw.Fit`` is run with ``xmin=[1, 100]``; only values
    at or above the fitted ``xmin`` are plotted. Panels: log-binned
    density, CCDF and odds ratio, each overlaid with the power law,
    lognormal and truncated power law fits. Pairwise model comparisons and
    the fitted parameters are printed to stdout.
    """
    data = np.asanyarray(data)
    fit = powerlaw.Fit(data, discrete=True, xmin=[1, 100])
    xmin = fit.xmin

    data_cut = data[data >= xmin]
    cdf_x, cdf_y = ecdf(data_cut)
    ccdf_y = 1 - cdf_y
    # Points with an empty tail are dropped to avoid dividing by zero.
    has_tail = ccdf_y != 0
    odds_ratio = cdf_y[has_tail] / ccdf_y[has_tail]

    # Logarithmic bins, ten per decade.
    log_min_size = np.log10(data_cut.min())
    log_max_size = np.log10(data_cut.max())
    # np.ceil returns a float, but np.logspace needs an integer sample
    # count (a float raises TypeError on current numpy).
    nbins = int(np.ceil((log_max_size - log_min_size) * 10))
    bins = np.unique(np.floor(np.logspace(log_min_size, log_max_size, nbins)))
    hist, edges = np.histogram(data_cut, bins, density=True)
    bin_centers = (edges[1:] + edges[:-1]) / 2.0

    plt.subplot(131)
    plt.xlabel(data_name, labelpad=0)
    plt.ylabel(r'$p(X = x)$', labelpad=0)

    plt.loglog(bin_centers, hist, 'wo', ms=5)
    fit.power_law.plot_pdf(ax=plt.gca(), color='g', linestyle='-')
    fit.lognormal.plot_pdf(ax=plt.gca(), color='b', linestyle='--')
    fit.truncated_power_law.plot_pdf(ax=plt.gca(), color='r', linestyle=':')

    plt.subplot(132)
    plt.xlabel(data_name, labelpad=0)
    plt.ylabel(r'$P(X > x)$', labelpad=0)

    plt.loglog(cdf_x, ccdf_y, 'wo', ms=5, label='data', markevery=10)
    fit.power_law.plot_ccdf(ax=plt.gca(), color='g', linestyle='-.',
                            label='powerlaw')
    fit.lognormal.plot_ccdf(ax=plt.gca(), color='b', linestyle='--',
                            label='lognormal')
    fit.truncated_power_law.plot_ccdf(ax=plt.gca(), color='r', linestyle=':',
                                      label='powerlaw+cutoff')
    plt.legend(loc='lower left', frameon=False)

    plt.subplot(133)
    plt.xlabel(data_name, labelpad=0)
    plt.ylabel(r'Odds Ratio', labelpad=0)

    xvals = cdf_x[has_tail]
    odds_plaw = fit.power_law.cdf(xvals) / fit.power_law.ccdf(xvals)
    odds_lognorm = fit.lognormal.cdf(xvals) / fit.lognormal.ccdf(xvals)
    odds_trunc = (fit.truncated_power_law.cdf(xvals) /
                  fit.truncated_power_law.ccdf(xvals))

    plt.loglog(xvals, odds_ratio, 'wo', ms=5, markevery=10)
    plt.loglog(xvals, odds_plaw, 'g-')
    plt.loglog(xvals, odds_lognorm, 'b--')
    plt.loglog(xvals, odds_trunc, 'r:')

    plt.tight_layout(pad=0)
    plt.savefig(fname)
    plt.close()

    # Textual report: every ordered pair of distinct models, then the
    # fitted parameters of each model.
    print(fname)
    for i in ['power_law', 'lognormal', 'truncated_power_law']:
        for j in ['power_law', 'lognormal', 'truncated_power_law']:
            if i != j:
                print(i, j)
                print(fit.distribution_compare(i, j))
                print()
                print()

    print('xmin', fit.xmin)

    d = fit.power_law
    print('Plaw - parameters D=', d.D)
    print('alpha', d.alpha)
    print()

    d = fit.lognormal
    print('Lognorm - parameters D=', d.D)
    print(d.parameter1_name, d.parameter1)
    print(d.parameter2_name, d.parameter2)
    print(d.parameter3_name, d.parameter3)
    print()

    d = fit.truncated_power_law
    print('ExpTrunc - parameters D=', d.D)
    print(d.parameter1_name, d.parameter1)
    print(d.parameter2_name, d.parameter2)
    print(d.parameter3_name, d.parameter3)
    print()
Exemple #14
0
    def close(self):
        """Write every figure and stats file of the duplicates analysis.

        In order: log-log scatter plots of the mean/median total
        difference, (1 - cosine) and aggregate event difference against
        the number of final views; the same three measures for the
        pairwise data; one CCDF curve per group of ``self.group_pairwise``
        (with the raw values passed to ``write_stats_to_file``); and
        finally the CCDFs of ``self.avg_y_cos`` and
        ``self.avg_y_event_diff``. Everything is written under
        ``self.outf``.
        """
        def scatter(xs, ys):
            # Every scatter series in this report uses the same marker.
            return XYPoints(xs, ys, style='bo')

        def plot_vs_views(points, y_text, figure_name):
            # Log-log figure against the number of final views.
            xy_plot(points,
                    xlabel='Number of Final Views',
                    ylabel=y_text,
                    outputf=os.path.join(self.outf, figure_name),
                    logx=True, logy=True)

        def plot_ccdf(curves, x_text, y_text, figure_name):
            # CCDF figure: log x axis, linear y axis, no grid.
            xy_plot(*curves,
                    xlabel=x_text,
                    ylabel=y_text,
                    grid=False,
                    legborder=False,
                    logx=True, logy=False,
                    outputf=os.path.join(self.outf, figure_name))

        # Mean/median total differences.
        mean_tot = scatter(self.avgmedian_x, self.avg_y_tot_diff)
        median_tot = scatter(self.avgmedian_x, self.median_y_tot_diff)
        plot_vs_views(mean_tot, 'Mean Total Difference',
                      'dups-mean-tot-diff.png')
        plot_vs_views(median_tot, 'Median Total Difference',
                      'dups-median-tot-diff.png')

        # Mean/median cosines.
        median_cos = scatter(self.avgmedian_x, self.median_y_cos)
        mean_cos = scatter(self.avgmedian_x, self.avg_y_cos)
        plot_vs_views(mean_cos, 'Mean (1 - Cosine)', 'dups-mean-cos.png')
        plot_vs_views(median_cos, 'Median (1 - Cosine)',
                      'dups-median-cos.png')

        # Mean/median event differences.
        median_event = scatter(self.avgmedian_x, self.median_y_event_diff)
        mean_event = scatter(self.avgmedian_x, self.avg_y_event_diff)
        plot_vs_views(mean_event, 'Mean Aggregate Event Difference',
                      'dups-mean-event-diff.png')
        plot_vs_views(median_event, 'Median Aggregate Event Difference',
                      'dups-median-event-diff.png')

        # Pairwise plots.
        pair_tot = scatter(self.pairwise_x, self.pairwise_y_tot_diff)
        pair_event = scatter(self.pairwise_x, self.pairwise_y_event_diff)
        pair_cos = scatter(self.pairwise_x, self.pairwise_y_cos)
        plot_vs_views(pair_tot, 'Total Difference',
                      'dups-pairwise-tot-diff.png')
        plot_vs_views(pair_event, 'Aggregate Event Difference',
                      'dups-pairwise-event-diff.png')
        # NOTE(review): 'paiwise' looks like a typo for 'pairwise'; the
        # name is kept as is because it is the file actually produced.
        plot_vs_views(pair_cos, '1 - Cosine', 'dups-paiwise-cos.png')

        # Group plot: one CCDF per group, highest mean first.
        style = VideoDAO.GROUP2STYLE
        by_mean_desc = sorted(self.group_pairwise,
                              key=lambda g: np.mean(self.group_pairwise[g]),
                              reverse=True)
        lines = []
        for group in by_mean_desc:
            group_cdf = ecdf(self.group_pairwise[group])
            # 1 minus the ECDF gives the CCDF.
            curve = XYPoints(group_cdf[0], 1 - group_cdf[1],
                             group, style[group])
            fpath = os.path.join(self.outf, 'dups-diff-%s.dat'%group)
            write_stats_to_file(self.group_pairwise[group], fpath)
            lines.append(curve)

        plot_ccdf(lines, 'Views Difference', 'Prob. (Views Difference > x)',
                  'dups-diff-grouped.png')

        # CCDF of the mean cosine.
        coscdf_cdf = ecdf(self.avg_y_cos)
        cos_curve = XYPoints(coscdf_cdf[0], 1 - coscdf_cdf[1],
                             'CCDF Avg. Cosine', 'bo')
        plot_ccdf([cos_curve], 'Avg. Cosine', 'Prob. (Avg. Cosine > x)',
                  'dups-ccdf-cos.png')

        # CCDF of the aggregate event difference.
        aggcdf_cdf = ecdf(self.avg_y_event_diff)
        agg_curve = XYPoints(aggcdf_cdf[0], 1 - aggcdf_cdf[1],
                             'Aggregate Event Difference', 'bo')
        plot_ccdf([agg_curve], 'CCDF Aggregate Event Difference',
                  'Prob. (Avg. Agg. Diff > x)', 'dups-ccdf-agg.png')