def draw_networkx_phrases(n, t):
    fn, url = util.get_web_fn("graphs", "networks", '%s_%s.png' % (n, t))
    phrases = {}
    for phrase, metrics in util.load_top_phrases(n, t=t):
        phrases[phrase] = metrics
    G, G2 = gen_network(phrases, n)
    pos = nx.spring_layout(G)  # positions for all nodes
    elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.5]
    esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.5]
    # nodes
    nx.draw_networkx_nodes(G, pos, node_size=70)
    # edges
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2)
    #nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.5, edge_color='b', style='dashed')
    # labels
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')
    plt.axis('off')
    plt.savefig(fn)
    print("Published to: %s" % url)
def get_data_dump_url(folders, basename, **kwargs):
    """
    Mirrors the call signature of create_data_dump (extra keyword arguments
    are ignored) but only computes and returns the published URL, so it can
    stand in for create_data_dump when the dump already exists (e.g. for
    testing)
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    return nurl
def get_correlation(fn):
    nh_vals = get_originator_nh_vals(fn)
    o_vals, o_doms = zip(*get_originator_vals(fn))
    rank_num = list(range(min(len(o_vals), cap, len(nh_vals))))
    fn, url = util.get_web_fn(
        "graphs", "originator_nh_correlations",
        "%s%s.html" % (os.path.basename(fn), capN))
    corr = stats.pearsonr(o_vals[:len(rank_num)], nh_vals[:len(rank_num)])
    if True:  #not os.path.exists(fn):
        fig = go.Figure(
            data=go.Scatter(x=rank_num,
                            y=o_vals[:len(rank_num)],
                            text=o_doms,
                            mode='lines',
                            name="Originator"),
            layout=go.Layout(
                title="Originator vs Null Hypothesis (Pearson R=%f, p=%f)" % corr,
                xaxis_title="Domains",
                yaxis_title="Influence score"))
        fig.add_trace(
            go.Scatter(x=rank_num,
                       y=nh_vals[:len(rank_num)],
                       mode='lines',
                       name='Null Hypothesis'))
        plotly.offline.plot(fig, filename=fn, auto_open=False)
        print("Published at %s" % url)
    return corr
def draw_html_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)
    ar = list(reversed(ar))
    l = list(reversed(l))
    fn, url = util.get_web_fn("graphs", "html_heatmaps", slugify(phrase))
    yss = [ys for ys in util.iter_yearseason()]
    if len(ar) == 0:
        sys.stderr.write("%s,%s has no values\n" % (n, phrase))
        return url
    with open(fn, "w+") as f:
        f.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
tr:nth-child(even) {background-color: #d2d2d2;}
</style>
</head>
<body>
""")
        f.write('<h3>Phrase: %s</h3>' % (phrase, ))
        f.write('<h4><a href="%s" target="_blank">Graphical Heatmap</a></h4>\n'
                % (get_heatmap_url(n, phrase), ))
        f.write('<table>\n')
        headers = ["Domain"] + list(util.iter_yearseason())
        f.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
        for domId in range(len(ar)):
            f.write('<tr>\n')
            f.write('<td>%s</td>\n' % l[domId])
            for ysId in range(len(ar[0])):
                if ar[domId][ysId] == 1:
                    #Policy URL
                    policy_url = "https://privacypolicies.cs.princeton.edu/fetch_policy.php?domain=%s&interval=%s_%s" % (
                        l[domId], yss[ysId][:4], yss[ysId][4])
                    f.write('<td><a href="%s">%s</a></td>\n' %
                            (policy_url, yss[ysId]))
                else:
                    f.write('<td></td>\n')
            f.write('</tr>\n')
        f.write('</table>\n')
        f.write("</body></html>")
    np.set_printoptions(**opts)
    return url
def draw_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)
    fn, url = util.get_web_fn("graphs", "heatmaps", slugify(phrase))
    yss = [ys for ys in util.iter_yearseason()]
    fig = go.Figure(data=go.Heatmap(z=ar,
                                    y=l,
                                    x=yss,
                                    colorscale=[(0, "#d0d3d4"), (1, "#1a5276")],
                                    showscale=False))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(**opts)
    return url
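# A minimal, self-contained sketch of the presence/absence heatmap technique
# used by draw_heatmap_from_data above: z is a 0/1 matrix (rows = domains,
# columns = intervals) rendered with a two-stop colorscale and no colorbar.
# The toy data, labels, and the _example_* name are illustrative only, not
# part of this codebase.
def _example_presence_heatmap(out_fn="example_heatmap.html"):
    import plotly
    import plotly.graph_objects as go
    z = [[1, 0, 1],   # hypothetical domain a.example: present, absent, present
         [0, 1, 1]]   # hypothetical domain b.example
    fig = go.Figure(data=go.Heatmap(z=z,
                                    y=["a.example", "b.example"],
                                    x=["2019A", "2019B", "2020A"],
                                    colorscale=[(0, "#d0d3d4"), (1, "#1a5276")],
                                    showscale=False))
    plotly.offline.plot(fig, filename=out_fn, auto_open=False)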
def get_correlation(fn):
    a_ranks = get_alexa_rankings()
    o_ranks = get_originator_rankings(fn)
    a_rank_num = [a_ranks[dom] for dom in o_ranks]
    o_rank_num = list(range(1, len(o_ranks) + 1))
    fn, url = util.get_web_fn("graphs", "originator_correlations",
                              "%s.html" % os.path.basename(fn))
    if True:  #not os.path.exists(fn):
        fig = go.Figure(data=go.Scatter(x=o_rank_num,
                                        y=o_rank_num,
                                        text=o_ranks,
                                        mode='lines',
                                        name="Originator Rank"))
        fig.add_trace(
            go.Scatter(x=o_rank_num,
                       y=a_rank_num,
                       text=o_ranks,
                       mode='lines',
                       name='Alexa Rank'))
        plotly.offline.plot(fig, filename=fn, auto_open=False)
        print("Published at %s" % url)
    return stats.pearsonr(a_rank_num, o_rank_num)
def get_correlation(fn1, fn2):
    o1_ranks = get_originator_rankings(fn1)
    o2_ranks = get_originator_rankings(fn2)
    if len(o1_ranks) > len(o2_ranks):
        #Make o1_ranks the shorter ranking so every o1 domain can be looked up
        #against the longer o2 ranking below
        tmp = o1_ranks, fn1
        o1_ranks = o2_ranks
        fn1 = fn2
        o2_ranks, fn2 = tmp
        print("Swapped")
    o2_rank_dict = {o2_ranks[i]: i + 1 for i in range(len(o2_ranks))}
    o1_rank_num = list(range(1, len(o1_ranks) + 1))
    o2_rank_num = [
        o2_rank_dict[d] if d in o2_rank_dict else len(o2_rank_dict)
        for d in o1_ranks
    ]
    fn, url = util.get_web_fn(
        "graphs", "co_originator_correlations",
        "%s-%s.html" % (os.path.basename(fn1), os.path.basename(fn2)))
    if True:  #not os.path.exists(fn):
        fig = go.Figure(data=go.Scatter(x=list(range(len(o1_rank_num))),
                                        y=o1_rank_num,
                                        text=o1_ranks,
                                        mode='lines',
                                        name=os.path.basename(fn1)))
        fig.add_trace(
            go.Scatter(x=list(range(len(o2_rank_num))),
                       y=o2_rank_num,
                       text=o1_ranks,
                       mode='lines',
                       name=os.path.basename(fn2)))
        plotly.offline.plot(fig, filename=fn, auto_open=False)
        print("Published at %s" % url)
    print(len(o1_rank_num), len(o2_rank_num))
    print(o1_rank_num)
    print(o2_rank_num)
    return stats.pearsonr(o1_rank_num, o2_rank_num)
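# Hedged illustration of what the get_correlation variants above return:
# scipy.stats.pearsonr yields an (r, p-value) pair for two equal-length
# numeric sequences, such as the rank vectors built above. The rankings
# below are made up for demonstration and are not project data.
def _example_rank_correlation():
    from scipy import stats
    rank_a = [1, 2, 3, 4, 5]
    rank_b = [2, 1, 3, 5, 4]  # hypothetical second ranking of the same domains
    r, p = stats.pearsonr(rank_a, rank_b)
    return r, p  # r close to 1 means the two rankings largely agree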
def draw_lines(filename, topX, freq_data=None):
    """
    Frequency data is an array of tuples. The first item is the phrase,
    the second item is the number of occurrences at each interval
    """
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    fn, url = util.get_web_fn('graphs', "lines",
                              '%s.html' % os.path.basename(filename))
    if os.path.exists(fn):
        # print("Skipping line drawing")
        return url
    yss = [ys for ys in util.iter_yearseason()]
    if freq_data is None:
        freq_data = get_frequency_data(filename, maxCount=topX)
    freq_data = list(freq_data)
    fig = go.Figure()
    # layout=go.Layout(
    #     legend=dict(x=-.1, y=-0.5*(len(freq_data) // 10))
    # )
    for label, data in freq_data:
        #wrappedlabel = textwrap.fill(label, 40)
        if len(label) >= 40:
            wrappedlabel = label[:37] + "..."
        else:
            wrappedlabel = label
        fig.add_trace(
            go.Scatter(x=yss, y=data, mode='lines', name=wrappedlabel))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(**opts)
    return url
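# Sketch of the freq_data structure that draw_lines above expects, per its
# docstring: an iterable of (phrase, per-interval counts) tuples, one count
# per year/season interval. The phrases and counts here are hypothetical.
_example_freq_data = [
    ("we may share your information", [12, 15, 19, 24]),
    ("third party cookies",           [ 3,  4,  9, 11]),
]
# A caller could pass this directly, e.g.:
#   draw_lines("example.csv", -1, freq_data=_example_freq_data)
# (when freq_data is supplied, the file name only determines the name of the
# output HTML page; the CSV is not read).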
def do_analytics(gram_list):
    global SLOPE_COUNT_THRESH, SLOPE_PSEUDOCOUNT
    webfn, weburl = util.get_web_fn("raw", "analytics.html")
    logging.info("Access at %s" % weburl)
    #logging.info("Pre-loading Alexa results")
    #alexa.load()
    logging.info("Starting Counting intervals")
    num_intervals = len(list(util.iter_yearseason()))
    logging.info("Starting counting policies")
    domain_counts, policy_counts = get_policy_counts()
    average_policy_count = sum(policy_counts.values()) / len(policy_counts)
    policy_norm_factor = {
        yearseason: 1 / policy_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }
    average_domain_count = sum(domain_counts.values()) / len(domain_counts)
    domain_norm_factor = {
        yearseason: 1 / domain_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }
    with open(webfn, "w+") as webf:
        webf.write("""
<html>
<head>
<link rel="stylesheet" type="text/css" href="/styles/style.css"/>
</head>
<body>
""")
        webf.write("""
<div class="query">
<iframe id="queryframe" src="https://privacypolicies.cs.princeton.edu/search_policies.php?basedir=%s">
</iframe>
</div>
<div class="metricsd"><table class="metricst">
""" % util.testing_prefix)
        #Iterate phrase type
        for n in gram_list:
            ngram_name = "%s-grams" % n if n not in friendly_names else friendly_names[n]
            webf.write("""
<tr class="nhtr">
<td><h3>Data for %s</h3></td>
</tr>
<tr class="ntr">""" % ngram_name)
            logging.info("Loading top grams for %s-grams" % (n))
            gram_freq_by_countfxn = load_top_grams(n, domain_norm_factor,
                                                   policy_norm_factor,
                                                   domain_counts)
            logging.info("Done loading top grams for %s-grams" % (n))
            mem_trace()
            if gram_freq_by_countfxn is None:
                logging.error("No grams")
                continue
            gram_freq_raw = gram_freq_by_countfxn["raw"]
            #Iterate counting methods i.e. unique, total, or alexa-weighted domains
            for count_name, (countf, count_friendly_name) in util.count_fxns.items():
                logging.info("\tBeginning scoring with %s" % (count_name))
                #Score based on various metrics
                gram_freq = gram_freq_by_countfxn[count_name]
                base_count = None
                if count_name == "total":
                    base_count = 1 / average_domain_count
                elif count_name == "unique":
                    base_count = 1 / average_policy_count
                elif count_name == "alexa":
                    base_count = alexa.average_traffic
                slope_count_thresh = get_slope_thresh(gram_freq, base_count)
                slope_pseudocount = base_count
                logging.info("\tSlope thresh for round is: %0.4E" % slope_count_thresh)
                logging.info("\tSlope pseudocount for round is: %0.4E" % slope_pseudocount)
                gram_scores = {mname: [] for mname in metrics}
                #Identifying phrases of interest based on metrics & rules
                for s in gram_freq:
                    vals = gram_freq[s]
                    for (mname, (score_fxn, hname)) in metrics.items():
                        heap = gram_scores[mname]
                        score = score_fxn(vals, slope_pseudocount, slope_count_thresh)
                        if score == -100000:
                            continue
                        if len(heap) >= topN:
                            heapq.heappushpop(heap, (score, s))
                        else:
                            heapq.heappush(heap, (score, s))
                logging.info("\tDone scoring")
                webf.write('<td class="dataCb">\n')
                webf.write("<h4>Counted by %s</h4>\n" % (count_friendly_name))
                score_name_str = "Top %s by %%s counted by %s</h4>\n" % (
                    ngram_name, count_friendly_name)
                #Dump data
                for mname, (fsc, hname) in metrics.items():
                    heap = gram_scores[mname]
                    logging.info("\tSorting top values for %s" % (mname))
                    #Heaps aren't sorted, we need to sort the heap
                    #Taking advantage of the heap structure doesn't help us here...
                    #pop is log(n), and we need n iterations
                    heap = sorted(heap, reverse=True)
                    phrases = [s for sc, s in heap]
                    groups = [
                        phrases[10 * i:min(10 * (i + 1), len(phrases))]
                        for i in range(math.ceil(len(phrases) / 10))
                    ]
                    scores = {s: sc for sc, s in heap}
                    logging.info("\tStarting data dump for %s" % (mname))
                    nwebfn, nurl = create_data_dump(
                        n, ["metrics", mname],
                        "%s-grams_top_%s_%s%s" %
                        (n, mname, count_name, util.get_file_suffix()),
                        groups,
                        gram_freq,
                        gram_freq_raw,
                        score_name_str % hname,
                        scores=scores)
                    webf.write(
                        '<span style="margin-left:2em"><a href="%s" target="_blank">%s</a></span></br>\n'
                        % (nurl, hname))
                    print("\033[K\t\tGraphs created", end="\r")
                    webf.flush()
                webf.write('<br/></td>\n')
                #Best to force a flush for partial readouts
                webf.flush()
                os.fsync(webf)
                mem_trace()
                print("\033[K", end="\r")
                logging.info("\tDone dumping data")
            mem_count()
            webf.write('</tr>')
        util.close_pool()
        webf.write("""
</table>
</div>
</body>
</html>""")
    logging.info("Done")
def create_data_dump(n, folders, basename, groups, gram_freq, gram_freq_raw,
                     score_name_str, scores=None, gen_figs=False):
    """
    Take the given phrases and create:
      - A table of phrase, score (if available), occurrences
      - Line graphs for phrase occurrence
      - Heatmaps for phrases
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    group_to_graph = {}
    hit_to_graph = {}
    #Part one -- create CSV data files, heatmaps, and line graphs
    for group_num in range(len(groups)):
        group_hits = groups[group_num]
        nfn = os.path.join(util.OUT_DIR, "%s_%d.csv" % (basename, group_num))
        if gen_figs:
            #Line graph
            line_url = draw_lines.draw_lines(
                nfn, -1, ((hit, gram_freq[hit]) for hit in group_hits))
            logging.info("\t\tDone drawing lines for group %d" % (group_num))
            #Heatmap
            if n == 'w':
                html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                    n, group_hits)
            else:
                heatmap_urls, html_heatmap_urls = draw_heatmap.draw_html_and_reg_heatmaps(
                    n, group_hits, gram_freq_raw)
            logging.info("\033[K\t\tDone drawing heatmaps for group %d" %
                         (group_num))
        else:
            #Skip drawing
            line_url = draw_lines.get_lines_url(nfn, -1)
            heatmap_urls = draw_heatmap.get_multiple_heatmap_urls(n, group_hits)
            html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                n, group_hits)
        group_to_graph[group_num] = line_url
        with open(nfn, "w+") as f:
            #Write CSV data file for later use
            writer = csv.writer(f)
            for hit in group_hits:
                vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                writer.writerow([
                    hit.replace("\\", "\\\\").replace("\n", "\\n"), *vals_str
                ])
                #heatmap_url = heatmap_urls[hit]
                heatmap_url = html_heatmap_urls[hit]
                # print(heatmap_url)
                hit_to_graph[hit] = heatmap_url
        logging.info("\t\tDone drawing group %d" % (group_num))
    #Part two -- create usable web page to explore
    with open(nwebfn, "w+") as nwebf:
        nwebf.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
</style>
</head>
<body>
""")
        pool = util.get_pool()
        all_hits = list(
            itertools.chain(*[[hit for hit in group] for group in groups]))
        plots = pool.starmap(miniplot.get_plot_as_img,
                             [(ys_list, gram_freq[hit]) for hit in all_hits])
        plots = list(plots)
        logging.info("\t\tDone drawing plots")
        #Break up by group
        pNum = 0
        for group, gid in zip(groups, range(len(groups))):
            if len(group) == 0:
                continue
            line_graph_url = group_to_graph[gid]
            nwebf.write('<h3>%s</h3>' % (score_name_str))
            nwebf.write(
                '<h4><a href="%s" target="_blank">Line graph for group %d</a></h4>\n'
                % (line_graph_url, gid))
            nwebf.write('<table>\n')
            headers = ["Score", "Phrase", "Bar Plot"] + list(util.iter_yearseason())
            if scores is None:
                headers = headers[1:]
            nwebf.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
            #Then phrase
            for hit in group:
                nwebf.write('<tr>\n')
                if scores is not None:
                    if abs(scores[hit]) < 0.01 and scores[hit] != 0:
                        nwebf.write('<td>%0.2E</td>' % scores[hit])
                    else:
                        nwebf.write('<td>%0.2f</td>' % scores[hit])
                nwebf.write('<td width="30%">\n')
                #vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                vals_str = [
                    ("%0.2E" if (abs(gram_freq[hit][ysid]) != 0
                                 and abs(gram_freq[hit][ysid]) < 0.01) else
                     "%0.2f") % (gram_freq[hit][ysid])
                    for ysid in range(num_intervals)
                ]
                heatmap_graph_url = hit_to_graph[hit]
                hit_link = '<a href="%s" target="_blank">%s</a>' % (
                    heatmap_graph_url,
                    hit.replace("\\", "\\\\").replace("\n", "\\n"))
                plot = plots[pNum]  # miniplot.get_plot_as_img(ys_list, gram_freq[hit])
                pNum += 1
                nwebf.write('</td><td>'.join([hit_link, plot, *vals_str]))
                #TODO add miniplot
                nwebf.write('</td></tr>\n')
            logging.info("\t\tDone with table for group %d" % (gid))
            nwebf.write('</table>\n')
        nwebf.write("</body></html>")
    mem_count()
    return nwebfn, nurl
def get_html_heatmap_loc(n, phrase):
    """
    Placeholder for faster runtime until we have a more efficient
    heatmap generator
    """
    fn, url = util.get_web_fn("graphs", "html_heatmaps", slugify(phrase))
    return fn, url
def do_analytics():
    global SLOPE_COUNT_THRESH
    gram_list = list(range(start, stop))
    if SENTENCES:
        gram_list.append("s")
    if ENTITIES:
        gram_list.append("e")
        gram_list.append("u")
        gram_list.append("m")
        #gram_list.append("n")
    if WORDS:
        gram_list.append("w")
    webfn, weburl = util.get_web_fn("raw", "analytics.html")
    print("\033[KAccess at %s" % weburl)
    with open(webfn, "w+") as webf:
        webf.write("""
<html>
<head>
<link rel="stylesheet" type="text/css" href="/styles/style.css"/>
</head>
<body>
""")
        webf.write("""
<div class="query">
<iframe id="queryframe" src="https://privacypolicies.cs.princeton.edu/search_policies.php?basedir=%s">
</iframe>
</div>
<div class="metricsd"><table class="metricst">
""" % util.testing_prefix)
        #Iterate phrase type
        for n in gram_list:
            ngram_name = "%s-grams" % n if n not in friendly_names else friendly_names[n]
            webf.write("""
<tr class="nhtr">
<td><h3>Data for %s</h3></td>
</tr>
<tr class="ntr">""" % ngram_name)
            #Iterate counting methods i.e. unique, total, or alexa-weighted domains
            for count_name, (countf, count_friendly_name) in util.count_fxns.items():
                print("\033[K%s-grams:" % n)
                webf.write('<td class="dataCb">\n')
                #webf.write("<h4>Data for %s counted by %s</h4>\n" % (ngram_name, count_friendly_name))
                webf.write("<h4>Counted by %s</h4>\n" % (count_friendly_name))
                score_name_str = "Top %s by %%s counted by %s</h4>\n" % (
                    ngram_name, count_friendly_name)
                #Dump data
                if BASE_METRICS:
                    #webf.write("Basic metrics:<br/>\n")
                    for mname, (fsc, faux, hname) in metrics.items():
                        nurl = get_data_dump_url(
                            ["metrics", mname],
                            "%s-grams_top_%s_%s%s" %
                            (n, mname, count_name, util.get_file_suffix()))
                        webf.write(
                            '<span style="margin-left:2em"><a href="%s" target="_blank">%s</a></span></br>\n'
                            % (nurl, hname))
                        print("\033[K\t\tGraphs created", end="\r")
                        webf.flush()
                #We can't combine these dumps because Nelson rules don't provide a score,
                #whereas all of our metrics do
                if NELSON_RULES:
                    webf.write("Nelson Rules:<br/>\n")
                    for i in range(len(rule_hits)):
                        rnum = i + 1  #1 indexed num
                        gen_page = (len(rule_hits[i]) <= 10 * topN)
                        gen_figs = (len(rule_hits[i]) <= 5 * topN)
                        if gen_page:
                            print("\033[K\tNelson rule %d; Hits: %d" %
                                  (rnum, len(rule_hits[i])))
                            hits = sorted(rule_hits[i])
                            print("\033[K\t\tSorted", end="\r")
                            #Might be better if we cluster everything?
                            group_ct = math.ceil(len(hits) / 10)
                            groups = cluster(hits, gram_freq, group_ct)
                            print("\033[K\t\tClustered", end="\r")
                            nwebfn, nurl = create_data_dump(
                                ["nelson"],
                                "%s-grams_nelson-%d_%s%s" %
                                (n, rnum, count_name, util.get_file_suffix()),
                                groups,
                                gram_freq_raw,
                                score_name_str % hname,
                                gen_figs=gen_figs)
                            print("\033[K\t\tGraphs created", end="\r")
                        else:
                            print("\033[K\tNelson rule %d; Hits: %d (skipped)" %
                                  (rnum, len(rule_hits[i])))
                            nurl = get_data_dump_url(
                                ["nelson"],
                                "%s-grams_nelson-%d_%s%s" %
                                (n, rnum, count_name, util.get_file_suffix()))
                        webf.write(
                            '<span style="margin-left:2em"><a href="%s" target="_blank">Nelson rule %d; Hits: %d</a></span></br>\n'
                            % (nurl, rnum, len(rule_hits[i])))
                webf.write('<br/></td>\n')
                #Best to force a flush for partial readouts
                webf.flush()
                os.fsync(webf)
            webf.write('</tr>')
        util.close_pool()
        webf.write("""
</table>
</div>
</body>
</html>""")
    print("\033[KDone")
def get_lines_url(filename, *args, **kwargs):
    fn, url = util.get_web_fn('graphs', "lines",
                              '%s.html' % os.path.basename(filename))
    return url
def draw_igraph_phrases(n, t):
    fn, url = util.get_web_fn("networks", '%s_%s.html' % (n, t))
    phrases = {}
    for phrase, metrics in util.load_top_phrases(n, t=t):
        phrases[phrase] = metrics
    G = gen_igraph_network(phrases, n)
    labels = list(G.vs['label'])
    N = len(labels)
    E = [e.tuple for e in G.es]  # list of edges
    layt = G.layout('kk')  #kamada-kawai layout
    Xn = [layt[k][0] for k in range(N)]
    Yn = [layt[k][1] for k in range(N)]
    Xe = []
    Ye = []
    for e in E:
        Xe += [layt[e[0]][0], layt[e[1]][0], None]
        Ye += [layt[e[0]][1], layt[e[1]][1], None]
    trace1 = go.Scatter(x=Xe,
                        y=Ye,
                        mode='lines',
                        line=dict(color='rgb(210,210,210)', width=1),
                        hoverinfo='none')
    trace2 = go.Scatter(x=Xn,
                        y=Yn,
                        mode='markers',
                        name='ntw',
                        marker=dict(symbol='circle-dot',
                                    size=5,
                                    color='#6959CD',
                                    line=dict(color='rgb(50,50,50)', width=0.5)),
                        text=labels,
                        hoverinfo='text')
    axis = dict(
        showline=False,  # hide axis line, grid, ticklabels and title
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        title='')
    width = 800
    height = 800
    layout = go.Layout(
        title="Similarity for phrases with n=%s" % n,
        font=dict(size=12),
        showlegend=False,
        autosize=False,
        width=width,
        height=height,
        xaxis=go.layout.XAxis(axis),
        yaxis=go.layout.YAxis(axis),
        margin=go.layout.Margin(
            l=40,
            r=40,
            b=85,
            t=100,
        ),
        hovermode='closest',
        annotations=[
            dict(showarrow=False,
                 text='This igraph.Graph has the Kamada-Kawai layout',
                 xref='paper',
                 yref='paper',
                 x=0,
                 y=-0.1,
                 xanchor='left',
                 yanchor='bottom',
                 font=dict(size=14))
        ])
    data = [trace1, trace2]
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    print("Published to: %s" % url)
print("""Usage: python3 -m historical.draw_freq_dist <n>""") exit(-1) n = sys.argv[1] np.set_printoptions(threshold=sys.maxsize) yss = [ys for ys in util.iter_yearseason()] fig = go.Figure() b = 2 max_count = 0 for ys in yss: freqs = [r[0] for r in util.load_grams(n, ys)] max_count = max(max_count, len(freqs)) log_index = [ int(math.pow(b, i)) for i in range(int(math.log(max_count, b))) ] for ys in yss: freqs = [r[0] for r in util.load_grams(n, ys)] freqs2 = [freqs[i] if i < len(freqs) else 0 for i in log_index] fig.add_trace(go.Scatter(x=log_index, y=freqs2, mode='lines', name=ys)) fig.update_layout(xaxis_type="log") fn, url = util.get_web_fn('graphs', "lines", "aggregate", 'dist_%s.html' % n) # fn = os.path.join(WEB_DIR, 'graphs', "lines", "aggregate", 'dist_%s.html' % n) plotly.offline.plot(fig, filename=fn, auto_open=False) # url="%s/graphs/lines/aggregate/dist_%s.html" % (WEB_PREFIX, n) print("Published to: %s " % url)