def draw_networkx_phrases(n, t):
    fn, url = util.get_web_fn("graphs", "networks", '%s_%s.png' % (n, t))
    phrases = {}
    for phrase, metrics in util.load_top_phrases(n, t=t):
        phrases[phrase] = metrics
    G, G2 = gen_network(phrases, n)
    pos = nx.spring_layout(G)  # positions for all nodes

    elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.5]
    esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.5]

    # nodes
    nx.draw_networkx_nodes(G, pos, node_size=70)

    # edges
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2)
    #nx.draw_networkx_edges(G, pos, edgelist=esmall,width=2, alpha=0.5, edge_color='b', style='dashed')

    # labels
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')
    plt.axis('off')

    plt.savefig(fn)

    print("Published to: %s " % url)
def get_data_dump_url(folders, basename, **kwargs):
    """
    Has the same call signature and return value as create_data_dump,
    so it can be a drop-in replacement for testing
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    return nurl
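# A hedged usage sketch (the folder list and basename below are hypothetical),
# wrapped in a function so nothing runs at import time: resolve where a dump
# would be published without regenerating any figures.
def _example_get_data_dump_url():
    url = get_data_dump_url(["metrics", "slope"], "1-grams_top_slope_unique")
    print("Data dump would be published at %s" % url)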
def get_correlation(fn):
    nh_vals = get_originator_nh_vals(fn)
    o_vals, o_doms = zip(*get_originator_vals(fn))
    rank_num = list(range(min(len(o_vals), cap, len(nh_vals))))

    fn, url = util.get_web_fn("graphs", "originator_nh_correlations",
                              "%s%s.html" % (os.path.basename(fn), capN))
    corr = stats.pearsonr(o_vals[:len(rank_num)], nh_vals[:len(rank_num)])

    if True:  #not os.path.exists(fn):
        fig = go.Figure(
            data=go.Scatter(x=rank_num,
                            y=o_vals[:len(rank_num)],
                            text=o_doms,
                            mode='lines',
                            name="Originator"),
            layout=go.Layout(
                title="Originator vs Null Hypothesis (Pearson R=%f, p=%f)" %
                corr,
                xaxis_title="Domains",
                yaxis_title="Influence score"))
        fig.add_trace(
            go.Scatter(x=rank_num,
                       y=nh_vals[:len(rank_num)],
                       mode='lines',
                       name='Null Hypothesis'))
        plotly.offline.plot(fig, filename=fn, auto_open=False)

        print("Published at %s" % url)

    return corr
Example #4
def draw_html_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)
    ar = list(reversed(ar))
    l = list(reversed(l))

    fn, url = util.get_web_fn("graphs", "html_heatmaps", slugify(phrase))

    yss = [ys for ys in util.iter_yearseason()]

    if len(ar) == 0:
        sys.stderr.write("%s,%s has no values\n" % (n, phrase))
        return url

    with open(fn, "w+") as f:
        f.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
tr:nth-child(even) {background-color: #d2d2d2;}
</style>
</head>
<body>
""")

        f.write('<h3>Phrase: %s</h3>' % (phrase, ))
        f.write(
            '<h4><a href="%s" target="_blank">Graphical Heatmap</a></h4>\n' %
            (get_heatmap_url(n, phrase), ))
        f.write('<table>\n')
        headers = ["Domain"] + list(util.iter_yearseason())
        f.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
        for domId in range(len(ar)):
            f.write('<tr>\n')
            f.write('<td>%s</td>\n' % l[domId])
            for ysId in range(len(ar[0])):
                if ar[domId][ysId] == 1:
                    policy_url = "https://privacypolicies.cs.princeton.edu/fetch_policy.php?domain=%s&interval=%s_%s" % (
                        l[domId], yss[ysId][:4], yss[ysId][4])  #Policy URL
                    f.write('<td><a href="%s">%s</a></td>\n' %
                            (policy_url, yss[ysId]))
                else:
                    f.write('<td></td>\n')

        f.write('</table>\n')
        f.write("</body></html>")

    np.set_printoptions(opts)
    return url
Example #5
def draw_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)

    fn, url = util.get_web_fn("graphs", "heatmaps", slugify(phrase))

    yss = [ys for ys in util.iter_yearseason()]

    fig = go.Figure(data=go.Heatmap(z=ar,
                                    y=l,
                                    x=yss,
                                    colorscale=[(0, "#d0d3d4"), (1,
                                                                 "#1a5276")],
                                    showscale=False))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(opts)
    return url
Example #6
def get_correlation(fn):
    a_ranks = get_alexa_rankings()
    o_ranks = get_originator_rankings(fn)
    a_rank_num = [a_ranks[dom] for dom in o_ranks]
    o_rank_num = list(range(1,len(o_ranks)+1))

    fn, url = util.get_web_fn("graphs", "originator_correlations", "%s.html" % os.path.basename(fn))

    if True:#not os.path.exists(fn):
        fig = go.Figure(data=go.Scatter(x=o_rank_num, y=o_rank_num, text=o_ranks, mode='lines',name="Originator Rank"))
        fig.add_trace(go.Scatter(x=o_rank_num, y=a_rank_num,
                                 text=o_ranks,
                                 mode='lines',
                                 name='Alexa Rank'))
        plotly.offline.plot(fig, filename=fn, auto_open=False)
    
        print("Published at %s" % url)
    
    return stats.pearsonr(a_rank_num, o_rank_num)
Example #7
def get_correlation(fn1, fn2):
    o1_ranks = get_originator_rankings(fn1)
    o2_ranks = get_originator_rankings(fn2)

    if len(o1_ranks) > len(o2_ranks):
        tmp = o1_ranks, fn1
        o1_ranks = o2_ranks
        fn1 = fn2
        o2_ranks, fn2 = tmp
        print("Swapped")

    o2_rank_dict = {o2_ranks[i]: i + 1 for i in range(len(o2_ranks))}
    o1_rank_num = list(range(1, len(o1_ranks) + 1))
    o2_rank_num = [
        o2_rank_dict[d] if d in o2_rank_dict else len(o2_rank_dict)
        for d in o1_ranks
    ]

    fn, url = util.get_web_fn(
        "graphs", "co_originator_correlations",
        "%s-%s.html" % (os.path.basename(fn1), os.path.basename(fn2)))

    if True:  #not os.path.exists(fn):
        fig = go.Figure(data=go.Scatter(x=list(range(len(o1_rank_num))),
                                        y=o1_rank_num,
                                        text=o1_ranks,
                                        mode='lines',
                                        name=os.path.basename(fn1)))
        fig.add_trace(
            go.Scatter(x=list(range(len(o2_rank_num))),
                       y=o2_rank_num,
                       text=o1_ranks,
                       mode='lines',
                       name=os.path.basename(fn2)))
        plotly.offline.plot(fig, filename=fn, auto_open=False)

        print("Published at %s" % url)

    print(len(o1_rank_num), len(o2_rank_num))
    print(o1_rank_num)
    print(o2_rank_num)
    return stats.pearsonr(o1_rank_num, o2_rank_num)
Example #8
def draw_lines(filename, topX, freq_data=None):
    """
    Frequency data is an array of tuples. The first item is the phrase,
    the second item is the number of occurrences at each interval
    """

    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)

    fn, url = util.get_web_fn('graphs', "lines",
                              '%s.html' % os.path.basename(filename))
    if os.path.exists(fn):
        #        print("Skipping line drawing")
        return url

    yss = [ys for ys in util.iter_yearseason()]

    if freq_data is None:
        freq_data = get_frequency_data(filename, maxCount=topX)

    freq_data = list(freq_data)

    fig = go.Figure()

    #    layout=go.Layout(
    #            legend=dict(x=-.1, y=-0.5*(len(freq_data) // 10))
    #        )

    for label, data in freq_data:
        #wrappedlabel = textwrap.fill(label, 40)
        if len(label) >= 40:
            wrappedlabel = label[:37] + "..."
        else:
            wrappedlabel = label
        fig.add_trace(
            go.Scatter(x=yss, y=data, mode='lines', name=wrappedlabel))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(opts)
    return url
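# A hedged usage sketch for draw_lines; the phrases, per-interval counts, and
# CSV basename below are made up. Wrapped in a function so nothing runs at
# import time.
def _example_draw_lines():
    freq_data = [
        ("personal information", [3, 5, 8, 13]),
        ("third parties", [2, 2, 4, 7]),
    ]
    url = draw_lines("example_group_0.csv", -1, freq_data=freq_data)
    print("Line graph published at %s" % url)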
def do_analytics(gram_list):
    global SLOPE_COUNT_THRESH, SLOPE_PSEUDOCOUNT

    webfn, weburl = util.get_web_fn("raw", "analytics.html")
    logging.info("Access at %s" % weburl)

    #logging.info("Pre-loading Alexa results")
    #alexa.load()

    logging.info("Starting Counting intervals")
    num_intervals = len(list(util.iter_yearseason()))
    logging.info("Starting counting policies")
    domain_counts, policy_counts = get_policy_counts()

    average_policy_count = sum(policy_counts.values()) / len(policy_counts)
    policy_norm_factor = {
        yearseason: 1 / policy_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }

    average_domain_count = sum(domain_counts.values()) / len(domain_counts)
    domain_norm_factor = {
        yearseason: 1 / domain_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }

    with open(webfn, "w+") as webf:
        webf.write("""
<html>
    <head>
        <link rel="stylesheet" type="text/css" href="/styles/style.css"/>
    </head>
<body>
""")
        webf.write("""

<div class="query">
<iframe id="queryframe" src="https://privacypolicies.cs.princeton.edu/search_policies.php?basedir=%s">
</iframe>
</div>

<div class="metricsd"><table class="metricst">
        """ % util.testing_prefix)

        #Iterate phrase type
        for n in gram_list:
            ngram_name = "%s-grams" % n if n not in friendly_names else friendly_names[
                n]
            webf.write("""
<tr class="nhtr">
<td><h3>Data for %s</h3></td>
</tr>
<tr class="ntr">""" % ngram_name)
            logging.info("Loading top grams for %s-grams" % (n))
            gram_freq_by_countfxn = load_top_grams(n, domain_norm_factor,
                                                   policy_norm_factor,
                                                   domain_counts)
            logging.info("Done loading top grams for %s-grams" % (n))
            mem_trace()
            if gram_freq_by_countfxn is None:
                logging.error("No grams")
                continue
            gram_freq_raw = gram_freq_by_countfxn["raw"]
            #Iterate counting methods i.e. unique, total, or alexa-weighted domains
            for count_name, (countf,
                             count_friendly_name) in util.count_fxns.items():
                logging.info("\tBeginning scoring with %s" % (count_name))
                #Score based on various metrics
                gram_freq = gram_freq_by_countfxn[count_name]

                base_count = None
                if count_name == "total":
                    base_count = 1 / average_domain_count
                elif count_name == "unique":
                    base_count = 1 / average_policy_count
                elif count_name == "alexa":
                    base_count = alexa.average_traffic
                slope_count_thresh = get_slope_thresh(gram_freq, base_count)
                slope_pseudocount = base_count
                logging.info("\tSlope thresh for round is: %0.4E" %
                             slope_count_thresh)
                logging.info("\tSlope pseudocount for round is: %0.4E" %
                             slope_pseudocount)
                gram_scores = {mname: [] for mname in metrics}

                #Identifying phrases of interest based on metrics & rules
                for s in gram_freq:

                    vals = gram_freq[s]
                    for (mname, (score_fxn, hname)) in metrics.items():
                        heap = gram_scores[mname]
                        score = score_fxn(vals, slope_pseudocount,
                                          slope_count_thresh)
                        if score == -100000: continue
                        if len(heap) >= topN:
                            heapq.heappushpop(heap, (score, s))
                        else:
                            heapq.heappush(heap, (score, s))

                logging.info("\tDone scoring")
                webf.write('<td class="dataCb">\n')
                webf.write("<h4>Counted by %s</h4>\n" % (count_friendly_name))

                score_name_str = "Top %s by %%s counted by %s</h4>\n" % (
                    ngram_name, count_friendly_name)

                #Dump data
                for mname, (fsc, hname) in metrics.items():
                    heap = gram_scores[mname]
                    logging.info("\tSorting top values for %s" % (mname))

                    #Heaps aren't sorted, so sort the heap before output
                    #Popping repeatedly doesn't help here: each pop is O(log n) and we'd need n of them
                    heap = sorted(heap, reverse=True)
                    phrases = [s for sc, s in heap]

                    groups = [
                        phrases[10 * i:min(10 * (i + 1), len(phrases))]
                        for i in range(math.ceil(len(phrases) / 10))
                    ]
                    scores = {s: sc for sc, s in heap}

                    logging.info("\tStarting data dump for %s" % (mname))
                    nwebfn, nurl = create_data_dump(
                        n, ["metrics", mname],
                        "%s-grams_top_%s_%s%s" %
                        (n, mname, count_name, util.get_file_suffix()),
                        groups,
                        gram_freq,
                        gram_freq_raw,
                        score_name_str % hname,
                        scores=scores)
                    webf.write(
                        '<span style="margin-left:2em"><a href="%s" target="_blank">%s</a></span></br>\n'
                        % (nurl, hname))
                    print("\033[K\t\tGraphs created", end="\r")
                    webf.flush()

                webf.write('<br/></td>\n')

                #Best to force a flush for partial readouts
                webf.flush()
                os.fsync(webf)
                mem_trace()
                print("\033[K", end="\r")
                logging.info("\tDone dumping data")
                mem_count()

            webf.write('</tr>')
            util.close_pool()

        webf.write("""
</table>
</div>
</body>
        </html>""")
        logging.info("Done")
def create_data_dump(n,
                     folders,
                     basename,
                     groups,
                     gram_freq,
                     gram_freq_raw,
                     score_name_str,
                     scores=None,
                     gen_figs=False):
    """
    Take the given phrases and create:
    - A table of phrase, score (if available), occurrences
    - Line graphs for phrase occurrence
    - Heatmaps for phrases
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    group_to_graph = {}
    hit_to_graph = {}

    #Part one -- create CSV data files, heatmaps, and line graphs
    for group_num in range(len(groups)):
        group_hits = groups[group_num]
        nfn = os.path.join(util.OUT_DIR, "%s_%d.csv" % (basename, group_num))
        if gen_figs:
            #Line graph
            line_url = draw_lines.draw_lines(nfn, -1, ((hit, gram_freq[hit])
                                                       for hit in group_hits))
            logging.info("\t\tDone drawling lines for group %d" % (group_num))
            #Heatmp
            if n == 'w':
                html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                    n, group_hits)
            else:
                heatmap_urls, html_heatmap_urls = draw_heatmap.draw_html_and_reg_heatmaps(
                    n, group_hits, gram_freq_raw)
            logging.info("\033[K\t\tDone drawing heatmaps for group %d" %
                         (group_num))
        else:  #Skip drawing
            line_url = draw_lines.get_lines_url(nfn, -1)
            heatmap_urls = draw_heatmap.get_multiple_heatmap_urls(
                n, group_hits)
            html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                n, group_hits)
        group_to_graph[group_num] = line_url
        with open(nfn, "w+") as f:  #Write CSV data file for later use
            writer = csv.writer(f)
            for hit in group_hits:
                vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                writer.writerow([
                    hit.replace("\\", "\\\\").replace("\n", "\\n"), *vals_str
                ])
                #heatmap_url = heatmap_urls[hit]
                heatmap_url = html_heatmap_urls[hit]
                #                print(heatmap_url)
                hit_to_graph[hit] = heatmap_url
        logging.info("\t\tDone drawing group %d" % (group_num))

    #Part two -- create usable web page to explore
    with open(nwebfn, "w+") as nwebf:
        nwebf.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
</style>
</head>
<body>
""")
        pool = util.get_pool()
        all_hits = list(
            itertools.chain(*[[hit for hit in group] for group in groups]))

        plots = pool.starmap(miniplot.get_plot_as_img,
                             [(ys_list, gram_freq[hit]) for hit in all_hits])
        plots = list(plots)
        logging.info("\t\tDone drawing plots")

        #Break up by group
        pNum = 0
        for group, gid in zip(groups, range(len(groups))):
            if len(group) == 0: continue
            line_graph_url = group_to_graph[gid]
            nwebf.write('<h3>%s</h3>' % (score_name_str))
            nwebf.write(
                '<h4><a href="%s" target="_blank">Line graph for group %d</a></h4>\n'
                % (line_graph_url, gid))
            nwebf.write('<table>\n')
            headers = ["Score", "Phrase", "Bar Plot"] + list(
                util.iter_yearseason())
            if scores is None:
                headers = headers[1:]
            nwebf.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
            #Then phrase
            for hit in group:
                nwebf.write('<tr>\n')
                if scores is not None:
                    if abs(scores[hit]) < 0.01 and scores[hit] != 0:
                        nwebf.write('<td>%0.2E</td>' % scores[hit])
                    else:
                        nwebf.write('<td>%0.2f</td>' % scores[hit])
                nwebf.write('<td width="30%">\n')
                #vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                vals_str = [
                    ("%0.2E" if
                     (abs(gram_freq[hit][ysid]) != 0
                      and abs(gram_freq[hit][ysid]) < 0.01) else "%0.2f") %
                    (gram_freq[hit][ysid]) for ysid in range(num_intervals)
                ]

                heatmap_graph_url = hit_to_graph[hit]
                hit_link = '<a href="%s" target="_blank">%s</a>' % (
                    heatmap_graph_url, hit.replace("\\", "\\\\").replace(
                        "\n", "\\n"))

                plot = plots[
                    pNum]  # miniplot.get_plot_as_img(ys_list,gram_freq[hit])
                pNum += 1

                nwebf.write('</td><td>'.join([hit_link, plot,
                                              *vals_str]))  #TODO add miniplot
                nwebf.write('</td></tr>\n')
            logging.info("\t\tDone with table for group %d" % (gid))

            nwebf.write('</table>\n')
        nwebf.write("</body></html>")
    mem_count()
    return nwebfn, nurl
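# A hedged usage sketch for create_data_dump; the phrases, frequency series,
# and scores are hypothetical, and gram_freq_raw simply reuses the normalized
# values here. Wrapped in a function so nothing runs at import time.
def _example_create_data_dump():
    gram_freq = {"opt out": [0.10, 0.25, 0.40], "data broker": [0.00, 0.05, 0.30]}
    groups = [list(gram_freq.keys())]
    scores = {"opt out": 1.2, "data broker": 0.8}
    nwebfn, nurl = create_data_dump(1, ["metrics", "slope"], "example_dump",
                                    groups, gram_freq, gram_freq,
                                    "Top 1-grams by slope", scores=scores)
    print("Data dump published at %s" % nurl)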
Example #11
def get_html_heatmap_loc(n, phrase):
    """
    Returns the output path and URL for a phrase's HTML heatmap without drawing it.
    Placeholder for faster runtime until we have a more efficient heatmap generator
    """
    fn, url = util.get_web_fn("graphs", "html_heatmaps", slugify(phrase))
    return fn, url
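# A hedged sketch contrasting the placeholder with the full renderer: resolve a
# phrase's heatmap location without drawing it. The n value and phrase are
# hypothetical.
def _example_get_html_heatmap_loc():
    fn, url = get_html_heatmap_loc(1, "personal information")
    print("HTML heatmap for the phrase would live at %s (file %s)" % (url, fn))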
Example #12
def do_analytics():
    global SLOPE_COUNT_THRESH

    
    gram_list = list(range(start,stop))
    if SENTENCES:
        gram_list.append("s")
    if ENTITIES:
        gram_list.append("e")
        gram_list.append("u")
        gram_list.append("m")
        #gram_list.append("n")
    if WORDS:
        gram_list.append("w")
    
    webfn, weburl = util.get_web_fn("raw", "analytics.html")
    print("\033[KAccess at %s" % weburl)
    with open(webfn, "w+") as webf:
        webf.write("""
<html>
    <head>
        <link rel="stylesheet" type="text/css" href="/styles/style.css"/>
    </head>
<body>
""")
        webf.write("""

<div class="query">
<iframe id="queryframe" src="https://privacypolicies.cs.princeton.edu/search_policies.php?basedir=%s">
</iframe>
</div>

<div class="metricsd"><table class="metricst">
        """  % util.testing_prefix)

        #Iterate phrase type
        for n in gram_list:
            ngram_name = "%s-grams" % n if n not in friendly_names else friendly_names[n]
            webf.write("""
<tr class="nhtr">
<td><h3>Data for %s</h3></td>
</tr>
<tr class="ntr">""" % ngram_name)
            #Iterate counting methods i.e. unique, total, or alexa-weighted domains
            for count_name, (countf, count_friendly_name) in util.count_fxns.items():

                print("\033[K%s-grams:" % n)
                webf.write('<td class="dataCb">\n')
                #webf.write("<h4>Data for %s counted by %s</h4>\n" % (ngram_name, count_friendly_name))
                webf.write("<h4>Counted by %s</h4>\n" % (count_friendly_name))

                score_name_str = "Top %s by %%s counted by %s</h4>\n" % (ngram_name, count_friendly_name)

                #Dump data
                if BASE_METRICS:
                    #webf.write("Basic metrics:<br/>\n")
                    for mname, (fsc,faux,hname) in metrics.items():
                        nurl = get_data_dump_url(["metrics", mname], "%s-grams_top_%s_%s%s" % (n, mname, count_name, util.get_file_suffix()))
                        webf.write('<span style="margin-left:2em"><a href="%s" target="_blank">%s</a></span></br>\n' % (nurl, hname))
                        print("\033[K\t\tGraphs created",end="\r")
                        webf.flush()

                        
                #We can't combine these dumps because Nelson rules don't provide a score, whereas all of our metrics do
                if NELSON_RULES:
                    webf.write("Nelson Rules:<br/>\n")
                    for i in range(len(rule_hits)):

                        rnum = i+1 #1 indexed num
                        gen_page = (len(rule_hits[i]) <= 10 * topN)
                        gen_figs = (len(rule_hits[i]) <= 5 * topN)
                        if gen_page:

                            print("\033[K\tNelson rule %d; Hits: %d" % (rnum,len(rule_hits[i])))

                            hits = sorted(rule_hits[i])
                            print("\033[K\t\tSorted",end="\r")

                            #Might be better if we cluster everything?
                            group_ct = math.ceil(len(hits) / 10)

                            groups = cluster(hits, gram_freq, group_ct)
                            print("\033[K\t\tClustered",end="\r")

                            nwebfn, nurl = create_data_dump(["nelson"], "%s-grams_nelson-%d_%s%s" % (n, rnum, count_name, util.get_file_suffix()), groups, gram_freq_raw, score_name_str % hname, gen_figs=gen_figs)
                            print("\033[K\t\tGraphs created",end="\r")
                        else:
                            print("\033[K\tNelson rule %d; Hits: %d (skipped)" % (rnum,len(rule_hits[i])))
                            nurl = get_data_dump_url(["nelson"], "%s-grams_nelson-%d_%s%s" % (n, rnum, count_name, util.get_file_suffix()))
                        webf.write('<span style="margin-left:2em"><a href="%s" target="_blank">Nelson rule %d; Hits: %d</a></span></br>\n' % (nurl, rnum, len(rule_hits[i])))

                webf.write('<br/></td>\n')
                
                #Best to force a flush for partial readouts
                webf.flush()
                os.fsync(webf)

            
            webf.write('</tr>')
            util.close_pool()



        webf.write("""
</table>
</div>
</body>
        </html>""")
        print("\033[KDone")
Example #13
def get_lines_url(filename, *args, **kwargs):
    fn, url = util.get_web_fn('graphs', "lines",
                              '%s.html' % os.path.basename(filename))
    return url
def draw_igraph_phrases(n, t):
    fn, url = util.get_web_fn("networks", '%s_%s.html' % (n, t))
    phrases = {}
    for phrase, metrics in util.load_top_phrases(n, t=t):
        phrases[phrase] = metrics
    G = gen_igraph_network(phrases, n)

    labels = list(G.vs['label'])
    N = len(labels)
    E = [e.tuple for e in G.es]  # list of edges
    layt = G.layout('kk')  #kamada-kawai layout

    Xn = [layt[k][0] for k in range(N)]
    Yn = [layt[k][1] for k in range(N)]
    Xe = []
    Ye = []
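    # None entries break the line between edges so plotly draws disjoint segments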
    for e in E:
        Xe += [layt[e[0]][0], layt[e[1]][0], None]
        Ye += [layt[e[0]][1], layt[e[1]][1], None]

    trace1 = go.Scatter(x=Xe,
                        y=Ye,
                        mode='lines',
                        line=dict(color='rgb(210,210,210)', width=1),
                        hoverinfo='none')
    trace2 = go.Scatter(x=Xn,
                        y=Yn,
                        mode='markers',
                        name='ntw',
                        marker=dict(symbol='circle-dot',
                                    size=5,
                                    color='#6959CD',
                                    line=dict(color='rgb(50,50,50)',
                                              width=0.5)),
                        text=labels,
                        hoverinfo='text')

    axis = dict(
        showline=False,  # hide axis line, grid, ticklabels and  title
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        title='')

    width = 800
    height = 800
    layout = go.Layout(
        title="Similarity for phrases with n=%s" % n,
        font=dict(size=12),
        showlegend=False,
        autosize=False,
        width=width,
        height=height,
        xaxis=go.layout.XAxis(axis),
        yaxis=go.layout.YAxis(axis),
        margin=go.layout.Margin(
            l=40,
            r=40,
            b=85,
            t=100,
        ),
        hovermode='closest',
        annotations=[
            dict(showarrow=False,
                 text='This igraph.Graph has the Kamada-Kawai layout',
                 xref='paper',
                 yref='paper',
                 x=0,
                 y=-0.1,
                 xanchor='left',
                 yanchor='bottom',
                 font=dict(size=14))
        ])

    data = [trace1, trace2]
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=fn, auto_open=False)

    print("Published to: %s " % url)
Example #15
        print("""Usage:
python3 -m historical.draw_freq_dist <n>""")
        exit(-1)
    n = sys.argv[1]

    np.set_printoptions(threshold=sys.maxsize)

    yss = [ys for ys in util.iter_yearseason()]

    fig = go.Figure()

    b = 2
    max_count = 0
    for ys in yss:
        freqs = [r[0] for r in util.load_grams(n, ys)]
        max_count = max(max_count, len(freqs))
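    # Sample ranks at powers of b so points are evenly spaced on the log x axis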
    log_index = [
        int(math.pow(b, i)) for i in range(int(math.log(max_count, b)))
    ]
    for ys in yss:
        freqs = [r[0] for r in util.load_grams(n, ys)]
        freqs2 = [freqs[i] if i < len(freqs) else 0 for i in log_index]
        fig.add_trace(go.Scatter(x=log_index, y=freqs2, mode='lines', name=ys))
    fig.update_layout(xaxis_type="log")
    fn, url = util.get_web_fn('graphs', "lines", "aggregate",
                              'dist_%s.html' % n)
    #    fn = os.path.join(WEB_DIR, 'graphs', "lines", "aggregate", 'dist_%s.html' % n)
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    #    url="%s/graphs/lines/aggregate/dist_%s.html" % (WEB_PREFIX, n)
    print("Published to: %s " % url)