Example #1
def nearby_probes_diff_ldns(svl, rmask=16):

    print("reducing svl to only probes with public ldns")
    svl = [sv for sv in svl if ipp.is_public(sv.get_ldns())]
    print("getting asn descriptor list...")
    asvl = vv.asn_svl(svl)

    nearbies = list()
    strnearbies = list()

    for asn in asvl:
        if asn is None:
            continue
        ssvl = vv.subnet_svl(asvl[asn], 16)
        for subnet in ssvl:
            if subnet is None:
                continue
            if len(ssvl[subnet]) > 1:
                csvl = vv.country_svl(ssvl[subnet])
                for country in csvl:
                    if country is None:
                        continue
                    if len(csvl[country]) > 1:
                        osvl = vv.owner_svl(csvl[country])
                        for owner in osvl:
                            if owner is None:
                                continue
                            if len(osvl[owner]) > 1:
                                resolvers = [z.get_ldns() for z in osvl[owner]]
                                # collapse redundant resolvers in the same
                                # subnet (e.g., 8.8.8.8 and 8.8.4.4 -> 8.8.x.x)
                                fmtmask = ipp.make_v4_prefix_mask(rmask)
                                r2 = defaultdict(list)
                                for r in resolvers:
                                    r2[r & fmtmask].append(r)
                                # keep every resolver; k2 tracks masked prefixes shared by more than one probe
                                keep = list()
                                k2 = list()
                                for z in r2:
                                    if len(r2[z]) > 1:
                                        # // keep += r2[z]
                                        k2.append(z)
                                    keep += r2[z]
                                #if len(k2) > 0:
                                if len(r2) > 1:
                                    print("has different resolvers!!!")
                                if len(keep) > 1:  # if there's stuff to compare
                                    print("found some!!")
                                    print("asn: " + str(asn))
                                    svl = [sv for sv in osvl[owner] if sv.get_ldns() \
                                            in keep]
                                    nearbies.append(svl)
                                    strnearbies.append(["|~"+ipp.int2ip(z.get_ldns())+\
                                            "__"+ipp.int2ip(z.get_ip())+"~|" \
                                            for z in svl])
    df.overwrite(plotsdir + "nearbies.csv", df.list2col(strnearbies))
    logger.warning("nearbies: " + str(len(nearbies)))
    return nearbies
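The resolver-collapsing step above groups local DNS IPs that share the same masked prefix. Below is a minimal self-contained sketch of that idea, assuming ipp.make_v4_prefix_mask(rmask) builds a bitmask covering the top rmask bits of an IPv4 address (the helper here is a stand-in, not the real ipp module):

from collections import defaultdict

def make_v4_prefix_mask(rmask):
    # top `rmask` bits set, e.g. rmask=16 -> 0xFFFF0000
    return ((1 << rmask) - 1) << (32 - rmask)

def collapse_resolvers(resolvers, rmask=16):
    # group integer resolver IPs that fall in the same /rmask block
    fmtmask = make_v4_prefix_mask(rmask)
    groups = defaultdict(list)
    for r in resolvers:
        groups[r & fmtmask].append(r)
    return groups

# example: 8.8.8.8 and 8.8.4.4 collapse into the same /16 block (8.8.x.x)
ips = [0x08080808, 0x08080404, 0x01010101]
print(collapse_resolvers(ips).values())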
Example #2
def get_ansset_sizes(start_time, fname="", **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param fname: string to be appended to end of output file name
    :param **kwas: keyword arguments for vv.get_svl()

    output:
        -> csv listing each domain and the size of its observed answer set,
           sorted from smallest to largest
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svl, fmt, anssets = vv.get_svl(**kwas)
    anssets = sorted([(z, len(anssets[z])) for z in anssets],
                     key=lambda p: p[1])
    df.overwrite(plotsdir + "big_ansset" + fname + ".csv",
                 df.list2col(anssets))
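The sort above orders domains by the size of their observed answer set before writing them out; a tiny self-contained illustration (anssets here is a toy stand-in mapping a domain to the set of IPs seen for it):

anssets = {
    "a.example": {"1.1.1.1"},
    "b.example": {"2.2.2.2", "3.3.3.3", "4.4.4.4"},
    "c.example": {"5.5.5.5", "6.6.6.6"},
}
sizes = sorted([(dom, len(ips)) for dom, ips in anssets.items()],
               key=lambda p: p[1])
# -> [('a.example', 1), ('c.example', 2), ('b.example', 3)]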
Example #3
def plot_examine_diff_diff(start_time, fname="", loops=2, gap=0,
        thresh=10, **kwas):
    '''
    lines:  1) domain independent cdf of ALL matches
            2-n) cdf of matches for domain with answer space > thresh
            n-m) cdf of matches for ALL domains with answer space < thresh
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svld, allsvl, allfmt, anssets = mv.arrange_self_data(start_time, gap, loops, **kwas)

    sm = mv.examine_diff_diff(svld)
    vals = [[], []]
    labels = ['all', 'small']
    for dom in sm:
        vals[0] = vals[0] + sm[dom]
        if len(anssets[dom]) < thresh:
            vals[1] += sm[dom]
        else:
            vals.append(sm[dom])
            labels.append(dom)

    fig, ax = plt.subplots(1, 1)
    for i in xrange(0, len(vals)):
        print "*****************"+labels[i]+"*********************"
        print vals[i]
        ecdf = ECDF(vals[i])
        x = list(ecdf.x)
        y = list(ecdf.y)
        ax.plot(x, y, label=labels[i])
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("diff mask match by domain")
    plt.ylabel("CDF of clients")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"diff_mask"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    for i in xrange(0, len(vals)):
        outstr = df.overwrite(plotsdir+labels[i]+'_diff_jaccard.csv',
                df.list2col(vals[i]))
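The plotting pattern above builds an empirical CDF per label and draws one line each; a minimal runnable sketch of the same pattern, assuming statsmodels and matplotlib are installed:

import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF

vals = [0.2, 0.4, 0.4, 0.7, 0.9, 1.0]
ecdf = ECDF(vals)
fig, ax = plt.subplots(1, 1)
ax.plot(list(ecdf.x), list(ecdf.y), label="example")
ax.set_xlabel("diff mask match by domain")
ax.set_ylabel("CDF of clients")
ax.legend()
fig.savefig("ecdf_example.png", bbox_inches='tight')
plt.close(fig)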
Example #4
def get_domain_matrix(start_time, fname="", **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param **kwas: keyword arguments for vv.get_svl()
    :return: (m) matrix of client pairs vs domains,
             (fmt) list of domains

    other outputs:
        -> csv with pairs vs domains matrix (m)
        -> csv with list of domain pair correlations (corrs)
        -> csv with list of mean Jaccard for each domain (means)
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svl, fmt, anssets = vv.get_svl(**kwas)
    print "svl len", len(svl)
    combs = fact(len(svl)) / (fact(2) * fact(len(svl) - 2))
    m = np.zeros((combs, len(fmt)))
    p = 0
    for i in xrange(0, len(svl) - 1):
        a = svl[i]
        logger.warning(str(i) + ", " + str(a.get_id()))
        aset = dict()
        for dom in a:
            aset[dom] = set(a[dom])
        for j in xrange(i + 1, len(svl)):
            b = svl[j]
            for k in xrange(0, len(fmt)):
                dom = fmt[k]
                domtotal = sum([a[dom][z] for z in a[dom]]) + sum(
                    [b[dom][z] for z in b[dom]])
                overlap = aset[dom].intersection(b[dom])
                weight = 0
                for z in overlap:
                    weight += (a[dom][z] + b[dom][z])
                m[p, k] = float(weight) / domtotal
            p += 1

    df.overwrite(plotsdir + "dommatrix" + fname + ".csv",
                 df.list2line(fmt) + "\n")
    df.append(plotsdir + "dommatrix" + fname + ".csv", df.list2col(m))

    C = np.corrcoef(m, rowvar=False)
    corrs = list()
    for i in xrange(0, len(fmt) - 1):
        for j in xrange(i + 1, len(fmt)):
            corrs.append((fmt[i] + "_" + fmt[j], C[i, j]))
    corrs = sorted([y for y in corrs if not math.isnan(y[1])],
                   key=lambda z: z[1])
    means = sorted(zip(fmt, np.mean(m, axis=0)), key=lambda z: z[1])

    df.overwrite(plotsdir + "domcorr" + fname + ".csv", df.list2col(corrs))
    df.overwrite(plotsdir + "dommean" + fname + ".csv", df.list2col(means))

    meand = dict(means)
    # get mean jaccard vs # IPs seen
    mj_ni = [(meand[dom], len(anssets[dom])) for dom in meand]
    d_mj_ni = sorted([(dom, meand[dom], len(anssets[dom])) for dom in meand],
                     key=lambda z: z[1])
    df.overwrite(plotsdir + "jaccard_vs_ipspace" + fname + ".csv",
                 df.list2col(d_mj_ni))

    fig, ax = plt.subplots(1, 1)

    colors = iter(cm.rainbow(np.linspace(0, 1, len(mj_ni))))
    for x, y in mj_ni:
        ax.scatter(x, y, color=next(colors))
    plt.xlabel("mean jaccard")
    plt.ylabel("# IPs observed")
    ax.grid(b=True, which='major', color='b', linestyle='-')
    ps.set_dim(fig, ax, ylog=True)
    filename = plotsdir + "jaccard_vs_ipspace" + fname
    fig.savefig(filename + '.png', bbox_inches='tight')
    fig.savefig(filename + '.pdf', bbox_inches='tight')
    plt.show()
    plt.close(fig)

    return m, fmt
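Each matrix cell above is a weighted Jaccard similarity: the summed counts of IPs both clients saw for a domain, divided by the summed counts of everything either client saw for it. A self-contained sketch with toy answer-count dicts:

def weighted_jaccard(a_counts, b_counts):
    # a_counts / b_counts: {ip: number of times that client saw the ip}
    total = float(sum(a_counts.values()) + sum(b_counts.values()))
    overlap = set(a_counts) & set(b_counts)
    weight = sum(a_counts[ip] + b_counts[ip] for ip in overlap)
    return weight / total if total > 0 else 0.0

a = {"1.1.1.1": 3, "2.2.2.2": 1}
b = {"1.1.1.1": 2, "3.3.3.3": 2}
print(weighted_jaccard(a, b))  # (3+2) / (4+4) = 0.625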
Example #5
def closest_diff_desc(start_time, fname="", xlim=[.6, 1.0], **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param fname: string to be appended to end of plot file name
    :returns: dict mapping probe id to its entry in svl

    gets pairwise closeness of probes with different descriptors (ASN, prefix)
    to find odd behavior (probes in different descriptors with high closeness
    scores); the per-pair closeness lists are written to csv files

    NOTE: writes data to files for convenience
    NOTE: accepts vv.get_svl keyword params
    '''
    print("getting svl...")
    kwas['start_time'] = start_time
    svl, __, __, ccache = vv.get_svl(**kwas)
    logger.warning("svl len: " + str(len(svl)))

    print("getting descriptor lists...")
    csvl = vv.country_svl(svl)
    asvl = vv.asn_svl(svl)
    ssvl = vv.subnet_svl(svl)
    psvl = vv.prefix_svl(svl)

    # {idA_asnB: [closeness]}
    idc = defaultdict(list)
    # {idA_idB: closeness}
    iic = dict()
    # {asnA_asnB: [closeness]}
    ddc = defaultdict(list)
    print("\n\ncalculating closeness for ASNs...")
    asns = [c for c in asvl if len(asvl[c]) > 1]
    for i in xrange(0, len(asns) - 1):
        print(asns[i], end=", ")
        sys.stdout.flush()
        for a in asvl[asns[i]]:
            for j in xrange(i + 1, len(asns)):
                for b in asvl[asns[j]]:
                    closeness = ccache[a][b]
                    ad = str(a.get_asn())
                    bd = str(b.get_asn())
                    aid = str(a.get_id())
                    bid = str(b.get_id())
                    dist = em.latlong_distance_km(a.get_coordinates(),
                                                  b.get_coordinates())
                    dist = distance(closeness, dist)
                    idc[aid + "_" + bd].append((closeness, dist))
                    idc[bid + "_" + ad].append((closeness, dist))
                    iic["_".join(sorted([aid, bid]))] = (closeness, dist)
                    ddc["_".join(sorted([ad, bd]))].append((closeness, dist))
    ccache.dump()


    idac = sorted([(k, np.mean([q[0] for q in idc[k]]), np.mean([q[1] for q in \
            idc[k]])) for k in idc], key=lambda z: z[2], reverse=True)
    idac = [(z[0], z[1]) for z in idac]
    filename = plotsdir + "asn_idac" + fname + ".csv"
    df.overwrite(filename, df.list2col(idac))

    ddac = sorted([(k, np.mean([q[0] for q in ddc[k]]), np.mean([q[1] for q in \
            ddc[k]])) for k in ddc], key=lambda z: z[2], reverse=True)
    ddac = [(z[0], z[1]) for z in ddac]
    filename = plotsdir + "asn_ddac" + fname + ".csv"
    df.overwrite(filename, df.list2col(ddac))

    iic = sorted([(k, iic[k][0], iic[k][1]) for k in iic],
                 reverse=True,
                 key=lambda z: z[2])
    iic = [(z[0], z[1]) for z in iic]
    filename = plotsdir + "asn_iic" + fname + ".csv"
    df.overwrite(filename, df.list2col(iic))

    # {idA_prefixB: [closeness]}
    idc = defaultdict(list)
    # {idA_idB:closeness}
    iic = dict()
    # {prefixA_prefixB: [closeness]}
    ddc = defaultdict(list)
    print("\n\ncalculating closeness for prefixes...")
    prefixes = [c for c in psvl if len(psvl[c]) > 1]
    for i in xrange(0, len(prefixes) - 1):
        print(prefixes[i], end=", ")
        sys.stdout.flush()
        for a in psvl[prefixes[i]]:
            for j in xrange(i + 1, len(prefixes)):
                for b in psvl[prefixes[j]]:
                    closeness = ccache[a][b]
                    ad = str(a.get_prefix())
                    bd = str(b.get_prefix())
                    aid = str(a.get_id())
                    bid = str(b.get_id())
                    dist = em.latlong_distance_km(a.get_coordinates(),
                                                  b.get_coordinates())
                    dist = distance(closeness, dist)
                    idc[aid + "_" + bd].append((closeness, dist))
                    idc[bid + "_" + ad].append((closeness, dist))
                    iic["_".join(sorted([aid, bid]))] = (closeness, dist)
                    ddc["_".join(sorted([ad, bd]))].append((closeness, dist))
    ccache.dump()


    idac = sorted([(k, np.mean([q[0] for q in idc[k]]), np.mean([q[1] for q in \
            idc[k]])) for k in idc], key=lambda z: z[2], reverse=True)
    idac = [(z[0], z[1]) for z in idac]
    filename = plotsdir + "prefix_idac" + fname + ".csv"
    df.overwrite(filename, df.list2col(idac))

    ddac = sorted([(k, np.mean([q[0] for q in ddc[k]]), np.mean([q[1] for q in \
            ddc[k]])) for k in ddc], key=lambda z: z[2], reverse=True)
    ddac = [(z[0], z[1]) for z in ddac]
    filename = plotsdir + "prefix_ddac" + fname + ".csv"
    df.overwrite(filename, df.list2col(ddac))

    iic = sorted([(k, iic[k][0], iic[k][1]) for k in iic],
                 reverse=True,
                 key=lambda z: z[2])
    iic = [(z[0], z[1]) for z in iic]
    filename = plotsdir + "prefix_iic" + fname + ".csv"
    df.overwrite(filename, df.list2col(iic))

    svd = dict()
    for sv in svl:
        svd[sv.get_id()] = sv

    return svd
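em.latlong_distance_km is presumably a great-circle distance between two (lat, lon) coordinate pairs; a haversine sketch under that assumption (the real helper may differ):

import math

def latlong_distance_km(a, b):
    # haversine distance between (lat, lon) pairs given in degrees
    lat1, lon1 = map(math.radians, a)
    lat2, lon2 = map(math.radians, b)
    h = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371.0 * math.asin(math.sqrt(h))

print(latlong_distance_km((48.85, 2.35), (51.51, -0.13)))  # Paris-London, ~344 km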
Example #6
def plot_measure_expansion(start_time, fname="", loops=31, gap=0, thresh=10,
        **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param gap: the gap (in seconds) between each iteration's dataset
    :param loops: the number of iterations (datasets)
    :param **kwas: keyword arguments for vv.get_svl()

    line plot:
        x -> n (such that it represents the nth iteration)
        y -> # of new IPs observed by a client on nth iteration
        line -> each line corresponds to one domain
    '''

    svld, allsvl, allfmt, anssets = mv.arrange_self_data(start_time, gap, loops, **kwas)
    keys = svld.keys()

    counts = mv.measure_expansion(svld)
    domvals = defaultdict(lambda: defaultdict(list))
    allvals = defaultdict(list)
    smallvals = defaultdict(list)

    for c in counts:
        for dom in c:
            for i, val in enumerate(c[dom]['ratio']):
                allvals[i].append(val)
                if len(anssets[dom]) < thresh:
                    smallvals[i].append(val)
                else:
                    domvals[dom][i].append(val)

    labels = ['all', 'small'] + domvals.keys()
    vals = list()
    for i in labels:
        vals.append([])
    for i in sorted(allvals.keys()):
        vals[0].append(np.mean(allvals[i]))
        vals[1].append(np.mean(smallvals[i]))
        for j, dom in enumerate(labels[2:]):
            vals[j+2].append(np.mean(domvals[dom][i]))

    fig, ax = plt.subplots(1, 1)
    marker = ps.get_markers()
    style = ps.get_styles()
    for i in xrange(0, len(vals)):
        ax.plot(vals[i], label=labels[i], fillstyle='full', marker=next(marker),
                markerfacecolor='white', markevery=6, linestyle=next(style))
    ps.set_dim(fig, ax)
    plt.xlabel("cycle #")
    plt.ylabel("# new IPs / ans. size")
    ax.grid(b=True, which='major', color='b', linestyle='-')
    lgd = ps.legend_setup(ax, 3, "top right", True)
    filename = plotsdir+"expansion"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    for i in xrange(0, len(vals)):
        outstr = df.overwrite(plotsdir+labels[i]+'newvssize.csv',
                df.list2col(vals[i]))
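mv.measure_expansion is not shown here; judging from the y-axis label ("# new IPs / ans. size"), the plotted ratio is roughly the number of IPs a client sees for the first time in an iteration, divided by the domain's answer-set size. An illustrative stand-in:

def expansion_ratios(rounds, ansset_size):
    # rounds: list of sets of IPs observed in each successive iteration
    seen = set()
    ratios = []
    for r in rounds:
        new = r - seen
        ratios.append(len(new) / float(ansset_size))
        seen |= r
    return ratios

rounds = [{"a", "b"}, {"b", "c"}, {"c", "d", "e"}]
print(expansion_ratios(rounds, ansset_size=5))  # [0.4, 0.2, 0.4]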
Example #7
def plot_self_match(start_time, duration, fname="", loops=7, gap=0, thresh=10,
        **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param gap: the gap (in seconds) between each iteration's dataset
    :param loops: the number of iterations (datasets) / 2 (since we need 2 per
        comparison in this case)
    :param **kwas: keyword arguments for vv.get_svl()

    plots CDF:
        x -> closeness to self for back-to-back iteration windows (e.g., days 1-2
        vs days 3-4, days 5-6 vs days 7-8, ...)
        y -> CDF of clients

    NOTE: plot 3.2
    '''
    valsd = defaultdict(list) # {pid: [vals]}
    bigsvld = dict()
    kwas['duration'] = duration
    kwas['return_ccache'] = False
    for i in xrange(0, loops):
        tmp_start = start_time+2*(gap+duration)*i
        print (tmp_start, duration)
        svld, allsvl, allfmt, anssets = mv.arrange_self_data(tmp_start, gap, 2, **kwas)

        pids, vals = mv.self_match(svld)
        for pid, val in zip(pids, vals):
            valsd[pid].append(val)
            bigsvld[pid] = svld[pid]

    results = list()
    for pid in valsd:
        results.append((pid, np.mean(valsd[pid])))

    results = sorted(results, key=lambda z: z[1], reverse=True)

    fig, ax = plt.subplots(1, 1)
    ecdf = ECDF([z[1] for z in results])
    x = list(ecdf.x)
    y = list(ecdf.y)
    ax.plot(x, y, color='k')
    ax.axvline(np.median(x), color='r', linestyle='--')
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("closeness to self")
    plt.ylabel("CDF of clients")
    filename = plotsdir+"self_closeness"+fname
    fig.savefig(filename+'.png', bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_inches='tight')
    plt.close(fig)

    labels = defaultdict(list)
    for pid, val in results:
        labels['countries'].append((val, bigsvld[pid][0].get_country()))
        labels['subnets'].append((val, bigsvld[pid][0].get_subnet()))
        labels['prefixes'].append((val, bigsvld[pid][0].get_prefix()))
        labels['asns'].append((val, bigsvld[pid][0].get_asn()))
        labels['resolvers'].append((val, bigsvld[pid][0].get_ldns()))

    for k in labels:
        data = sorted([(y, np.mean([z[0] for z in labels[k] if z[1] == y]),
                len([z[0] for z in labels[k] if z[1] == y])) \
                for y in set([v[1] for v in labels[k]])], key=lambda x: x[1])
        df.overwrite(plotsdir+'self_closeness_'+k+fname+'.csv',
                df.list2col(data))

    print "saving data..."
    outstr = df.overwrite(plotsdir+'self_closeness'+fname+'.csv',
            df.list2col(results))
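The label aggregation near the end groups each probe's mean self-closeness by a descriptor value (country, subnet, prefix, ASN, resolver) and records the group mean and size; a compact toy equivalent:

from collections import defaultdict
import numpy as np

pairs = [(0.9, "US"), (0.7, "US"), (0.5, "DE")]  # (closeness, country)
by_label = defaultdict(list)
for val, lab in pairs:
    by_label[lab].append(val)
data = sorted([(lab, np.mean(v), len(v)) for lab, v in by_label.items()],
              key=lambda x: x[1])
# -> [('DE', 0.5, 1), ('US', 0.8, 2)]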
Example #8
def plot_closeness_same_desc(start_time, duration, fname="", xlim=[.6, 1.0], rmask=16,
        loops=31, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b],
    :param rmask: mask for resolver IPs
    :param **kwas: keyword arguments for vv.get_svl()

    for each descriptor (ASN, country, registered prefix, /24 subnet), plot the
    CDF of the pairwise closeness of clients, such that the clients in a pair
    come from the same groups in the descriptor (e.g., same country for the
        country descriptor)

    NOTE: plot 4.1
    '''
    lvals = list()
    cvals = list()
    avals = list()
    svals = list()
    pvals = list()
    kwas['duration'] = duration
    for l in xrange(0, loops):
        print "getting svl..."
        kwas['start_time'] = start_time+duration*l
        svl, fmt, __, ccache = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))

        print "getting descriptor lists..."
        csvl = vv.country_svl(svl)
        asvl = vv.asn_svl(svl)
        ssvl = vv.subnet_svl(svl)
        #osvl = vv.owner_svl(svl)
        psvl = vv.prefix_svl(svl)
        lsvl = vv.ldns_svl(svl, rmask, False)
        fmtmask = ipp.make_v4_prefix_mask(rmask)
        to_remove = [
                '208.67.222.123',   # OpenDNS
                '208.67.220.123',
                '8.8.8.8',          # Google Public DNS
                '8.8.4.4',
                '64.6.64.6',        # Verisign
                '64.6.65.6']
        # remove massive public DNS providers
        for ip in to_remove:
            tmp = ipp.ip2int(ip) & fmtmask
            if tmp in lsvl:
                del lsvl[tmp]

        print "calculating closeness for resolvers..."
        resolvers = lsvl.keys()
        for k in resolvers:
            ksvl = lsvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    lvals.append(ccache[ksvl[a]][ksvl[b]])

        print "calculating closeness for countries..."
        countries = csvl.keys()
        for k in countries:
            ksvl = csvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    cvals.append(ccache[ksvl[a]][ksvl[b]])
        print "calculating closeness for ASNs..."
        asns = asvl.keys()
        for k in asns:
            ksvl = asvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    avals.append(ccache[ksvl[a]][ksvl[b]])
        print "calculating closeness for subnets..."
        subnets = ssvl.keys()
        for k in subnets:
            ksvl = ssvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    svals.append(ccache[ksvl[a]][ksvl[b]])
        '''
        print "calculating closeness for owners..."
        ovals = list()
        owners = osvl.keys()
        for k in owners:
            ksvl = osvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    ovals.append(ccache[ksvl[a]][ksvl[b]])
        '''
        print "calculating closeness for prefixes..."
        prefixes = psvl.keys()
        for k in prefixes:
            ksvl = psvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    pvals.append(ccache[ksvl[a]][ksvl[b]])

    print "plotting..."
    #vals = [cvals, avals, svals, ovals, pvals]
    #labels = ['country', 'ASN', 'subnet', 'owner', 'prefix']
    vals = [cvals, avals, svals, pvals, lvals]
    labels = ['country', 'ASN', 'subnet', 'prefix', 'resolver']

    fig, ax = plt.subplots(1, 1)
    for i in xrange(0, len(vals)):
        print type(vals[i][0])
        print labels[i], "\n"
        print len(vals[i])
        ecdf = ECDF(np.array(vals[i]))
        x = list(ecdf.x)
        y = list(ecdf.y)
        ax.plot(x, y, label=labels[i])
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("pairwise probe closeness")
    plt.ylabel("CDF of pairs")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"closeness_same_desc"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    for i in xrange(0, len(vals)):
        outstr = df.overwrite(plotsdir+labels[i]+'_same.csv',
                df.list2col(vals[i]))
    ccache.dump()
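Each descriptor loop above enumerates all unordered pairs within a group via nested index loops; itertools.combinations expresses the same traversal more directly (a sketch, with a toy dict standing in for the real closeness cache):

from itertools import combinations

def group_pair_values(group, ccache):
    # closeness of every unordered pair of probes within one descriptor group
    return [ccache[a][b] for a, b in combinations(group, 2)]

ccache = {"p1": {"p2": 0.9, "p3": 0.4}, "p2": {"p3": 0.6}}
print(group_pair_values(["p1", "p2", "p3"], ccache))  # [0.9, 0.4, 0.6]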
Example #9
def plot_closeness(start_time, duration, fname="", xlim=[.6, 1.0], loops=15, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b],
    :param loops: number of time blocks
    :param **kwas: keyword arguments for vv.get_svl()

    plots:
        1) CDF for pairwise closeness of each pair
        2) CDF for the average pairwise closeness experienced by each probe
        across all other probes

    NOTE: plot 3.1
    '''
    means = defaultdict(list)
    vals = list()
    kwas['duration'] = duration
    for l in xrange(0, loops):
        print "getting svl..."
        kwas['start_time'] = start_time+duration*l
        svl, __, __, ccache = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        print len(svl)

        print "calculating closeness for resolvers..."
        for i in xrange(0, len(svl)-1):
            for j in xrange(i + 1, len(svl)):
                vals.append(ccache[svl[i]][svl[j]])
                means[svl[i].get_id()].append(vals[-1])
                means[svl[j].get_id()].append(vals[-1])
        ccache.dump()
        del ccache, svl, __
        gc.collect()

    print "plotting..."
    fig, ax = plt.subplots(1, 1)

    ecdf = ECDF(vals)
    x = list(ecdf.x)
    y = list(ecdf.y)
    ax.plot(x, y, label="pairwise")

    ecdf = ECDF([np.mean(means[z]) for z in means])
    x = list(ecdf.x)
    y = list(ecdf.y)
    ax.plot(x, y, label="average (per client)")

    ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim)
    plt.xlabel("pairwise probe closeness")
    plt.ylabel("CDF of pairs")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"overall_closeness"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    df.overwrite(plotsdir+'overall_closeness'+fname+'.csv',
        df.list2col(vals))
    df.overwrite(plotsdir+'overall_avg_closeness'+fname+'.csv',
        df.list2col([(z, np.mean(means[z])) for z in means]))
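Every pairwise closeness value is credited to both probes in the pair, so the "average (per client)" curve is each probe's mean over all pairs it appears in; a toy sketch of that bookkeeping:

from collections import defaultdict
import numpy as np

pair_vals = {("p1", "p2"): 0.9, ("p1", "p3"): 0.5, ("p2", "p3"): 0.7}
means = defaultdict(list)
for (a, b), v in pair_vals.items():
    means[a].append(v)
    means[b].append(v)
print(dict((k, np.mean(v)) for k, v in means.items()))
# {'p1': 0.7, 'p2': 0.8, 'p3': 0.6}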
Example #10
def plot_optimizing_window(start_time, duration, fname="", xlim=None,
        maxdur=90000*15, incr=30000, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b],
    :param maxdur: the outer bound of the duration range to be covered
    :param incr: the number of seconds to increment the duration by in each loop
    :param **kwas: keyword arguments for vv.get_svl()

    makes line plot varying the duration (x axis) vs the closeness to one's self
    from a different point in time (e.g., for a 10 second duration, self A would
    be time 0-9, and self B would be time 10-19)
    '''

    allvals = list()
    allbars = list()
    allx = list()
    dur = duration
    kwas['return_ccache'] = False
    while dur < maxdur:
        print "getting svls..."
        kwas['duration'] = dur
        kwas['start_time'] = start_time
        svl, __, __ = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        svl1 = dict()
        for sv in svl:
            svl1[sv.id] = sv
        kwas['start_time'] = start_time+dur
        svl, __, __ = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        svl2 = dict()
        for sv in svl:
            svl2[sv.id] = sv

        print "calculating closeness for subnets...", dur
        vals = list()
        for pid in svl1:
            if pid in svl2:
                vals.append(vv.closeness(svl1[pid], svl2[pid]))

        allvals.append(np.mean(vals))
        allbars.append(np.std(vals))
        allx.append(float(dur)/(60.0*60.0*8.0))
        dur += incr



    fig, ax = plt.subplots(1, 1)
    ax.errorbar(allx, allvals, yerr=allbars)
    ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim)
    plt.xlabel("# 8 hour cycles in block duration")
    plt.ylabel("average self closeness")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"avg_self_closeness"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    outstr = df.overwrite(plotsdir+fname+'_avg_self_closeness.csv',
            df.list2col(allvals))
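The sweep above grows the window duration and, for each duration, compares every probe against itself across two back-to-back windows, then plots the mean and standard deviation; a skeletal, self-contained version with stand-in window and closeness functions (all names here are illustrative):

import numpy as np

def sweep_self_closeness(windows, closeness, duration, maxdur, incr):
    # windows(start, dur) -> {probe_id: answer set seen in [start, start+dur)}
    xs, means, stds = [], [], []
    dur = duration
    while dur < maxdur:
        w1, w2 = windows(0, dur), windows(dur, dur)
        vals = [closeness(w1[p], w2[p]) for p in w1 if p in w2]
        xs.append(dur / (60.0 * 60.0 * 8.0))  # duration in 8-hour cycles
        means.append(np.mean(vals))
        stds.append(np.std(vals))
        dur += incr
    return xs, means, stds

# toy stand-ins: Jaccard closeness over per-probe answer sets
jaccard = lambda a, b: len(a & b) / float(len(a | b))
windows = lambda start, dur: {"p1": {"a", "b"}, "p2": {"c"}}
print(sweep_self_closeness(windows, jaccard, 28800, 86400, 28800))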
Example #11
def plot_fmeasure(start_time, method="complete", fname="", Zf=False, rmask=16,
        xlim=None, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should include
    :param method: the linkage method to be used
    :param fname: string to be appended to end of plot file name
    :param rmask: mask for resolver IPs
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b],
    :param **kwas: keyword arguments for vv.get_svl()

    scatter plot:
        x -> max distance threshold
        y -> f-measure
        lineplot -> # components (y)

    other output:
        list of desc. that shared optimal components
    '''

    Z, svl = get_zx(start_time, method, fname, Zf, **kwas)
    dsvl = dict()
    dsvl['country'] = vv.country_svl(svl)
    dsvl['asn'] = vv.asn_svl(svl)
    dsvl['subnet'] = vv.subnet_svl(svl)
    dsvl['prefix'] = vv.prefix_svl(svl)
    dsvl['ldns'] = vv.ldns_svl(svl, rmask, False)
    fmtmask = ipp.make_v4_prefix_mask(rmask)
    to_remove = [
            '208.67.222.123',   # OpenDNS
            '208.67.220.123',
            '8.8.8.8',          # Google Public DNS
            '8.8.4.4',
            '64.6.64.6',        # Verisign
            '64.6.65.6']
    # remove massive public DNS providers
    for ip in to_remove:
        tmp = ipp.ip2int(ip) & fmtmask
        if tmp in dsvl['ldns']:
            del dsvl['ldns'][tmp]
    vals = defaultdict(list)
    grouping = defaultdict(list)
    count = list()
    data = defaultdict(lambda: defaultdict(list))
    for max_dist in np.arange(0, 1.01, .01):
        labels = fcluster(Z, max_dist, criterion='distance')
        clusters = [(y, [svl[z] for z, c in enumerate(labels) if c == y]) \
                for y in set(labels)]
        if len(clusters) > 1 and len(clusters) < len(svl):
            count.append((max_dist, len(clusters)))
            for c, blob in clusters:
                for desc in dsvl:
                    cluster = [getattr(sv,"get_"+desc)() for sv in blob]
                    for d in set(cluster):
                        localcount = float(len([z for z in cluster if z == d]))
                        localsize = float(len(cluster))
                        globalsize = float(len(dsvl[desc][d]))
                        precision = localcount / localsize
                        recall = localcount / globalsize
                        fmeasure = (2*precision*recall)/(precision+recall)
                        data[desc][d].append((fmeasure, c, max_dist))
    for desc in data:
        for d in data[desc]:
            maxf, maxc, maxd = max(data[desc][d], key=lambda z: z[0])
            vals[desc].append((maxf, maxd))
            grouping[(desc, maxc, maxd)].append((d, maxf))

    print "plotting..."
    fig, ax = plt.subplots(1, 1)

    vals['resolver'] = vals.pop('ldns')
    colors = iter(cm.rainbow(np.linspace(0, 1, len(vals))))
    for desc in vals:
        y, x = zip(*vals[desc])
        heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
        plt.clf()
        plt.imshow(heatmap.T, extent=extent, origin='lower')
        cb = plt.colorbar()
        cb.set_label('# of '+make_plural(desc))
        plt.xlabel("max distance threshold")
        plt.ylabel("F-measure")
        ax.grid(b=True, which='both', color='b', linestyle='-')
        filename = plotsdir+"fmeasure_"+desc+fname
        fig.savefig(filename+'.png', bbox_inches='tight')
        fig.savefig(filename+'.pdf', bbox_inches='tight')
        plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    x, y = zip(*count)
    ax.plot(x, y)
    plt.xlabel("max distance threshold")
    ax.set_ylabel('# components')
    ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim)
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"component_count"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    df.overwrite(plotsdir+'fmeasure_vals'+fname+'.csv',
        df.list2col(vals))

    groups = [(k, grouping[k]) for k in grouping if len(grouping[k]) > 1]
    if len(groups) > 0:
        groups = sorted(groups, key=lambda z: str(z[0]))
        df.overwrite(plotsdir+"fmeasure_groups"+fname+".csv", df.list2col(groups))
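The per-cluster score above is the standard F-measure: precision is the fraction of the cluster carrying descriptor value d, recall is the fraction of all probes with value d that landed in the cluster, and F = 2PR/(P+R). A worked sketch:

def fmeasure(cluster, global_count, d):
    # cluster: descriptor values of the probes in one cluster
    # global_count: number of probes with value d across the whole dataset
    local = float(len([z for z in cluster if z == d]))
    precision = local / len(cluster)
    recall = local / global_count
    return (2 * precision * recall) / (precision + recall)

# 3 of 4 probes in the cluster are in AS65001, out of 6 such probes overall
print(fmeasure(["AS65001"] * 3 + ["AS65002"], global_count=6, d="AS65001"))
# precision 0.75, recall 0.5 -> F = 0.6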
Example #12
qfs = qfb.get_filter()

print qfs, "\n"

msms = jcr.get_measurements(qfs)

print "priming..."
msms = jcr.prime_measurements(msms, 'domain')

rfb = fb.result_filter('dns')
rfb.set_min_probes(1000)
rfb.set_time_window(60 * 60 * 24, 2017, 7, 21)
rfb.manual_set(use_probe_resolver=True)
rfs = rfb.get_filter()

print rfs
print "filtering..."
msms = jcr.filter_measurements(msms, rfs)
print 'msms len', len(msms)

g = None
doms = set()
for m in msms:
    if m['domain'] is not None:
        if 'ripe' not in m['domain']:
            doms.add((m['domain'], m['id']))

df.overwrite('doms.csv', df.list2col(sorted(doms, key=lambda z: z[0])))
df.pickleout('msms.pickle', msms)