def nearby_probes_diff_ldns(svl, rmask=16):
    '''
    :param svl: list of client (sv) objects
    :param rmask: prefix mask length used to collapse resolvers that sit in
        the same subnet (e.g., 8.8.8.8 and 8.8.4.4 -> 8.8.x.x)
    :return: list of client groups; each group shares ASN, /16 subnet,
        country, and owner, but its members sit behind collapsed resolver
        subnets that each cover more than one probe

    Also writes a human-readable version of the groups to nearbies.csv.
    '''
    print("reducing svl to only probes with public ldns")
    svl = [sv for sv in svl if ipp.is_public(sv.get_ldns())]
    print("getting asn descriptor list...")
    asvl = vv.asn_svl(svl)
    # the mask depends only on rmask; build it once instead of once per
    # owner group inside the triple-nested loop below
    fmtmask = ipp.make_v4_prefix_mask(rmask)
    nearbies = list()
    strnearbies = list()
    for asn in asvl:
        if asn is None:
            continue
        ssvl = vv.subnet_svl(asvl[asn], 16)
        for subnet in ssvl:
            if subnet is None:
                continue
            if len(ssvl[subnet]) > 1:
                csvl = vv.country_svl(ssvl[subnet])
                for country in csvl:
                    if country is None:
                        continue
                    if len(csvl[country]) > 1:
                        osvl = vv.owner_svl(csvl[country])
                        for owner in osvl:
                            if owner is None:
                                continue
                            if len(osvl[owner]) < 2:
                                continue
                            resolvers = [z.get_ldns() for z in osvl[owner]]
                            # collapse redundant resolvers in the same
                            # subnet (e.g., 8.8.8.8 and 8.8.4.4 -> 8.8.x.x)
                            r2 = defaultdict(list)
                            for r in resolvers:
                                r2[r & fmtmask].append(r)
                            # keep resolvers whose collapsed subnet has more
                            # than one probe behind it
                            keep = list()
                            for z in r2:
                                if len(r2[z]) > 1:
                                    keep += r2[z]
                            if len(r2) > 1:
                                print("has different resolvers!!!")
                            if len(keep) > 1:  # if there's stuff to compare
                                print("found some!!")
                                print("asn: " + str(asn))
                                # NOTE: use a fresh name rather than
                                # rebinding the svl parameter
                                group = [sv for sv in osvl[owner]
                                         if sv.get_ldns() in keep]
                                nearbies.append(group)
                                strnearbies.append(
                                    ["|~" + ipp.int2ip(z.get_ldns()) +
                                     "__" + ipp.int2ip(z.get_ip()) + "~|"
                                     for z in group])
    df.overwrite(plotsdir + "nearbies.csv", df.list2col(strnearbies))
    logger.warning("nearbies: " + str(len(nearbies)))
    return nearbies
def get_ansset_sizes(start_time, fname="", **kwas):
    '''
    :param start_time: int indicating the earliest query the window
        should include
    :param fname: string to be appended to the output file name
    :param **kwas: keyword arguments for vv.get_svl()

    Writes "big_ansset<fname>.csv": one (domain, answer-set size) row per
    domain, sorted by answer-set size ascending.

    (The previous docstring was copied from get_domain_matrix and described
    a matrix return value this function never produced.)
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svl, fmt, anssets = vv.get_svl(**kwas)
    sizes = sorted([(dom, len(anssets[dom])) for dom in anssets],
                   key=lambda p: p[1])
    df.overwrite(plotsdir + "big_ansset" + fname + ".csv",
                 df.list2col(sizes))
def plot_examine_diff_diff(start_time, fname="", loops=2, gap=0, thresh=10,
                           **kwas):
    '''
    :param start_time: int indicating the earliest query the window
        should include
    :param fname: string to be appended to end of plot file name
    :param loops: the number of iterations (datasets)
    :param gap: the gap (in seconds) between each iteration's dataset
    :param thresh: answer-space size below which a domain is folded into
        the aggregate 'small' line instead of getting its own line
    :param **kwas: keyword arguments for vv.get_svl()

    lines:
        1) domain independent cdf of ALL matches
        2-n) cdf of matches for domain with answer space > thresh
        n-m) cdf of matches for ALL domains with answer space < thresh
    '''
    # NOTE(review): start_time is passed positionally to arrange_self_data
    # AND stored in kwas; confirm arrange_self_data does not forward
    # kwas['start_time'] to get_svl a second time
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svld, allsvl, allfmt, anssets = mv.arrange_self_data(start_time, gap,
                                                         loops, **kwas)
    sm = mv.examine_diff_diff(svld)
    # vals[0] aggregates every domain; vals[1] aggregates small domains;
    # each large domain gets its own appended list (kept parallel to labels)
    vals = [[], []]
    labels = ['all', 'small']
    for dom in sm:
        vals[0] = vals[0] + sm[dom]
        if len(anssets[dom]) < thresh:
            vals[1] += sm[dom]
        else:
            vals.append(sm[dom])
            labels.append(dom)
    fig, ax = plt.subplots(1, 1)
    for i in xrange(0, len(vals)):
        print "*****************"+labels[i]+"*********************"
        print vals[i]
        ecdf = ECDF(vals[i])
        x = list(ecdf.x)
        y = list(ecdf.y)
        ax.plot(x, y, label=labels[i])
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("diff mask match by domain")
    plt.ylabel("CDF of clients")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"diff_mask"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,),
                bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,),
                bbox_inches='tight')
    plt.close(fig)

    # dump the raw per-line values backing the plot
    print "saving data..."
    for i in xrange(0, len(vals)):
        outstr = df.overwrite(plotsdir+labels[i]+'_diff_jaccard.csv',
                              df.list2col(vals[i]))
def get_domain_matrix(start_time, fname="", **kwas): ''' :param start_time: int indicating the earliest query the window should include :param **kwas: keyword arguments for vv.get_svl() :return: (m) matrix of client pairs vs domains, (fmt) list of domains other outputs: -> csv with pairs vs domains matrix (m) -> csv with list of domain pair correlations (corrs) -> csv with list of mean Jaccard for each domain (means) ''' kwas['start_time'] = start_time kwas['return_ccache'] = False svl, fmt, anssets = vv.get_svl(**kwas) print "svl len", len(svl) combs = fact(len(svl)) / (fact(2) * fact(len(svl) - 2)) m = np.zeros((combs, len(fmt))) p = 0 for i in xrange(0, len(svl) - 1): a = svl[i] logger.warning(str(i) + ", " + str(a.get_id())) aset = dict() for dom in a: aset[dom] = set(a[dom]) for j in xrange(i + 1, len(svl)): b = svl[j] for k in xrange(0, len(fmt)): dom = fmt[k] domtotal = sum([a[dom][z] for z in a[dom]]) + sum( [b[dom][z] for z in b[dom]]) overlap = aset[dom].intersection(b[dom]) weight = 0 for z in overlap: weight += (a[dom][z] + b[dom][z]) m[p, k] = weight / domtotal p += 1 df.overwrite(plotsdir + "dommatrix" + fname + ".csv", df.list2line(fmt) + "\n") df.append(plotsdir + "dommatrix" + fname + ".csv", df.list2col(m)) C = np.corrcoef(m, rowvar=False) corrs = list() for i in xrange(0, len(fmt) - 1): for j in xrange(i + 1, len(fmt)): corrs.append((fmt[i] + "_" + fmt[j], C[i, j])) corrs = sorted([y for y in corrs if not math.isnan(y[1])], key=lambda z: z[1]) means = sorted(zip(fmt, np.mean(m, axis=0)), key=lambda z: z[1]) df.overwrite(plotsdir + "domcorr" + fname + ".csv", df.list2col(corrs)) df.overwrite(plotsdir + "dommean" + fname + ".csv", df.list2col(means)) meand = dict(means) # get mean jaccard vs # IPs seen mj_ni = [(meand[dom], len(anssets[dom])) for dom in meand] d_mj_ni = sorted([(dom, meand[dom], len(anssets[dom])) for dom in meand], key=lambda z: z[1]) df.overwrite(plotsdir + "jaccard_vs_ipspace" + fname + ".csv", df.list2col(d_mj_ni)) fig, ax = 
plt.subplots(1, 1) colors = iter(cm.rainbow(np.linspace(0, 1, len(mj_ni)))) for x, y in mj_ni: ax.scatter(x, y, color=next(colors)) plt.xlabel("mean jaccard") plt.ylabel("# IPs observed") ax.grid(b=True, which='major', color='b', linestyle='-') ps.set_dim(fig, ax, ylog=True) filename = plotsdir + "jaccard_vs_ipspace" + fname fig.savefig(filename + '.png', bbox_inches='tight') fig.savefig(filename + '.pdf', bbox_inches='tight') plt.show() plt.close(fig) return m, fmt
def closest_diff_desc(start_time, fname="", xlim=[.6, 1.0], **kwas):
    '''
    :param start_time: int indicating the earliest query the window
        should include
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits (NOTE(review): unused in this function)
    :return: dict mapping probe id -> sv object for the probes in the window
        (NOTE(review): the original docstring claimed descriptor pair
        dictionaries were returned; the closeness data is actually written
        to csv files, and only the {id: sv} map is returned)

    gets pairwise closeness of probes with different descriptors to find odd
    behavior (probes in different descriptors with high closeness scores)

    NOTE: writes data to files for convenience
    NOTE: accepts vv.get_svl keyword params
    '''
    print("getting svl...")
    kwas['start_time'] = start_time
    svl, __, __, ccache = vv.get_svl(**kwas)
    logger.warning("svl len: " + str(len(svl)))
    print("getting descriptor lists...")
    csvl = vv.country_svl(svl)   # NOTE(review): computed but never used below
    asvl = vv.asn_svl(svl)
    ssvl = vv.subnet_svl(svl)    # NOTE(review): computed but never used below
    psvl = vv.prefix_svl(svl)

    # {idA_asnB: [(closeness, dist)]}
    idc = defaultdict(list)
    # {idA_idB: (closeness, dist)}
    iic = dict()
    # {asnA_asnB: [(closeness, dist)]}
    ddc = defaultdict(list)
    print("\n\ncalculating closeness for ASNs...")
    # only ASNs with more than one probe can contribute cross-ASN pairs
    asns = [c for c in asvl if len(asvl[c]) > 1]
    for i in xrange(0, len(asns) - 1):
        print(asns[i], end=", ")
        sys.stdout.flush()
        for a in asvl[asns[i]]:
            for j in xrange(i + 1, len(asns)):
                for b in asvl[asns[j]]:
                    closeness = ccache[a][b]
                    ad = str(a.get_asn())
                    bd = str(b.get_asn())
                    aid = str(a.get_id())
                    bid = str(b.get_id())
                    dist = em.latlong_distance_km(a.get_coordinates(),
                                                  b.get_coordinates())
                    # NOTE(review): the geographic distance is immediately
                    # replaced by distance(closeness, dist) -- presumably a
                    # combined closeness/geo score; confirm the semantics
                    # of distance()
                    dist = distance(closeness, dist)
                    idc[aid + "_" + bd].append((closeness, dist))
                    idc[bid + "_" + ad].append((closeness, dist))
                    iic["_".join(sorted([aid, bid]))] = (closeness, dist)
                    ddc["_".join(sorted([ad, bd]))].append((closeness, dist))
    ccache.dump()
    # sort by mean combined score (descending), then drop that score column
    # before writing each csv
    idac = sorted([(k, np.mean([q[0] for q in idc[k]]),
                    np.mean([q[1] for q in idc[k]])) for k in idc],
                  key=lambda z: z[2], reverse=True)
    idac = [(z[0], z[1]) for z in idac]
    filename = plotsdir + "asn_idac" + fname + ".csv"
    df.overwrite(filename, df.list2col(idac))
    ddac = sorted([(k, np.mean([q[0] for q in ddc[k]]),
                    np.mean([q[1] for q in ddc[k]])) for k in ddc],
                  key=lambda z: z[2], reverse=True)
    ddac = [(z[0], z[1]) for z in ddac]
    filename = plotsdir + "asn_ddac" + fname + ".csv"
    df.overwrite(filename, df.list2col(ddac))
    iic = sorted([(k, iic[k][0], iic[k][1]) for k in iic], reverse=True,
                 key=lambda z: z[2])
    iic = [(z[0], z[1]) for z in iic]
    filename = plotsdir + "asn_iic" + fname + ".csv"
    df.overwrite(filename, df.list2col(iic))

    # same computation again, grouped by BGP prefix instead of ASN
    # {idA_prefixB: [(closeness, dist)]}
    idc = defaultdict(list)
    # {idA_idB: (closeness, dist)}
    iic = dict()
    # {prefixA_prefixB: [(closeness, dist)]}
    ddc = defaultdict(list)
    print("\n\ncalculating closeness for prefixes...")
    prefixes = [c for c in psvl if len(psvl[c]) > 1]
    for i in xrange(0, len(prefixes) - 1):
        print(prefixes[i], end=", ")
        sys.stdout.flush()
        for a in psvl[prefixes[i]]:
            for j in xrange(i + 1, len(prefixes)):
                for b in psvl[prefixes[j]]:
                    closeness = ccache[a][b]
                    ad = str(a.get_prefix())
                    bd = str(b.get_prefix())
                    aid = str(a.get_id())
                    bid = str(b.get_id())
                    dist = em.latlong_distance_km(a.get_coordinates(),
                                                  b.get_coordinates())
                    dist = distance(closeness, dist)
                    idc[aid + "_" + bd].append((closeness, dist))
                    idc[bid + "_" + ad].append((closeness, dist))
                    iic["_".join(sorted([aid, bid]))] = (closeness, dist)
                    ddc["_".join(sorted([ad, bd]))].append((closeness, dist))
    ccache.dump()
    idac = sorted([(k, np.mean([q[0] for q in idc[k]]),
                    np.mean([q[1] for q in idc[k]])) for k in idc],
                  key=lambda z: z[2], reverse=True)
    idac = [(z[0], z[1]) for z in idac]
    filename = plotsdir + "prefix_idac" + fname + ".csv"
    df.overwrite(filename, df.list2col(idac))
    ddac = sorted([(k, np.mean([q[0] for q in ddc[k]]),
                    np.mean([q[1] for q in ddc[k]])) for k in ddc],
                  key=lambda z: z[2], reverse=True)
    ddac = [(z[0], z[1]) for z in ddac]
    filename = plotsdir + "prefix_ddac" + fname + ".csv"
    df.overwrite(filename, df.list2col(ddac))
    iic = sorted([(k, iic[k][0], iic[k][1]) for k in iic], reverse=True,
                 key=lambda z: z[2])
    iic = [(z[0], z[1]) for z in iic]
    filename = plotsdir + "prefix_iic" + fname + ".csv"
    df.overwrite(filename, df.list2col(iic))

    # return a lookup table so callers can map the ids in the csvs back to
    # sv objects
    svd = dict()
    for sv in svl:
        svd[sv.get_id()] = sv
    return svd
def plot_measure_expansion(start_time, fname="", loops=31, gap=0, thresh=10, **kwas): ''' :param start_time: int indicating the earliest query the window should include :param gap: the gap (in seconds) between each iteration's dataset :param loops: the number of iterations (datasets) :param **kwas: keyword arguments for vv.get_svl() line plot: x -> n (such that it represents the nth iteration) y -> # of new IPs observed by a client on nth iteration line -> each line corresponds to one domain ''' svld, allsvl, allfmt, anssets = mv.arrange_self_data(start_time, gap, loops, **kwas) keys = svld.keys() counts = mv.measure_expansion(svld) domvals = defaultdict(lambda: defaultdict(list)) allvals = defaultdict(list) smallvals = defaultdict(list) for c in counts: for dom in c: for i, val in enumerate(c[dom]['ratio']): allvals[i].append(val) if len(anssets[dom]) < thresh: smallvals[i].append(val) else: domvals[dom][i].append(val) labels = ['all', 'small'] + domvals.keys() vals = list() for i in labels: vals.append([]) for i in sorted(allvals.keys()): vals[0].append(np.mean(allvals[i])) vals[1].append(np.mean(smallvals[i])) for j, dom in enumerate(labels[2:]): vals[j+2].append(np.mean(domvals[dom][i])) fig, ax = plt.subplots(1, 1) marker = ps.get_markers() style = ps.get_styles() for i in xrange(0, len(vals)): ax.plot(vals[i], label=labels[i], fillstyle='full', marker=next(marker), markerfacecolor='white', markevery=6, linestyle=next(style)) ps.set_dim(fig, ax) plt.xlabel("cycle #") plt.ylabel("# new IPs / ans. size") ax.grid(b=True, which='major', color='b', linestyle='-') lgd = ps.legend_setup(ax, 3, "top right", True) filename = plotsdir+"expansion"+fname fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight') fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight') plt.close(fig) print "saving data..." for i in xrange(0, len(vals)): outstr = df.overwrite(plotsdir+labels[i]+'newvssize.csv', df.list2col(vals[i]))
def plot_self_match(start_time, duration, fname="", loops=7, gap=0, thresh=10,
                    **kwas):
    '''
    :param start_time: int indicating the earliest query the window
        should include
    :param duration: seconds covered by each half of a comparison window
    :param gap: the gap (in seconds) between each iteration's dataset
    :param loops: the number of iterations (datasets) / 2 (since we need
        2 per comparison in this case)
    :param thresh: NOTE(review): accepted but unused in this function
    :param **kwas: keyword arguments for vv.get_svl()

    plots CDF:
        x -> closeness to self for back-to-back iteration windows
             (e.g., days 1-2 vs days 3-4, days 5-6 vs days 7-8, ...)
        y -> CDF of clients

    NOTE: plot 3.2
    '''
    valsd = defaultdict(list)  # {pid: [vals]}
    bigsvld = dict()           # {pid: sv pair from the last window seen}
    kwas['duration'] = duration
    kwas['return_ccache'] = False
    for i in xrange(0, loops):
        # NOTE(review): this progress print ignores gap, while tmp_start
        # below includes it -- the printed window start can disagree with
        # the actual one when gap != 0
        print (start_time+2*duration*i, duration)
        tmp_start = start_time+2*(gap+duration)*i
        svld, allsvl, allfmt, anssets = mv.arrange_self_data(tmp_start, gap,
                                                             2, **kwas)
        pids, vals = mv.self_match(svld)
        for pid, val in zip(pids, vals):
            valsd[pid].append(val)
            bigsvld[pid] = svld[pid]

    # average each client's self-closeness across all window pairs
    results = list()
    for pid in valsd:
        results.append((pid, np.mean(valsd[pid])))
    results = sorted(results, key=lambda z: z[1], reverse=True)

    fig, ax = plt.subplots(1, 1)
    ecdf = ECDF([z[1] for z in results])
    x = list(ecdf.x)
    y = list(ecdf.y)
    ax.plot(x, y, color='k')
    ax.axvline(np.median(x), color='r', linestyle='--')
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("closeness to self")
    plt.ylabel("CDF of clients")
    filename = plotsdir+"self_closeness"+fname
    fig.savefig(filename+'.png', bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_inches='tight')
    plt.close(fig)

    # break mean self-closeness down per descriptor value (country, subnet,
    # prefix, ASN, resolver) and dump one csv per descriptor
    labels = defaultdict(list)
    for pid, val in results:
        labels['countries'].append((val, bigsvld[pid][0].get_country()))
        labels['subnets'].append((val, bigsvld[pid][0].get_subnet()))
        labels['prefixes'].append((val, bigsvld[pid][0].get_prefix()))
        labels['asns'].append((val, bigsvld[pid][0].get_asn()))
        labels['resolvers'].append((val, bigsvld[pid][0].get_ldns()))
    for k in labels:
        # (descriptor value, mean closeness, # of clients with that value)
        data = sorted([(y, np.mean([z[0] for z in labels[k] if z[1] == y]),
                        len([z[0] for z in labels[k] if z[1] == y])) \
                for y in set([v[1] for v in labels[k]])], key=lambda x: x[1])
        df.overwrite(plotsdir+'self_closeness_'+k+fname+'.csv',
                     df.list2col(data))
    print "saving data..."
    outstr = df.overwrite(plotsdir+'self_closeness'+fname+'.csv',
                          df.list2col(results))
def plot_closeness_same_desc(start_time, duration, fname="", xlim=[.6, 1.0], rmask=16, loops=31, **kwas): ''' :param start_time: int indicating the earliest query the window should include :param fname: string to be appended to end of plot file name :param xlim: x axis limits for plot. Accepts formats: None, [a, b], :param rmask: mask for resolver IPs :param **kwas: keyword arguments for vv.get_svl() for each descriptor (ASN, country, registered prefix, /24 subnet), plot the CDF of the pairwise closeness of clients, such that the clients in a pair come from the same groups in the descriptor (e.g., same country for the country descriptor) NOTE: plot 4.1 ''' lvals = list() cvals = list() avals = list() svals = list() pvals = list() kwas['duration'] = duration for l in xrange(0, loops): print "getting svl..." kwas['start_time'] = start_time+duration*l svl, fmt, __, ccache = vv.get_svl(**kwas) logger.warning("svl len: "+str(len(svl))) print "getting descriptor lists..." csvl = vv.country_svl(svl) asvl = vv.asn_svl(svl) ssvl = vv.subnet_svl(svl) #osvl = vv.owner_svl(svl) psvl = vv.prefix_svl(svl) lsvl = vv.ldns_svl(svl, rmask, False) fmtmask = ipp.make_v4_prefix_mask(rmask) to_remove = [ '208.67.222.123', # OpenDNS '208.67.220.123', '8.8.8.8', # Google Public DNS '8.8.4.4', '64.6.64.6', # Verisign '64.6.65.6'] # remove massive public DNS providers for ip in to_remove: tmp = ipp.ip2int(ip) & fmtmask if tmp in lsvl: del lsvl[tmp] print "calculating closeness for resolvers..." resolvers = lsvl.keys() for k in resolvers: ksvl = lsvl[k] for a in xrange(0, len(ksvl)-1): for b in xrange(a+1, len(ksvl)): lvals.append(ccache[ksvl[a]][ksvl[b]]) print "calculating closeness for countries..." countries = csvl.keys() for k in countries: ksvl = csvl[k] for a in xrange(0, len(ksvl)-1): for b in xrange(a+1, len(ksvl)): cvals.append(ccache[ksvl[a]][ksvl[b]]) print "calculating closeness for ASNs..." 
asns = asvl.keys() for k in asns: ksvl = asvl[k] for a in xrange(0, len(ksvl)-1): for b in xrange(a+1, len(ksvl)): avals.append(ccache[ksvl[a]][ksvl[b]]) print "calculating closeness for subnets..." subnets = ssvl.keys() for k in subnets: ksvl = ssvl[k] for a in xrange(0, len(ksvl)-1): for b in xrange(a+1, len(ksvl)): svals.append(ccache[ksvl[a]][ksvl[b]]) ''' print "calculating closeness for owners..." ovals = list() owners = osvl.keys() for k in owners: ksvl = osvl[k] for a in xrange(0, len(ksvl)-1): for b in xrange(a+1, len(ksvl)): ovals.append(ccache[ksvl[a]][ksvl[b]]) ''' print "calculating closeness for prefixes..." prefixes = psvl.keys() for k in prefixes: ksvl = psvl[k] for a in xrange(0, len(ksvl)-1): for b in xrange(a+1, len(ksvl)): pvals.append(ccache[ksvl[a]][ksvl[b]]) print "plotting..." #vals = [cvals, avals, svals, ovals, pvals] #labels = ['country', 'ASN', 'subnet', 'owner', 'prefix'] vals = [cvals, avals, svals, pvals, lvals] labels = ['country', 'ASN', 'subnet', 'prefix', 'resolver'] fig, ax = plt.subplots(1, 1) for i in xrange(0, len(vals)): print type(vals[i][0]) print labels[i], "\n" print len(vals[i]) ecdf = ECDF(np.array(vals[i])) x = list(ecdf.x) y = list(ecdf.y) ax.plot(x, y, label=labels[i]) ps.set_dim(fig, ax, xdim=13, ydim=7.5) plt.xlabel("pairwise probe closeness") plt.ylabel("CDF of pairs") lgd = ps.legend_setup(ax, 4, "top center", True) filename = plotsdir+"closeness_same_desc"+fname fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight') fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight') plt.close(fig) print "saving data..." for i in xrange(0, len(vals)): outstr = df.overwrite(plotsdir+labels[i]+'_same.csv', df.list2col(vals[i])) ccache.dump()
def plot_closeness(start_time, duration, fname="", xlim=[.6, 1.0], loops=15, **kwas): ''' :param start_time: int indicating the earliest query the window should include :param fname: string to be appended to end of plot file name :param xlim: x axis limits for plot. Accepts formats: None, [a, b], :param loops: number of time blocks :param **kwas: keyword arguments for vv.get_svl() plots: 1) CDF for pairwise closeness of each pair 2) CDF for the average pairwise closeness experienced by each probe across all other probes NOTE: plot 3.1 ''' means = defaultdict(list) vals = list() kwas['duration'] = duration for l in xrange(0, loops): print "getting svl..." kwas['start_time'] = start_time+duration*l svl, __, __, ccache = vv.get_svl(**kwas) logger.warning("svl len: "+str(len(svl))) print len(svl) print "calculating closeness for resolvers..." for i in xrange(0, len(svl)-1): for j in xrange(i + 1, len(svl)): vals.append(ccache[svl[i]][svl[j]]) means[svl[i].get_id()].append(vals[-1]) means[svl[j].get_id()].append(vals[-1]) ccache.dump() del ccache, svl, __ gc.collect() print "plotting..." fig, ax = plt.subplots(1, 1) ecdf = ECDF(vals) x = list(ecdf.x) y = list(ecdf.y) ax.plot(x, y, label="pairwise") ecdf = ECDF([np.mean(means[z]) for z in means]) x = list(ecdf.x) y = list(ecdf.y) ax.plot(x, y, label="average (per client)") ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim) plt.xlabel("pairwise probe closeness") plt.ylabel("CDF of pairs") lgd = ps.legend_setup(ax, 4, "top center", True) filename = plotsdir+"overall_closeness"+fname fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight') fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight') plt.close(fig) print "saving data..." df.overwrite(plotsdir+'overall_closeness'+fname+'.csv', df.list2col(vals)) df.overwrite(plotsdir+'overall_avg_closeness'+fname+'.csv', df.list2col([(z, np.mean(means[z])) for z in means]))
def plot_optimizing_window(start_time, duration, fname="", xlim=None, maxdur=90000*15, incr=30000, **kwas): ''' :param start_time: int indicating the earliest query the window should include :param fname: string to be appended to end of plot file name :param xlim: x axis limits for plot. Accepts formats: None, [a, b], :param maxdur: the outer bound of the duration range to be covered :param incr: the number of seconds to increment the duration by in each loop :param **kwas: keyword arguments for vv.get_svl() makes line plot varying the duration (x axis) vs the closeness to one's self from a different point in time (e.g., for a 10 second duration, self A would be time 0-9, and self B would be time 10-19) ''' allvals = list() allbars = list() allx = list() dur = duration kwas['return_ccache'] = False while dur < maxdur: print "getting svls..." kwas['duration'] = dur kwas['start_time'] = start_time svl, __, __ = vv.get_svl(**kwas) logger.warning("svl len: "+str(len(svl))) svl1 = dict() for sv in svl: svl1[sv.id] = sv kwas['start_time'] = start_time+dur svl, __, __ = vv.get_svl(**kwas) logger.warning("svl len: "+str(len(svl))) svl2 = dict() for sv in svl: svl2[sv.id] = sv print "calculating closeness for subnets...", dur vals = list() for pid in svl1: if pid in svl2: vals.append(vv.closeness(svl1[pid], svl2[pid])) allvals.append(np.mean(vals)) allbars.append(np.std(vals)) allx.append(float(dur)/(60.0*60.0*8.0)) dur += incr fig, ax = plt.subplots(1, 1) ax.errorbar(allx, allvals, yerr=allbars) ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim) plt.xlabel("# 8 hour cycles in block duration") plt.ylabel("average self closeness") lgd = ps.legend_setup(ax, 4, "top center", True) filename = plotsdir+"avg_self_closeness"+fname fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight') fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight') plt.close(fig) print "saving data..." 
outstr = df.overwrite(plotsdir+fname+'_avg_self_closeness.csv', df.list2col(allvals))
def plot_fmeasure(start_time, method="complete", fname="", Zf=False, **kwas): ''' :param start_time: int indicating the earliest query the window should include :param method: the linkage method to be used :param fname: string to be appended to end of plot file name :param **kwas: keyword arguments for vv.get_svl() scatter plot: x -> max distance threshold y -> f-measure lineplot -> # components (y) other output: list of desc. that shared optimal components ''' Z, svl = get_zx(start_time, method, fname, Zf, **kwas) dsvl = dict() dsvl['country'] = vv.country_svl(svl) dsvl['asn'] = vv.asn_svl(svl) dsvl['subnet'] = vv.subnet_svl(svl) dsvl['prefix'] = vv.prefix_svl(svl) dsvl['ldns'] = vv.ldns_svl(svl, rmask, False) fmtmask = ipp.make_v4_prefix_mask(rmask) to_remove = [ '208.67.222.123', # OpenDNS '208.67.220.123', '8.8.8.8', # Google Public DNS '8.8.4.4', '64.6.64.6', # Verisign '64.6.65.6'] # remove massive public DNS providers for ip in to_remove: tmp = ipp.ip2int(ip) & fmtmask if tmp in lsvl: del lsvl[tmp] vals = defaultdict(list) grouping = dict() count = list() for max_dist in np.arange(0, 1.01, .01): data = defaultdict(lambda: defaultdict(list)) labels = fcluster(Z, max_dist, criterion='distance') clusters = [[(c, svl[z]) for z, c in enumerate(labels) if c == y] \ for y in set(labels)] if len(clusters) > 1 and len(clusters) < len(svl): count.append(max_dist, len(clusters)) for c, blob in clusters: for desc in dsvl: cluster = [getattr(sv,"get_"+desc)() for sv in blob] for d in set(cluster): localcount = float(len([z for z in cluster if z == d])) localsize = float(len(cluster)) globalsize = float(len(dsvl[desc][d])) precision = localcount / localsize recall = localcount / globalsize fmeasure = (2*precision*recall)/(precision+recall) data[desc][d].append((fmeasure, c, max_dist)) for desc in data: for d in data[desc]: maxf, maxc, maxd = max(data[desc][d], key=lambda z: z[0]) vals[desc].append((maxf, maxd)) grouping[(desc, maxc, maxd)].append((d, maxf)) print 
"plotting..." fig, ax = plt.subplots(1, 1) vals['resolver'] = vals.pop('ldns') colors = iter(cm.rainbow(np.linspace(0, 1, len(vals)))) for desc in vals: y, x = zip(*vals[desc]) heatmap, xedges, yedges = np.histogram2d(x, y, bins=50) extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] plt.clf() plt.imshow(heatmap.T, extent=extent, origin='lower') cb = PLT.colorbar() cb.set_label('# of '+make_plural(desc)) plt.xlabel("max distance threshold") plt.ylabel("F-measure") ax.grid(b=True, which='both', color='b', linestyle='-') filename = plotsdir+"fmeasure_"+desc+fname fig.savefig(filename+'.png', bbox_inches='tight') fig.savefig(filename+'.pdf', bbox_inches='tight') plt.close(fig) fig, ax = plt.subplots(1, 1) x, y = zip(*count) ax.plot(x, y) plt.xlabel("max distance threshold") ax.set_ylabel('# components') ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim) lgd = ps.legend_setup(ax, 4, "top center", True) filename = plotsdir+"component_count"+fname fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight') fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight') plt.close(fig) print "saving data..." df.overwrite(plotsdir+'fmeasure_groups'+fname+'.csv', df.list2col(vals)) groups = [(z[0], groups[z]) for z in groups if len(groups[z]) > 1] if len(groups) > 0: groups = sorted(groups, key=lambda z: z[0]+str(z[1])+str(z[2])) df.overwrite(plotsdir+"fmeasure_groups"+fname+".csv", df.list2col(groups))
qfs = qfb.get_filter() print qfs, "\n" msms = jcr.get_measurements(qfs) print "priming..." msms = jcr.prime_measurements(msms, 'domain') rfb = fb.result_filter('dns') rfb.set_min_probes(1000) rfb.set_time_window(60 * 60 * 24, 2017, 7, 21) rfb.manual_set(use_probe_resolver=True) rfs = rfb.get_filter() print rfs print "filtering..." msms = jcr.filter_measurements(msms, rfs) print 'msms len', len(msms) g = None doms = set() for m in msms: if m['domain'] is not None: if 'ripe' not in m['domain']: doms.add((m['domain'], m['id'])) df.overwrite('doms.csv', df.list2col(sorted(doms, key=lambda z: z[0]))) df.pickleout('msms.pickle', msms)