import itertools
import json
import pickle as pkl
import sys
from collections import defaultdict
from multiprocessing import Pool, Process, Queue, RawArray

import matplotlib.pyplot as plt
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

import clusterAnalysis
import skyclusters
import skyresolvers
# aliases used by homogeneity_and_completeness() below
import skyclusters as SC
import clusterAnalysis as CA
# Pings provides get_flat_pings(); the exact module path is an assumption
from pings import Pings


def make_homogeneity_and_completeness(workers=2, **kwargs):
    '''Compute homogeneity and completeness for every (category, threshold) pair.'''
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(**kwargs)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    # pickle files must be opened in binary mode
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    global g_ca
    g_ca = clusterAnalysis.ClusterAnalysis(scb=g_scb)
    skyclusters.gc.collect()
    categories = ['country', 'asn', 'prefix', 'ip24']
    # persist the distinct label set for each category
    for category in categories:
        data = list(set(g_scb.nodes.probes_df[category]))
        with open(g_scb.fmt_path(
                'datadir/homogeneity_and_completeness/labelsets/'
                + category + '.json'), 'w') as f:
            json.dump(data, f)
    thresholds = np.arange(0.05, 1.0, 0.05)
    itr = itertools.product(categories, thresholds)
    global q
    q = Queue()
    pool = Pool(workers)
    # a single writer process drains the queue so pool workers never block on I/O
    dumper = Process(target=cycle_worker, args=(q,))
    dumper.start()
    for res in pool.imap_unordered(get_homogeneity_and_completeness, itr):
        q.put((dump_homogeneity_and_completeness, res))
    q.put(('exit', True))
    dumper.join()
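
# --- Hypothetical helper sketches (assumptions, not part of this module) ---
# make_homogeneity_and_completeness() relies on cycle_worker(),
# get_homogeneity_and_completeness(), and dump_homogeneity_and_completeness(),
# which are defined elsewhere in the project. The sketches below use a
# "_sketch_" prefix to avoid shadowing the real helpers; they are
# reconstructed from the call sites above, and the payload shape and output
# path are guesses.

def _sketch_cycle_worker(q):
    # Drain (callback, payload) tuples until an ('exit', ...) sentinel
    # arrives; this matches how the dumper process is fed above.
    while True:
        func, args = q.get()
        if func == 'exit':
            break
        func(args)

def _sketch_get_homogeneity_and_completeness(item):
    # item is one (category, threshold) pair from itertools.product; the
    # ClusterAnalysis calls mirror homogeneity_and_completeness() below.
    category, threshold = item
    clusters = g_ca.get_clusters(threshold)
    data = g_ca.get_homogeneity_and_completeness(clusters, category)
    return {'category': category, 'threshold': threshold, 'data': data}

def _sketch_dump_homogeneity_and_completeness(res):
    # Append one JSON record per result; the path is an assumption.
    path = g_scb.fmt_path('datadir/homogeneity_and_completeness/results.json')
    with open(path, 'a') as f:
        f.write(json.dumps(res) + '\n')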
def homogeneity_and_completeness():
    # smoke test for a single (threshold, category) pair
    threshold = 0.4
    scb = SC.SkyClusterBuilder()
    with open(scb.fmt_path('datadir/matrix/matrix.json'), 'r') as f:
        scb.matrix = json.load(f)
    scb.reduce_matrix_to_sampled()
    ca = CA.ClusterAnalysis(scb=scb)
    clusters = ca.get_clusters(threshold)
    data = ca.get_homogeneity_and_completeness(clusters, 'asn')
    # record the threshold actually used (the original hard-coded 0.5 here,
    # which did not match the 0.4 passed to get_clusters)
    with open('test_homogeneity_and_completeness.json', 'a') as f:
        f.write(json.dumps([threshold, data]) + '\n')
def make_homogeneity_and_completeness_for_resolvers(workers=2, **kwargs):
    '''Same as make_homogeneity_and_completeness(), but labels probes by resolver.

    Two things happen here:
      1) the probe set is reduced to probes with at least one public resolver;
      2) each probe is labeled with the most "common" (across all probes)
         resolver it observed.
    '''
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(**kwargs)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    R = skyresolvers.Resolvers()
    invR = R.get_inverse()  # resolver -> set of probes that observed it
    keep = set()
    for resolver in invR:
        keep.update(invR[resolver])
    g_scb.nodes._probes_df = g_scb.nodes._probes_df.loc[
        g_scb.nodes._probes_df['probe'].isin(keep)]
    g_scb.reduce_matrix_to_sampled()
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    # walk resolvers from most to least common and claim any unlabeled probes
    keys = sorted(invR, key=lambda z: len(invR[z]))
    probes = dict()
    while keep:
        key = keys.pop()
        for probe in invR[key]:
            if probe in keep:
                probes[probe] = key
                keep.remove(probe)
    resolvers = [probes[z] for z in g_scb.nodes._probes_df.probe.to_list()]
    resolvers_set = list(set(resolvers))
    with open(g_scb.fmt_path(
            'datadir/homogeneity_and_completeness/labelsets/resolvers.json'),
            'w') as f:
        json.dump(resolvers_set, f)
    g_scb.nodes._probes_df = g_scb.nodes._probes_df.assign(resolvers=resolvers)
    global g_ca
    g_ca = clusterAnalysis.ClusterAnalysis(scb=g_scb)
    skyclusters.gc.collect()
    categories = ['resolvers']
    thresholds = np.arange(0.05, 1.0, 0.05)
    itr = itertools.product(categories, thresholds)
    global q
    q = Queue()
    pool = Pool(workers)
    dumper = Process(target=cycle_worker, args=(q,))
    dumper.start()
    for res in pool.imap_unordered(get_homogeneity_and_completeness, itr):
        q.put((dump_homogeneity_and_completeness, res))
    q.put(('exit', True))
    dumper.join()
def make_domain_error(**kwargs):
    scb = skyclusters.SkyClusterBuilder(**kwargs)
    scb.nodes.keep_only({'results'})
    with open(scb.fmt_path('datadir/matrix/matrix.json'), 'r') as f:
        tmpmatrix = json.load(f)
    # copy the matrix into a shared RawArray so worker processes can read it
    # without pickling the whole structure
    skyclusters.g_matrix = RawArray('d', len(tmpmatrix))
    for i, z in enumerate(tmpmatrix):
        skyclusters.g_matrix[i] = z
    scb.matrix = skyclusters.g_matrix
    del tmpmatrix
    skyclusters.gc.collect()
    scb.domain_error()
    scb.condense_domain_error()
    scb.plot_domain_error()
def make_pings_vs_cnre(workers=2, chunksize=500):
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(limit=500)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    skyclusters.gc.collect()
    pings = Pings()
    # shared, zero-initialized array of per-node mean ping times; 0.0 marks
    # nodes without data (RawArray takes an int size or a sequence, not the
    # itertools.repeat iterator the original passed)
    global g_means
    g_means = RawArray('d', len(g_scb.nodes))
    for i in range(len(g_scb.nodes)):
        node = g_scb.nodes[i]
        try:
            flat = pings.get_flat_pings(node.probe)
        except Exception:  # no ping data for this probe
            flat = None
        if flat is None or len(flat) == 0:
            continue
        g_means[i] = np.mean(flat)
    del pings
    pool = Pool(workers)
    global q
    q = Queue()
    dumper = Process(target=cycle_worker, args=(q,))
    dumper.start()
    data = list()
    for res in pool.imap_unordered(get_ping_vs_cnre,
                                   range(len(g_scb.nodes)), chunksize):
        data.append(res)
        # flush in batches of 10000 to keep memory bounded
        if len(data) >= 10000:
            q.put((dump_pings_vs_cnre, data))
            data = list()
    if data:
        q.put((dump_pings_vs_cnre, data))
    q.put(('exit', True))
    dumper.join()
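
# --- Hypothetical sketch (assumption): get_ping_vs_cnre() and
# dump_pings_vs_cnre() are defined elsewhere in the project. The dumper
# presumably follows the same JSON-lines batching pattern that
# plot_closeness_for_category() uses below; the output path is a guess.

def _sketch_dump_pings_vs_cnre(batch):
    # Append one batch of ping-vs-CNRE results as a single JSON line.
    with open(g_scb.fmt_path('datadir/pings_vs_cnre/data.json'), 'a') as f:
        f.write(json.dumps(batch) + '\n')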
def make_geo_vs_cnre(workers=2, chunksize=500):
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(limit=500)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    skyclusters.gc.collect()
    pool = Pool(workers)
    global q
    q = Queue()
    dumper = Process(target=cycle_worker, args=(q,))
    dumper.start()
    data = list()
    # one result per condensed-matrix entry (i.e. per probe pair)
    for res in pool.imap_unordered(get_geo_vs_cnre,
                                   range(len(g_scb.matrix)), chunksize):
        data.append(res)
        if len(data) >= 10000:
            q.put((dump_geo_vs_cnre, data))
            data = list()
    if data:
        q.put((dump_geo_vs_cnre, data))
    q.put(('exit', True))
    dumper.join()
def make_dendrogram():
    scb = skyclusters.SkyClusterBuilder(limit=500)
    with open(scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        scb.kwargs['counts'] = pkl.load(f)
    # truncate to the last 50 merged clusters to keep the plot readable
    scb.make_dendrogram(no_labels=True, truncate_mode='lastp', p=50)
def plot_closeness_for_category(**kwargs):
    global scb
    scb = skyclusters.SkyClusterBuilder(**kwargs)
    scb.load_matrix_from_file('datadir/matrix/matrix.json')
    global nodes
    nodes = scb.nodes.probes_df
    categories = ['asn', 'prefix', 'country', 'ip24']
    # flat list of (category, label) pairs across all categories
    labels = [(category, set(nodes[category].tolist()))
              for category in categories]
    labels = [(cat, lbl) for cat, lbls in labels for lbl in lbls]
    # map each (category, label) to the node indices carrying that label
    global g_inds
    g_inds = defaultdict(dict)
    for category in categories:
        lbls = set(nodes[category].to_list())
        for lbl in lbls:
            g_inds[category][lbl] = nodes.loc[nodes[category] == lbl].idx.to_list()
    sames = defaultdict(list)
    diffs = defaultdict(list)
    sizes = list()
    count = 0
    fname = scb.fmt_path('datadir/closeness_vs_category/data.json')
    with open(fname, 'w') as f:  # truncate any previous output
        f.write('')
    data = list()
    pool = Pool(2)
    # res fields (assumed meanings): 'c'=category, 'l'=label, 'sm'=same-label
    # median CNRE, 'df'=diff-label median, 'smn'=same-label mean,
    # 'sz'=group size, 'std'=standard deviation
    for res in pool.imap_unordered(get_same_diff_for_pair, labels, 100):
        try:
            sames[res['c']].append(res['sm'])
            # (label, mean, size, std); cast the std to a plain float so both
            # json.dump and scatter(c=...) accept it (the original str() broke
            # the color mapping below)
            sizes.append((res['l'], res['smn'], res['sz'], float(res['std'])))
        except KeyError:
            pass
        diffs[res['c']].append(res['df'])
        data.append(res)
        # log and flush a batch every 100 results
        if count % 100 == 0:
            print('\n' + str(res) + '\n')
            sys.stdout.flush()
            with open(fname, 'a') as f:
                f.write(json.dumps(data) + '\n')
            data = list()
        count += 1
    if data:
        with open(fname, 'a') as f:
            f.write(json.dumps(data) + '\n')
    # per-category CDFs of same-label vs different-label closeness
    for category in diffs:
        fig, ax = plt.subplots(1, 1, figsize=(4, 4))
        styles = itertools.cycle(['-', '--', '-.', ':'])
        ecdf = ECDF(diffs[category])
        lines = list()
        xd, yd = list(ecdf.x), list(ecdf.y)
        lines += ax.plot(xd, yd, next(styles), label='diff')
        ecdf = ECDF(sames[category])
        xs, ys = list(ecdf.x), list(ecdf.y)
        lines += ax.plot(xs, ys, next(styles), label='same')
        # mark the 95th percentile of the different-label distribution
        m = np.percentile(diffs[category], 95)
        ax.axvline(m, color='r', linewidth=0.7)
        ax.annotate('{:.2f}'.format(m), (m, 0.8), textcoords='offset points',
                    ha='center', fontsize=16, backgroundcolor='white')
        ax.legend(lines, [z.get_label() for z in lines])
        ax.set_xlim([0, 1.0])
        ax.set_ylim([0, 1.0])
        ax.set_xlabel('median cnre')
        ax.set_ylabel('CDF')
        fig.savefig(scb.fmt_path('plotsdir/closeness_vs_category/'
                                 + category + '.png'))
        plt.close(fig)
        with open(scb.fmt_path('datadir/closeness_vs_category/'
                               + category + '.json'), 'w') as f:
            json.dump({'same': (xs, ys), 'diff': (xd, yd), 'm': m}, f)
    # scatter of label-group size vs mean CNRE, colored by standard deviation
    fig3, ax3 = plt.subplots(1, 1)
    _, x, y, stds = zip(*sizes)
    scatter = ax3.scatter(x, y, c=stds, edgecolors='k')
    ax3.set_xlabel('mean cnre')
    ax3.set_ylabel('label group size')
    plt.colorbar(scatter, ticks=np.arange(0.0, 1.01, 0.2))
    fig3.savefig(scb.fmt_path('plotsdir/closeness_vs_category/size.png'))
    plt.close(fig3)
    with open(scb.fmt_path('datadir/closeness_vs_category/size.json'), 'w') as f:
        json.dump({'means': x, 'sizes': y, 'stds': stds}, f)
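
# --- Hypothetical usage sketch (assumption): how these entry points might be
# driven from the command line. The argument names and task labels below are
# invented for illustration; the module may be driven differently in practice.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='CNRE cluster analyses')
    parser.add_argument('task', choices=['hc', 'resolvers', 'domain-error',
                                         'pings', 'geo', 'dendrogram',
                                         'closeness'])
    parser.add_argument('--workers', type=int, default=2)
    args = parser.parse_args()
    if args.task == 'hc':
        make_homogeneity_and_completeness(workers=args.workers)
    elif args.task == 'resolvers':
        make_homogeneity_and_completeness_for_resolvers(workers=args.workers)
    elif args.task == 'domain-error':
        make_domain_error()
    elif args.task == 'pings':
        make_pings_vs_cnre(workers=args.workers)
    elif args.task == 'geo':
        make_geo_vs_cnre(workers=args.workers)
    elif args.task == 'dendrogram':
        make_dendrogram()
    else:
        plot_closeness_for_category()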