コード例 #1
0
def make_homogeneity_and_completeness(workers=2, **kwargs):
    """Compute homogeneity and completeness for every (category, threshold) pair.

    Builds module-global state (g_scb, g_ca, q) so that pool workers forked from
    this process can reach the cluster builder and analysis objects. Dumps each
    category's label set to JSON, then fans (category, threshold) pairs out to a
    worker pool; results are forwarded through a queue to a dumper process.

    Args:
        workers: number of Pool processes to run.
        **kwargs: forwarded to skyclusters.SkyClusterBuilder.
    """
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(**kwargs)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    # Pickle files are binary: must open with 'rb' (text mode breaks on Python 3).
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    global g_ca
    g_ca = clusterAnalysis.ClusterAnalysis(scb=g_scb)
    skyclusters.gc.collect()
    categories = ['country', 'asn', 'prefix', 'ip24']
    # Persist the distinct label set of each category for later analysis.
    for category in categories:
        data = list(set(g_scb.nodes.probes_df[category]))
        with open(
                g_scb.fmt_path(
                    'datadir/homogeneity_and_completeness/labelsets/' +
                    category + '.json'), 'w') as f:
            json.dump(data, f)
    thresholds = np.arange(0.05, 1.0, 0.05)
    itr = itertools.product(categories, thresholds)
    global q
    q = Queue()
    pool = Pool(workers)
    dumper = Process(target=cycle_worker, args=(q, ))
    dumper.start()
    for res in pool.imap_unordered(get_homogeneity_and_completeness, itr):
        q.put((dump_homogeneity_and_completeness, res))
    # Release pool workers before signalling the dumper to exit.
    pool.close()
    pool.join()
    q.put(('exit', True))
    dumper.join()
コード例 #2
0
def homogeneity_and_completeness():
    """Smoke-test homogeneity/completeness for ASN labels at one threshold.

    Loads the distance matrix, clusters at a fixed threshold, scores the
    clustering against the 'asn' labels, and appends [threshold, data] as a
    JSON line to a local results file.
    """
    # Single source of truth for the threshold: it was previously clustered at
    # 0.4 but logged as 0.5, mislabeling the recorded results.
    threshold = 0.4
    scb = SC.SkyClusterBuilder()
    with open(scb.fmt_path('datadir/matrix/matrix.json'), 'r') as f:
        scb.matrix = json.load(f)
    scb.reduce_matrix_to_sampled()
    ca = CA.ClusterAnalysis(scb=scb)
    clusters = ca.get_clusters(threshold)
    data = ca.get_homogeneity_and_completeness(clusters, 'asn')
    with open('test_homogeneity_and_completeness.json', 'a') as f:
        f.write(json.dumps([threshold, data])+'\n')
コード例 #3
0
def make_homogeneity_and_completeness_for_resolvers(workers=2, **kwargs):
    """Compute homogeneity and completeness using resolver labels.

    Sets up module-global state (g_scb, g_ca, q) for forked pool workers, then
    fans ('resolvers', threshold) pairs out to the pool; results are forwarded
    through a queue to a dumper process.

    Two things happen during setup:
        1) probes are reduced to those with at least one public resolver;
        2) each probe is labeled with the most "common" (across all probes)
           resolver it observed.

    Args:
        workers: number of Pool processes to run.
        **kwargs: forwarded to skyclusters.SkyClusterBuilder.
    """
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(**kwargs)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    R = skyresolvers.Resolvers()
    invR = R.get_inverse()  # resolver -> set of probes that observed it
    keep = set()
    for resolver in invR:
        keep.update(invR[resolver])
    # Keep only probes that observed at least one public resolver.
    g_scb.nodes._probes_df = g_scb.nodes._probes_df.loc[
        g_scb.nodes._probes_df['probe'].isin(keep)]
    g_scb.reduce_matrix_to_sampled()
    # Pickle files are binary: must open with 'rb' (text mode breaks on Python 3).
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    # Assign each probe its most widely-observed resolver: walk resolvers from
    # most- to least-common (keys sorted ascending, popped from the end).
    keys = sorted(list(invR.keys()), key=lambda z: len(invR[z]))
    probes = dict()
    while len(keep):
        key = keys.pop()
        for probe in invR[key]:
            if probe in keep:
                probes[probe] = key
                keep.remove(probe)
                if len(keep) == 0:
                    break
    resolvers = [probes[z] for z in g_scb.nodes._probes_df.probe.to_list()]
    resolvers_set = list(set(resolvers))
    with open(
            g_scb.fmt_path(
                'datadir/homogeneity_and_completeness/labelsets/resolvers.json'
            ), 'w') as f:
        json.dump(resolvers_set, f)
    g_scb.nodes._probes_df = g_scb.nodes._probes_df.assign(resolvers=resolvers)
    global g_ca
    g_ca = clusterAnalysis.ClusterAnalysis(scb=g_scb)
    skyclusters.gc.collect()
    categories = ['resolvers']
    thresholds = np.arange(0.05, 1.0, 0.05)
    itr = itertools.product(categories, thresholds)
    global q
    q = Queue()
    pool = Pool(workers)
    dumper = Process(target=cycle_worker, args=(q, ))
    dumper.start()
    for res in pool.imap_unordered(get_homogeneity_and_completeness, itr):
        q.put((dump_homogeneity_and_completeness, res))
    # Release pool workers before signalling the dumper to exit.
    pool.close()
    pool.join()
    q.put(('exit', True))
    dumper.join()
コード例 #4
0
File: crne_plots.py  Project: mwarrior92/skylines
def make_domain_error(**kwargs):
    """Load the distance matrix into shared memory and plot domain error.

    The matrix is copied into a RawArray so that worker processes spawned by
    the builder can read it without serialization, then the domain-error
    pipeline (compute, condense, plot) is run.
    """
    scb = skyclusters.SkyClusterBuilder(**kwargs)
    scb.nodes.keep_only({'results'})

    with open(scb.fmt_path('datadir/matrix/matrix.json'), 'r') as f:
        raw = json.load(f)
        # Copy the JSON list into a shared double array, element by element.
        shared = RawArray('d', len(raw))
        for idx, value in enumerate(raw):
            shared[idx] = value
        skyclusters.g_matrix = shared
        scb.matrix = shared
        del raw
    skyclusters.gc.collect()
    scb.domain_error()
    scb.condense_domain_error()
    scb.plot_domain_error()
コード例 #5
0
def make_pings_vs_cnre(workers=2, chunksize=500):
    """Relate per-probe mean ping latency to CNRE, dumping results in batches.

    Builds module-global state (g_scb, g_means, q) so forked pool workers can
    reach it, computes each probe's mean flat-ping value into a shared array,
    then fans probe indices out to the pool; batched results are forwarded
    through a queue to a dumper process.

    Args:
        workers: number of Pool processes to run.
        chunksize: imap_unordered chunk size.
    """
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(limit=500)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    # Pickle files are binary: must open with 'rb' (text mode breaks on Python 3).
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    skyclusters.gc.collect()
    pings = Pings()
    global g_means
    # An int size zero-initializes the shared array; the previous
    # itertools.repeat(...) initializer has no len() and raises TypeError.
    g_means = RawArray('d', len(g_scb.nodes))
    for i in range(len(g_scb.nodes)):
        node = g_scb.nodes[i]
        try:
            flat = pings.get_flat_pings(node.probe)
        except Exception:
            # Best-effort: a probe with unavailable ping data keeps mean 0.0.
            flat = None
        if flat is None or len(flat) == 0:
            continue
        g_means[i] = np.mean(flat)
    del pings
    pool = Pool(workers)
    global q
    q = Queue()
    dumper = Process(target=cycle_worker, args=(q, ))
    dumper.start()
    data = list()
    for res in pool.imap_unordered(get_ping_vs_cnre, range(len(g_scb.nodes)),
                                   chunksize):
        data.append(res)
        # Dump in batches of 10k to bound memory.
        if len(data) >= 10000:
            q.put((dump_pings_vs_cnre, data))
            del data
            data = list()
    if len(data) > 0:
        q.put((dump_pings_vs_cnre, data))
    # Release pool workers before signalling the dumper to exit.
    pool.close()
    pool.join()
    q.put(('exit', True))
    dumper.join()
コード例 #6
0
def make_geo_vs_cnre(workers=2, chunksize=500):
    """Relate geographic distance to CNRE, dumping results in batches.

    Builds module-global state (g_scb, q) so forked pool workers can reach it,
    then fans matrix indices out to the pool; batched results are forwarded
    through a queue to a dumper process.

    Args:
        workers: number of Pool processes to run.
        chunksize: imap_unordered chunk size.
    """
    global g_scb
    g_scb = skyclusters.SkyClusterBuilder(limit=500)
    g_scb.load_matrix_from_file('datadir/matrix/matrix.json')
    # Pickle files are binary: must open with 'rb' (text mode breaks on Python 3).
    with open(g_scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        g_scb.kwargs['counts'] = pkl.load(f)
    skyclusters.gc.collect()
    pool = Pool(workers)
    global q
    q = Queue()
    dumper = Process(target=cycle_worker, args=(q, ))
    dumper.start()
    data = list()
    for res in pool.imap_unordered(get_geo_vs_cnre, range(len(g_scb.matrix)),
                                   chunksize):
        data.append(res)
        # Dump in batches of 10k to bound memory.
        if len(data) >= 10000:
            q.put((dump_geo_vs_cnre, data))
            del data
            data = list()
    if len(data) > 0:
        q.put((dump_geo_vs_cnre, data))
    # Release pool workers before signalling the dumper to exit.
    pool.close()
    pool.join()
    q.put(('exit', True))
    dumper.join()
コード例 #7
0
def make_dendrogram():
    """Render a truncated dendrogram (last 50 merges) of the probe clustering."""
    scb = skyclusters.SkyClusterBuilder(limit=500)

    # Pickle files are binary: must open with 'rb' (text mode breaks on Python 3).
    with open(scb.fmt_path('datadir/pkls/answer_counts.pkl'), 'rb') as f:
        scb.kwargs['counts'] = pkl.load(f)
    scb.make_dendrogram(no_labels=True, truncate_mode='lastp', p=50)
コード例 #8
0
def plot_closeness_for_category(**kwargs):
    """Plot CNRE closeness of same-label vs different-label probe pairs.

    For each category (asn/prefix/country/ip24) and each label within it, a
    pool worker computes same-group and cross-group closeness stats; this
    function aggregates them, writes raw data as JSON lines, and renders a
    CDF plot per category plus a scatter of group size vs mean CNRE.

    Uses module globals (scb, nodes, g_inds) so forked pool workers can reach
    the builder and precomputed per-label index lists.

    Args:
        **kwargs: forwarded to skyclusters.SkyClusterBuilder.
    """
    global scb
    scb = skyclusters.SkyClusterBuilder(**kwargs)
    scb.load_matrix_from_file('datadir/matrix/matrix.json')
    global nodes
    nodes = scb.nodes.probes_df
    categories = ['asn', 'prefix', 'country', 'ip24']
    # Flatten to a list of (category, label) work items for the pool.
    labels = [(category, set(nodes[category].tolist()))
              for category in categories]
    labels = [y for z in labels for y in list(itertools.product([z[0]], z[1]))]
    global g_inds
    # g_inds[category][label] -> row indices of probes carrying that label.
    g_inds = defaultdict(dict)
    for category in categories:
        lbls = set(nodes[category].to_list())
        for lbl in lbls:
            g_inds[category][lbl] = nodes.loc[nodes[category] ==
                                              lbl].idx.to_list()
    sames = defaultdict(list)
    diffs = defaultdict(list)
    sizes = list()
    count = 0
    fname = scb.fmt_path('datadir/closeness_vs_category/data.json')
    # Truncate the output file before appending batches below.
    with open(fname, 'w') as f:
        f.write('')
    data = list()
    pool = Pool(2)
    for res in pool.imap_unordered(get_same_diff_for_pair, labels, 100):
        try:
            sames[res['c']].append(res['sm'])
            # (label, median, size)
            sizes.append((res['l'], res['smn'], res['sz'], str(res['std'])))
        except KeyError:
            # Singleton labels have no same-group stats; skip them.
            pass
        diffs[res['c']].append(res['df'])
        data.append(res)
        # Periodically flush accumulated results to disk.
        if count % 100 == 0:
            print('\n' + str(res) + '\n')
            sys.stdout.flush()
            with open(fname, 'a') as f:
                f.write(json.dumps(data) + '\n')
            data = list()
        count += 1
    # Release pool workers (previously leaked).
    pool.close()
    pool.join()
    if data:
        with open(fname, 'a') as f:
            f.write(json.dumps(data) + '\n')
        del data

    data = dict()
    for category in diffs:
        # CDFs of closeness for same-label vs different-label pairs.
        fig, (ax) = plt.subplots(1, 1, figsize=(4, 4))
        styles = itertools.cycle(['-', '--', '-.', ':'])
        ecdf = ECDF(diffs[category])
        lines = list()
        xd, yd = list(ecdf.x), list(ecdf.y)
        lines += ax.plot(xd, yd, next(styles), label='diff')
        ecdf = ECDF(sames[category])
        xs, ys = list(ecdf.x), list(ecdf.y)
        lines += ax.plot(xs, ys, next(styles), label='same')
        # Mark the 95th percentile of the diff distribution.
        m = np.percentile(diffs[category], 95)
        ax.axvline(m, color='r', linewidth=0.7)
        ax.annotate("{:.2f}".format(m), (m, 0.8),
                    textcoords="offset points",
                    ha='center',
                    fontsize=16,
                    backgroundcolor='white')
        ax.legend(lines, [z.get_label() for z in lines])
        ax.set_xlim([0, 1.0])
        ax.set_ylim([0, 1.0])
        ax.set_xlabel('median cnre')
        ax.set_ylabel('CDF')
        fig.savefig(
            scb.fmt_path('plotsdir/closeness_vs_category/' + category +
                         '.png'))
        plt.close(fig)
        with open(
                scb.fmt_path('datadir/closeness_vs_category/' + category +
                             '.json'), 'w') as f:
            json.dump({'same': (xs, ys), 'diff': (xd, yd), 'm': m}, f)
    # Scatter of label-group size vs mean CNRE, colored by std deviation.
    fig3, (ax3) = plt.subplots(1, 1)
    _, x, y, stds = zip(*sizes)
    scatter = ax3.scatter(x, y, c=stds, edgecolors='k')
    ax3.set_xlabel('mean cnre')
    ax3.set_ylabel('label group size')
    plt.colorbar(scatter, ticks=np.arange(0.0, 1.01, 0.2))
    fig3.savefig(scb.fmt_path('plotsdir/closeness_vs_category/size.png'))
    plt.close(fig3)
    with open(scb.fmt_path('datadir/closeness_vs_category/size.json'),
              'w') as f:
        json.dump({'means': x, 'sizes': y, 'stds': stds}, f)