Example #1
0
def plot_homogeneity_and_completeness2(category):
    '''
    Plot homogeneity, completeness and cluster count against the threshold.

    Reads the line-delimited JSON results for ``category`` (records with
    ``threshold``, ``homogeneity``, ``completeness`` and ``nclusters``
    fields) and the matching label set, both located under
    ``datadir/homogeneity_and_completeness/``.  Homogeneity and completeness
    go on the left axis (fixed to [0, 1]); the number of clusters goes on a
    log-scaled twin axis, where a horizontal black line marks the size of the
    label set.  The figure is written to
    ``plotsdir/homogeneity_and_completeness_presentation/<category>2.png``.

    Args:
        category: category name used to build the input and output file names.
    '''
    print(category)
    D = DataGetter()
    base = 'datadir/homogeneity_and_completeness/'
    data = pandas.read_json(D.fmt_path(base + category + '.json'), lines=True)
    data = data.sort_values(by=['threshold'])
    with open(D.fmt_path(base + 'labelsets/' + category + '.json'), 'r') as f:
        labelset = json.load(f)

    x = data.threshold.to_list()
    fig, ax = plt.subplots(1, 1)
    ax2 = ax.twinx()
    # NOTE(review): every series carries a label but no legend is drawn;
    # confirm whether that is deliberate before adding one.
    ax.plot(x, data.homogeneity.to_list(), '--', label='homogeneity')
    ax.plot(x, data.completeness.to_list(), label='completeness')
    ax2.plot(x, data.nclusters.to_list(), 'r:', label='# clusters')
    ax2.axhline(len(labelset), color='k')
    ax2.set_yscale('log')
    ax.set_xlabel('distance threshold')
    ax.set_ylabel('Homog. & Compl.')
    ax2.set_ylabel('# clusters', rotation=270)
    ax.set_ylim([0, 1])
    fig.savefig(
        D.fmt_path('plotsdir/homogeneity_and_completeness_presentation/' +
                   category + '2.png'))
    # pyplot keeps every figure alive until it is explicitly closed; release
    # it like the other plotting functions do.
    plt.close(fig)
def plot_domain_alignment():
    '''
    Plot the distribution of per-domain alignment and performance deviations.

    Loads ``datadir/deviations.json``, a list of three-element entries that
    are unpacked as (domain, alignment deviation, performance deviation), and
    writes two figures under ``plotsdir/domain_alignment/``:

    * ``alignment.png``: empirical CDF of the alignment deviations.
    * ``align_vs_perf.png``: 50-bin 2-D histogram of alignment deviation
      against performance deviation, restricted to entries whose performance
      deviation is not ``None``.

    NOTE(review): the deviations file is presumably produced by a separate
    step that subtracts each cluster's mean alignment / mean performance from
    the per-domain values -- confirm against the generating code.
    '''
    getter = DataGetter()
    with open(getter.fmt_path('datadir/deviations.json'), 'r') as handle:
        triples = json.load(handle)
    _, alignment, performance = zip(*triples)

    # Figure 1: CDF of the alignment deviations.
    cdf_fig, cdf_ax = plt.subplots(figsize=(6, 3.5))
    cdf = ECDF(alignment)
    cdf_ax.plot(list(cdf.x), list(cdf.y))
    cdf_ax.set_xlabel('distance from mean alignment')
    cdf_ax.set_ylabel('CDF')
    cdf_fig.savefig(getter.fmt_path('plotsdir/domain_alignment/alignment.png'))
    plt.close(cdf_fig)

    # Figure 2: density of alignment deviation vs. performance deviation.
    heat_fig, heat_ax = plt.subplots(figsize=(4.5, 4.5))
    paired = [
        pair for pair in zip(alignment, performance) if pair[1] is not None
    ]
    alignment, performance = zip(*paired)
    counts, x_edges, y_edges = np.histogram2d(alignment, performance, bins=50)
    image = heat_ax.imshow(
        counts.T,  # histogram2d puts x on the first axis; imshow wants rows=y
        extent=[x_edges[0], x_edges[-1], y_edges[0], y_edges[-1]],
        origin='lower',
        cmap='Greys',
        aspect='auto')
    heat_fig.colorbar(image)
    heat_ax.set_xlabel('distance from mean alignment')
    heat_ax.set_ylabel('distance from mean performance')
    heat_fig.savefig(
        getter.fmt_path('plotsdir/domain_alignment/align_vs_perf.png'))
    plt.close(heat_fig)
Example #3
0
def ChangePrefix(node, newp, oldp=24):
    '''
    Convert every result of ``node`` from prefix ``oldp`` to prefix ``newp``.

    Each value in ``node.results`` is passed through ``DataGetter.int2ip``
    with ``oldp`` and the outcome through ``DataGetter.ip2int`` with ``newp``.
    The converted mapping is stored back as ``node['results']``.

    Args:
        node: object exposing ``results`` as a mapping attribute and
            supporting item assignment (``node['results'] = ...``).
        newp: second argument handed to ``ip2int`` (presumably the new
            prefix length -- confirm against ``DataGetter``).
        oldp: second argument handed to ``int2ip``; defaults to 24.

    Returns:
        The same ``node`` object, updated in place.
    '''
    d = DataGetter()
    # ``items()`` exists on both Python 2 and 3 mappings, whereas
    # ``iteritems()`` raises AttributeError on a Python 3 dict.
    node['results'] = {
        k: d.ip2int(d.int2ip(v, oldp), newp)
        for k, v in node.results.items()
    }
    return node
def plot_geo_centers():
    '''
    Draw every geographic center as a green dot on a world map.

    Reads ``datadir/geo_vs_perf/raw.json`` and, for each entry, builds a
    ``Point`` from ``geo_center_loc`` with its two elements swapped
    (presumably stored as (lat, lon) and plotted as (x=lon, y=lat) --
    confirm).  The base map is the ``naturalearth_lowres`` dataset without
    Antarctica and without rows whose ``pop_est`` is not positive.  The
    figure is saved to ``plotsdir/geo_centers.png``.
    '''
    getter = DataGetter()
    basemap = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))
    keep = (basemap.pop_est > 0) & (basemap.name != "Antarctica")
    basemap = basemap[keep]

    with open(getter.fmt_path('datadir/geo_vs_perf/raw.json'), 'r') as handle:
        located = [(Point(entry['geo_center_loc'][1],
                          entry['geo_center_loc'][0]), len(entry['raw_data']))
                   for entry in json.load(handle)]
    # The per-entry sizes are computed but not used by the plot.
    points, _sizes = zip(*located)

    fig, ax = plt.subplots(figsize=(15, 15))
    basemap.plot(edgecolor='gray', ax=ax)
    centers = gp.GeoDataFrame(geometry=list(points))
    centers.plot(ax=ax, markersize=5, color='#00FF00')
    fig.savefig(getter.fmt_path('plotsdir/geo_centers.png'))
    plt.close(fig)
def plot_nearest_centers():
    '''
    Compare the default center with the closest center for every record.

    Reads ``datadir/nearest_centers.json``, whose entries have six elements;
    the last four are used as (c0, c1, d0, d1).  Two scatter plots are saved:

    * ``plotsdir/nearest_centers_cnre.png``: ``1 - c0`` against ``1 - c1``
      with the red diagonal from (0, 0) to (1, 1); the axis limits chosen by
      the scatter are kept.
    * ``plotsdir/nearest_centers_dist.png``: ``d0`` against ``d1`` on log-log
      axes with identical limits on both axes and a red diagonal.
    '''
    getter = DataGetter()
    with open(getter.fmt_path('datadir/nearest_centers.json'), 'r') as handle:
        records = json.load(handle)

    cnre_pairs = []
    km_pairs = []
    for _, _, c_default, c_closest, km_default, km_closest in records:
        cnre_pairs.append((1.0 - c_default, 1.0 - c_closest))
        km_pairs.append((km_default, km_closest))

    # --- CNRE comparison -------------------------------------------------
    cnre_x, cnre_y = zip(*cnre_pairs)
    cnre_fig, cnre_ax = plt.subplots(figsize=(4, 6))
    cnre_ax.scatter(cnre_x, cnre_y, alpha=0.1)
    # Remember the limits picked for the data so that drawing the diagonal
    # below does not widen the view.
    saved_xlim = cnre_ax.get_xlim()
    saved_ylim = cnre_ax.get_ylim()
    cnre_ax.set_xlabel('CNRE with default center')
    cnre_ax.set_ylabel('CNRE with closest center')
    cnre_ax.plot([0, 1], [0, 1], 'r')
    cnre_ax.set_xlim(saved_xlim)
    cnre_ax.set_ylim(saved_ylim)
    cnre_fig.savefig(getter.fmt_path('plotsdir/nearest_centers_cnre.png'))
    plt.close(cnre_fig)

    # --- distance comparison ---------------------------------------------
    km_x, km_y = zip(*km_pairs)
    km_fig, km_ax = plt.subplots(figsize=(4.5, 4.5))
    km_ax.scatter(km_x, km_y, alpha=0.1)
    km_ax.set_xlabel('km to default center')
    km_ax.set_ylabel('km to closest center')
    km_ax.set_xscale('log')
    km_ax.set_yscale('log')
    # Use one common range for both axes so the diagonal is the line x == y.
    low = min([km_ax.get_xlim()[0], km_ax.get_ylim()[0]])
    high = max([km_ax.get_xlim()[1], km_ax.get_ylim()[1]])
    shared = [low, high]
    km_ax.set_xlim(shared)
    km_ax.set_ylim(shared)
    km_ax.plot(shared, shared, 'r')
    km_fig.savefig(getter.fmt_path('plotsdir/nearest_centers_dist.png'))
    plt.close(km_fig)
Example #6
0
def dump_domain_error(data):
    '''
    Append error values, grouped by domain, to per-domain files.

    The values of each domain seen in ``data`` are written as one JSON list
    on a new line of ``<datadir>domain_error/<dom>.json``; existing file
    content is kept (append mode).  The ``domain_error/`` directory is
    created when missing.

    Args:
        data: iterable of ``(dom, val)`` pairs.
    '''
    grouped = defaultdict(list)
    for dom, val in data:
        grouped[dom].append(val)

    D = DataGetter()
    outdir = D.datadir + 'domain_error/'
    # ``exist_ok`` removes the check-then-create race: a separate
    # ``os.path.exists`` test fails with "File exists" if another process
    # creates the directory between the check and ``makedirs``.
    os.makedirs(outdir, exist_ok=True)

    for dom, vals in grouped.items():
        with open(outdir + str(dom) + '.json', 'a') as f:
            f.write(json.dumps(vals) + '\n')
Example #7
0
 def get_addr_differences(self, cluster):
     '''
     Find the (site, addr) answers that are largely confined to ``cluster``.

     For every answer seen by the cluster's nodes, the number of cluster
     nodes that saw it is divided by its global count loaded from
     ``datadir/pkls/flat_answer_counts.pkl``.  Answers for which that ratio
     is at least 0.90 are kept.

     Args:
         cluster: iterable of node indices into ``self.scb.nodes``.

     Returns:
         ``defaultdict(list)`` mapping each site to the list of addrs that
         passed the 0.90 threshold.

     TODO: make 'flat_answer_counts.pkl', which groups by addr instead of by (site,addr)
     '''
     answers = defaultdict(set)
     for i in cluster:
         # NOTE(review): ``results`` is iterated directly and unpacked into
         # (site, addrs) pairs; if it is a plain dict this walks the keys
         # only -- confirm whether ``.items()`` is intended.
         for site, addrs in self.scb.nodes[i].results:
             for addr in addrs:
                 answers[(site, addr)].add(i)

     D = DataGetter()
     # Pickle streams are binary: on Python 3 ``pkl.load`` fails on a file
     # opened in text mode, so read the file as bytes.
     with open(D.fmt_path('datadir/pkls/flat_answer_counts.pkl'), 'rb') as f:
         global_counts = pkl.load(f)

     uniques = defaultdict(list)
     for (site, addr), members in answers.items():
         world = float(global_counts[(site, addr)])
         coverage = float(len(members)) / world
         if coverage >= 0.90:
             uniques[site].append(addr)
     return uniques
Example #8
0
    def plot_domain_error(self, *allstats):
        '''
        Scatter mean domain error against answer diversity.

        Reads ``datadir/domain_error/summary.json`` and plots, per domain,
        ``abs_mean`` (x) against ``diversity`` (y, log scale).  Marker sizes
        are the domains' ``std`` values rescaled so the largest becomes 50.
        The figure is saved to ``plotsdir/domain_error/<self.timeid>.png``
        and the output path is printed.

        Args:
            *allstats: accepted for compatibility with callers; not used.
        '''
        D = DataGetter()
        with open(D.fmt_path('datadir/domain_error/summary.json'), 'r') as f:
            stats = json.load(f)

        mean_errs = list()
        answers = list()
        stds = list()
        for entry in stats.values():
            mean_errs.append(entry['abs_mean'])
            stds.append(entry['std'])
            answers.append(entry['diversity'])

        # Guard the rescaling: an empty summary has no maximum, and a summary
        # whose deviations are all zero would otherwise divide by zero.
        largest = float(max(stds)) if stds else 0.0
        if largest:
            sizes = [50.0 * float(z) / largest for z in stds]
        else:
            sizes = [0.0] * len(stds)

        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.scatter(mean_errs, answers, sizes, alpha=0.3)
        ax.set_xlabel('mean domain error')
        ax.set_ylabel('# distinct answers from domain')
        ax.set_yscale('log')
        outpath = self.fmt_path('plotsdir/domain_error/' + self.timeid +
                                '.png')
        fig.savefig(outpath)
        plt.close(fig)
        print(outpath)
Example #9
0
 def condense_domain_error(self):
     '''
     Summarise the per-domain error files into one JSON document.

     For every key of ``DataGetter(prefix=self.prefix).test_counts`` the file
     ``<datadir>domain_error/<dom>.json`` is read; each of its lines is a
     JSON list and all lists are concatenated.  Domains with at least one
     value get an entry holding the standard deviation, mean and median of
     the raw values, the domain's ``diversity``, and the mean, median, 25th
     and 75th percentile of the absolute values.  The result is written to
     ``datadir/domain_error/summary.json``.
     '''
     getter = DataGetter(prefix=self.prefix)
     summary = {}
     for dom in getter.test_counts.keys():
         path = getter.datadir + 'domain_error/' + str(dom) + '.json'
         errors = []
         with open(path, 'r') as handle:
             for row in handle:
                 errors.extend(json.loads(row))
         if not errors:
             # Nothing recorded for this domain: leave it out of the summary.
             continue

         entry = {}
         # Statistics over the signed values.
         entry['std'] = np.std(errors)
         entry['raw_mean'] = np.mean(errors)
         entry['raw_median'] = np.median(errors)
         entry['diversity'] = getter.diversity(dom)
         # Statistics over the magnitudes.
         magnitudes = [abs(value) for value in errors]
         entry['abs_mean'] = np.mean(magnitudes)
         entry['abs_median'] = np.median(magnitudes)
         entry['25'] = np.percentile(magnitudes, 25)
         entry['75'] = np.percentile(magnitudes, 75)
         summary[dom] = entry

     target = getter.fmt_path('datadir/domain_error/summary.json')
     with open(target, 'w') as handle:
         json.dump(summary, handle)
Example #10
0
def plot_cnre_country_map():
    '''
    Draw a world map coloured by the per-country value from the CNRE data.

    Steps:

    1. Load the ``naturalearth_lowres`` map, dropping Antarctica and rows
       whose ``pop_est`` is not positive.
    2. Read ``datadir/countries_codes.csv`` and map each line's second field
       to its first field (presumably country code -> country name --
       confirm against the file).
    3. Read ``datadir/closeness_vs_category/data.json`` (one JSON list per
       line), keep the items whose ``c`` is ``'country'`` and store their
       ``df`` under their ``l``; the mapping is also dumped to
       ``datadir/country_cnre_distance_map.json``.
    4. Colour every map row with its value.  Rows without a value are given
       the minimum value for the colour scale and then painted white via
       ``plotCountryPatch``.
    5. Save the figure, with a horizontal colour bar, to
       ``plotsdir/cnre_country_uniqueness_map.png``.
    '''
    D = DataGetter()
    world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))
    world = world[(world.pop_est > 0) & (world.name != "Antarctica")]
    names = world.name.tolist()
    pos = {name: i for i, name in enumerate(names)}
    # Hard-coded positions for names that are spelled differently in the
    # conversion table than in the map data.
    pos['United States'] = 4
    pos['Iran, Islamic Republic of'] = 107

    conversions = dict()
    with open(D.fmt_path('datadir/countries_codes.csv'), 'r') as f:
        for line in f:
            pieces = [z.replace('"', '') for z in line.split('",')]
            # Lines without a second field carry no mapping; skip them
            # explicitly instead of swallowing every exception.
            if len(pieces) < 2:
                continue
            conversions[pieces[1].strip()] = pieces[0].strip()

    weights = dict()
    with open(D.fmt_path('datadir/closeness_vs_category/data.json'), 'r') as f:
        for line in f:
            for item in json.loads(line):
                if item['c'] == 'country':
                    weights[item['l']] = item['df']
    with open(D.fmt_path('datadir/country_cnre_distance_map.json'), 'w') as f:
        json.dump(weights, f)

    covered = {
        pos[conversions[z]]: weights[z]
        for z in weights if z and z in conversions and conversions[z] in pos
    }
    # Placeholder for rows without data; computed once instead of on every
    # loop iteration.  Those rows are painted white below.
    fill = min(covered.values())
    vals = list()
    uncovered = list()
    for i in range(len(names)):
        if i in covered:
            vals.append(covered[i])
        else:
            vals.append(fill)
            uncovered.append(i)
    print(sorted(vals))
    world['cnre'] = vals

    lo, hi = min(vals), max(vals)
    fig, ax = plt.subplots(figsize=(6, 4))
    world.plot(column='cnre',
               cmap='cool',
               edgecolor='gray',
               ax=ax,
               norm=colors.Normalize(vmin=lo, vmax=hi))
    for idx in uncovered:
        # NOTE(review): presumably compensates for the removed Antarctica
        # row so the index matches what plotCountryPatch expects -- confirm.
        if idx > 158:
            idx += 1
        plotCountryPatch(ax, idx, 'white', world)
    ax.set_xticks([], [])
    ax.set_yticks([], [])

    sm = plt.cm.ScalarMappable(cmap='cool',
                               norm=colors.Normalize(vmin=lo, vmax=hi))
    # The mappable has no data of its own; it only drives the colour bar.
    sm._A = []
    cax = fig.add_axes([0.08, 0.15, 0.82, 0.05])
    fig.colorbar(sm, cax=cax, orientation='horizontal')
    cax.set_xlabel('mean external CNRE')
    fig.savefig(D.fmt_path('plotsdir/cnre_country_uniqueness_map.png'))
    plt.close(fig)
Example #11
0
def dump_homogeneity_and_completeness(data):
    '''
    Append ``data`` as one JSON line to the results file of its category.

    The target is ``datadir/homogeneity_and_completeness/<category>.json``,
    where ``<category>`` is ``data['category']``.

    Args:
        data: JSON-serialisable mapping with a ``'category'`` entry.
    '''
    getter = DataGetter()
    relative = ('datadir/homogeneity_and_completeness/' + data['category'] +
                '.json')
    with open(getter.fmt_path(relative), 'a') as sink:
        sink.write(json.dumps(data) + '\n')
Example #12
0
def dump_geo_vs_cnre(data):
    '''
    Append every item of ``data`` as a JSON line to ``datadir/vs_cnre/geodists``.

    Args:
        data: iterable of JSON-serialisable results.
    '''
    getter = DataGetter()
    with open(getter.fmt_path('datadir/vs_cnre/geodists'), 'a') as sink:
        sink.writelines(json.dumps(entry) + '\n' for entry in data)