import json
import pickle as pkl
from collections import defaultdict

import numpy as np
import pandas
import geopandas as gp
import matplotlib.pyplot as plt
from matplotlib import colors
from shapely.geometry import Point
from statsmodels.distributions.empirical_distribution import ECDF

# DataGetter, g_ca, g_clusters, and get_domain_alignment are assumed to be
# provided elsewhere in this package.


def plot_homogeneity_and_completeness2(category):
    print(category)
    D = DataGetter()
    data = pandas.read_json(
        D.fmt_path('datadir/homogeneity_and_completeness/' + category +
                   '.json'), lines=True)
    data = data.sort_values(by=['threshold'])
    with open(
            D.fmt_path('datadir/homogeneity_and_completeness/labelsets/' +
                       category + '.json'), 'r') as f:
        labelset = json.load(f)
    x = data.threshold.to_list()
    y1 = data.homogeneity.to_list()
    y2 = data.completeness.to_list()
    y3 = data.nclusters.to_list()
    fig, ax = plt.subplots(1, 1)
    ax2 = ax.twinx()  # second y axis for the cluster count
    l1 = ax.plot(x, y1, '--', label='homogeneity')
    l2 = ax.plot(x, y2, label='completeness')
    l3 = ax2.plot(x, y3, 'r:', label='# clusters')
    # horizontal reference line at the true number of labels
    ax2.axhline(len(labelset), color='k')
    # vind, _ = min([(i, abs(n - len(labelset))) for i, n in enumerate(y3)],
    #               key=lambda z: z[1])
    # ax.axvline(x[vind], color='green')
    # ax.axvline(1.0 - 0.73, color='green')
    lines = l1 + l2 + l3
    # combined legend across both axes (the `lines` list was assembled but
    # never used, which suggests this call was intended)
    ax.legend(lines, [l.get_label() for l in lines])
    ax2.set_yscale('log')
    ax.set_xlabel('distance threshold')
    ax.set_ylabel('Homog. & Compl.')
    ax2.set_ylabel('# clusters', rotation=270)
    ax.set_ylim([0, 1])
    fig.savefig(
        D.fmt_path('plotsdir/homogeneity_and_completeness_presentation/' +
                   category + '2.png'))
    plt.close(fig)
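# Each line of datadir/homogeneity_and_completeness/<category>.json is expected
# to be a JSON object with at least the columns used above; the values below
# are purely illustrative:
# {"threshold": 0.1, "homogeneity": 0.95, "completeness": 0.40, "nclusters": 120}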
def plot_domain_alignment():
    '''
    Disabled first stage: compute per-domain deviations from each cluster's
    mean alignment and mean performance, then dump them to
    datadir/domain_alignment/.

    all_sets = list()
    pool = Pool(6)
    for tmp_counts in pool.imap_unordered(get_domain_alignment,
                                          range(len(g_clusters))):
        if tmp_counts:
            all_sets.append(tmp_counts)
    try:
        with open(g_ca.fmt_path('datadir/domain_alignment/raw.json'), 'w') as f:
            json.dump(all_sets, f)
    except:
        print('failed to save raw')
    data = list()
    means = dict()
    for i, cluster in enumerate(all_sets):
        alns, sizes, perfs = zip(*cluster.values())
        mean_aln = np.mean(alns)
        perfs = [z for z in perfs if z > 0]
        if perfs:
            mean_perf = np.mean(perfs)
        else:
            mean_perf = None
        means[i] = (mean_aln, mean_perf)
        for dom, val in cluster.items():
            aln, s, perf = val
            data.append((dom, aln - mean_aln,
                         perf - mean_perf if mean_perf and perf else None))
    with open(g_ca.fmt_path('datadir/domain_alignment/deviations.json'),
              'w') as f:
        json.dump(data, f)
    '''
    D = DataGetter()
    # with open(D.fmt_path('datadir/domain_alignment/deviations.json'), 'r') as f:
    with open(D.fmt_path('datadir/deviations.json'), 'r') as f:
        data = json.load(f)
    doms, aln_devs, perf_devs = zip(*data)
    # CDF of each domain's deviation from its cluster's mean alignment
    fig, ax = plt.subplots(figsize=(6, 3.5))
    ecdf = ECDF(aln_devs)
    ax.plot(list(ecdf.x), list(ecdf.y))
    ax.set_xlabel('distance from mean alignment')
    ax.set_ylabel('CDF')
    fig.savefig(D.fmt_path('plotsdir/domain_alignment/alignment.png'))
    plt.close(fig)
    # 2D histogram of alignment deviation vs. performance deviation,
    # restricted to domains that have a performance value
    fig, ax = plt.subplots(figsize=(4.5, 4.5))
    aln_devs, perf_devs = zip(
        *[z for z in zip(aln_devs, perf_devs) if z[1] is not None])
    heatmap, x, y = np.histogram2d(aln_devs, perf_devs, bins=50)
    extent = [x[0], x[-1], y[0], y[-1]]
    pos = ax.imshow(heatmap.T, extent=extent, origin='lower', cmap='Greys',
                    aspect='auto')
    fig.colorbar(pos)
    ax.set_xlabel('distance from mean alignment')
    ax.set_ylabel('distance from mean performance')
    fig.savefig(D.fmt_path('plotsdir/domain_alignment/align_vs_perf.png'))
    plt.close(fig)
def plot_geo_centers():
    D = DataGetter()
    world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))
    world = world[(world.pop_est > 0) & (world.name != "Antarctica")]
    with open(D.fmt_path('datadir/geo_vs_perf/raw.json'), 'r') as f:
        # each record stores its center as (lat, lon); Point() takes (lon, lat)
        geometry, sizes = zip(
            *[(Point(z['geo_center_loc'][1], z['geo_center_loc'][0]),
               len(z['raw_data'])) for z in json.load(f)])
    fig, ax = plt.subplots(figsize=(15, 15))
    world.plot(edgecolor='gray', ax=ax)
    gdf = gp.GeoDataFrame(geometry=list(geometry))
    gdf.plot(ax=ax, markersize=5, color='#00FF00')  # sizes currently unused
    fig.savefig(D.fmt_path('plotsdir/geo_centers.png'))
    plt.close(fig)
def plot_nearest_centers():
    D = DataGetter()
    with open(D.fmt_path('datadir/nearest_centers.json'), 'r') as f:
        data = json.load(f)
    cnres = list()
    dists = list()
    for item in data:
        _, _, c0, c1, d0, d1 = item
        cnres.append((1.0 - c0, 1.0 - c1))
        dists.append((d0, d1))
    # scatter: CNRE with the default center vs. CNRE with the closest center
    x, y = zip(*cnres)
    # rng = [minlim, maxlim]
    # heatmap, x, y = np.histogram2d(x, y, bins=100, range=[rng, rng])
    # extent = [x[0], x[-1], y[0], y[-1]]
    fig, ax = plt.subplots(figsize=(4, 6))
    ax.scatter(x, y, alpha=0.1)
    # minlim = min([ax.get_xlim()[0], ax.get_ylim()[0]])
    # maxlim = max([ax.get_xlim()[1], ax.get_ylim()[1]])
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # pos = ax.imshow(heatmap.T, extent=extent, origin='lower', cmap='Greys')
    # fig.colorbar(pos)
    ax.set_xlabel('CNRE with default center')
    ax.set_ylabel('CNRE with closest center')
    ax.plot([0, 1], [0, 1], 'r')  # y = x reference line
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    fig.savefig(D.fmt_path('plotsdir/nearest_centers_cnre.png'))
    plt.close(fig)
    # scatter: km to default center vs. km to closest center, log-log
    x, y = zip(*dists)
    fig, ax = plt.subplots(figsize=(4.5, 4.5))
    ax.scatter(x, y, alpha=0.1)
    ax.set_xlabel('km to default center')
    ax.set_ylabel('km to closest center')
    ax.set_xscale('log')
    ax.set_yscale('log')
    minlim = min([ax.get_xlim()[0], ax.get_ylim()[0]])
    maxlim = max([ax.get_xlim()[1], ax.get_ylim()[1]])
    ax.set_xlim([minlim, maxlim])
    ax.set_ylim([minlim, maxlim])
    ax.plot([minlim, maxlim], [minlim, maxlim], 'r')  # y = x reference line
    fig.savefig(D.fmt_path('plotsdir/nearest_centers_dist.png'))
    plt.close(fig)
def get_addr_differences(self, cluster):
    '''
    Collect the set of differing answers from a cluster and determine how rare
    each answer is outside of the cluster.

    TODO: make 'flat_answer_counts.pkl', which groups by addr instead of by
    (site, addr)
    '''
    answers = defaultdict(set)
    for i in cluster:
        for site, addrs in self.scb.nodes[i].results:
            for addr in addrs:
                answers[(site, addr)].add(i)
    D = DataGetter()
    # pickles must be opened in binary mode
    with open(D.fmt_path('datadir/pkls/flat_answer_counts.pkl'), 'rb') as f:
        global_counts = pkl.load(f)
    uniques = defaultdict(list)
    for site, addr in answers:
        world = float(global_counts[(site, addr)])
        coverage = float(len(answers[(site, addr)])) / world
        if coverage >= 0.90:
            # at least 90% of the nodes giving this answer are in the cluster
            uniques[site].append(addr)
    return uniques
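# A minimal sketch for the TODO in get_addr_differences: derive a by-address
# count table from the existing (site, addr) pickle. The output filename and
# the aggregation rule (summing counts across sites) are assumptions, not
# confirmed by this module.
def build_answer_counts_by_addr():
    D = DataGetter()
    with open(D.fmt_path('datadir/pkls/flat_answer_counts.pkl'), 'rb') as f:
        by_site_addr = pkl.load(f)  # {(site, addr): count}
    by_addr = defaultdict(int)
    for (site, addr), count in by_site_addr.items():
        by_addr[addr] += count  # collapse the site dimension
    with open(D.fmt_path('datadir/pkls/answer_counts_by_addr.pkl'), 'wb') as f:
        pkl.dump(dict(by_addr), f)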
def plot_domain_error(self, *allstats):
    # *allstats is accepted but unused; stats are read from summary.json,
    # which is produced by condense_domain_error below
    D = DataGetter()
    with open(D.fmt_path('datadir/domain_error/summary.json'), 'r') as f:
        stats = json.load(f)
    mean_errs = list()
    answers = list()
    stds = list()
    for dom in stats:
        mean_errs.append(stats[dom]['abs_mean'])
        stds.append(stats[dom]['std'])
        answers.append(stats[dom]['diversity'])
    # rescale standard deviations to usable marker sizes
    stds = [50.0 * float(z) / float(max(stds)) for z in stds]
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    ax.scatter(mean_errs, answers, stds, alpha=0.3)
    ax.set_xlabel('mean domain error')
    ax.set_ylabel('# distinct answers from domain')
    ax.set_yscale('log')
    fig.savefig(self.fmt_path('plotsdir/domain_error/' + self.timeid + '.png'))
    plt.close(fig)
    print(self.fmt_path('plotsdir/domain_error/' + self.timeid + '.png'))
def condense_domain_error(self):
    D = DataGetter(prefix=self.prefix)
    data = dict()
    for dom in D.test_counts.keys():
        uncondensed = list()
        with open(D.datadir + 'domain_error/' + str(dom) + '.json', 'r') as f:
            for line in f:
                uncondensed += json.loads(line)
        if len(uncondensed):
            condensed = {
                'std': np.std(uncondensed),
                'raw_mean': np.mean(uncondensed),
                'raw_median': np.median(uncondensed),
                'diversity': D.diversity(dom)
            }
            uncondensed = [abs(z) for z in uncondensed]
            condensed['abs_mean'] = np.mean(uncondensed)
            condensed['abs_median'] = np.median(uncondensed)
            condensed['25'] = np.percentile(uncondensed, 25)
            condensed['75'] = np.percentile(uncondensed, 75)
            data[dom] = condensed
    with open(D.fmt_path('datadir/domain_error/summary.json'), 'w') as f:
        json.dump(data, f)
def plot_cnre_country_map():
    D = DataGetter()
    world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))
    world = world[(world.pop_est > 0) & (world.name != "Antarctica")]
    names = world.name.tolist()
    pos = {names[i]: i for i in range(len(names))}
    # manual overrides for countries whose row position differs from the lookup
    pos['United States'] = 4
    pos['Iran, Islamic Republic of'] = 107
    weights = dict()
    conversions = dict()
    # map country names to the codes used in the CNRE data
    with open(D.fmt_path('datadir/countries_codes.csv'), 'r') as f:
        for line in f:
            pieces = [z.replace('"', '') for z in line.split('",')]
            try:
                conversions[pieces[1].strip()] = pieces[0].strip()
            except IndexError:
                pass
    data = list()
    with open(D.fmt_path('datadir/closeness_vs_category/data.json'), 'r') as f:
        for line in f:
            d = json.loads(line)
            data += [z for z in d if z['c'] == 'country']
    for item in data:
        weights[item['l']] = item['df']
    with open(D.fmt_path('datadir/country_cnre_distance_map.json'), 'w') as f:
        json.dump(weights, f)
    vals = list()
    covered = {
        pos[conversions[z]]: weights[z]
        for z in weights if z and z in conversions and conversions[z] in pos
    }
    nones = list()  # countries with no CNRE data; painted white below
    for i in range(len(names)):
        if i in covered:
            vals.append(covered[i])
        else:
            vals.append(min(covered.values()))
            nones.append(i)
    print(sorted(vals))
    world['cnre'] = vals
    fig, ax = plt.subplots(figsize=(6, 4))
    world.plot(column='cnre', cmap='cool', edgecolor='gray', ax=ax,
               norm=colors.Normalize(vmin=min(vals), vmax=max(vals)))
    for name in nones:
        if name > 158:  # adjust for a row offset past index 158
            name += 1
        plotCountryPatch(ax, name, 'white', world)
    ax.set_xticks([])
    ax.set_yticks([])
    sm = plt.cm.ScalarMappable(cmap='cool',
                               norm=colors.Normalize(vmin=min(vals),
                                                     vmax=max(vals)))
    sm._A = []
    cax = fig.add_axes([0.08, 0.15, 0.82, 0.05])
    cbar = fig.colorbar(sm, cax=cax, orientation='horizontal')
    cax.set_xlabel('mean external CNRE')
    fig.savefig(D.fmt_path('plotsdir/cnre_country_uniqueness_map.png'))
    plt.close(fig)
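# plotCountryPatch is called above but not defined in this file. A minimal
# sketch of a compatible helper, assuming the second argument is a positional
# row index into the world GeoDataFrame; the index adjustment above suggests
# the real helper may index a slightly different frame.
def plotCountryPatch(axes, country_index, fillcolor, world):
    # draw the one country's geometry as a solid patch on the existing axes
    world.iloc[[country_index]].plot(ax=axes, color=fillcolor, edgecolor='gray')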
def dump_homogeneity_and_completeness(data):
    D = DataGetter()
    with open(
            D.fmt_path('datadir/homogeneity_and_completeness/' +
                       data['category'] + '.json'), 'a') as f:
        f.write(json.dumps(data) + '\n')
def dump_geo_vs_cnre(data):
    D = DataGetter()
    with open(D.fmt_path('datadir/vs_cnre/geodists'), 'a') as f:
        for res in data:
            f.write(json.dumps(res) + '\n')
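# A hypothetical driver showing how the module-level plotting entry points
# above might be invoked; the category name is a placeholder, not a value
# confirmed by this file.
if __name__ == '__main__':
    plot_homogeneity_and_completeness2('country')  # placeholder category
    plot_domain_alignment()
    plot_geo_centers()
    plot_nearest_centers()
    plot_cnre_country_map()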