def main():
    """Print cross-identification accuracy against the Norris labels.

    Loads the SWIRE features, labels and test sets through ``pipeline``,
    runs ``pipeline.cross_identify_all`` on the CDFS field, and for every
    resulting cross-identification set compares the predicted SWIRE name of
    each ATLAS object in the 'RGZ & Norris' test set of that quadrant with
    the 'Source SWIRE (Norris)' column of the table at
    ``pipeline.TABLE_PATH``. One tab-separated row is printed per set.

    ATLAS objects without a usable Norris SWIRE name, or without a
    prediction, are counted as skipped. If every object is skipped the
    accuracy is reported as NaN instead of raising ``ZeroDivisionError``.
    """
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False)
    cids = list(
        pipeline.cross_identify_all(
            swire_names, swire_coords, swire_labels, swire_test_sets,
            swire_labels[:, 0], field='cdfs'))

    table = astropy.io.ascii.read(pipeline.TABLE_PATH)
    atlas_to_swire_norris = {}
    key_to_atlas = {}
    for row in table:
        name = row['Component Name (Franzen)']
        # Every key is recorded, even for rows that are filtered out below,
        # because the test-set indices are looked up through this map.
        key_to_atlas[row['Key']] = name
        swire = row['Source SWIRE (Norris)']
        if not swire or not swire.startswith('SWIRE') or not name:
            continue
        atlas_to_swire_norris[name] = swire

    print(
        'Labeller\tClassifier\tQuadrant\tDataset\tn_correct\tn_total\tn_skipped\tAccuracy'
    )
    for cid in cids:
        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        # For each ATLAS object in RGZ & Norris...
        atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES['RGZ & Norris'],
                                     cid.quadrant].nonzero()[0]
        n_total = 0
        n_correct = 0
        n_skipped = 0
        for i in atlas_keys:
            name = key_to_atlas[i]
            if (name not in atlas_to_swire_norris
                    or name not in atlas_to_swire_predictor):
                n_skipped += 1
                continue
            n_correct += (
                atlas_to_swire_norris[name] == atlas_to_swire_predictor[name])
            n_total += 1
        # Guard against a quadrant in which nothing could be compared.
        accuracy = n_correct / n_total if n_total else float('nan')
        print(cid.labeller, cid.classifier, cid.quadrant,
              '{:<20}'.format(cid.dataset_name), n_correct, n_total,
              n_skipped, '{:.02%}'.format(accuracy), sep='\t')
def main():
    """Report how often each predictor agrees with the Norris host.

    The SWIRE features, labels and test sets come from ``pipeline``; the
    cross-identifications come from ``pipeline.cross_identify_all`` with
    ``field='cdfs'``. For each cross-identification set, the ATLAS objects
    flagged in the 'RGZ & Norris' test set of its quadrant are compared with
    the 'Source SWIRE (Norris)' column of ``pipeline.TABLE_PATH`` and a
    tab-separated summary row is printed.

    Objects missing either a Norris SWIRE name or a prediction are tallied
    as skipped. The accuracy column is NaN when nothing was comparable, so
    an empty quadrant no longer aborts the report with
    ``ZeroDivisionError``.
    """
    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False)
    cids = list(pipeline.cross_identify_all(
        swire_names, swire_coords, swire_labels, swire_test_sets,
        swire_labels[:, 0], field='cdfs'))

    atlas_to_swire_norris = {}
    key_to_atlas = {}
    for row in astropy.io.ascii.read(pipeline.TABLE_PATH):
        name = row['Component Name (Franzen)']
        # Keys are registered before filtering: test-set indices refer to
        # them regardless of whether a Norris host exists.
        key_to_atlas[row['Key']] = name
        swire = row['Source SWIRE (Norris)']
        if swire and swire.startswith('SWIRE') and name:
            atlas_to_swire_norris[name] = swire

    print('Labeller\tClassifier\tQuadrant\tDataset\tn_correct\tn_total\tn_skipped\tAccuracy')
    rgz_norris = pipeline.SET_NAMES['RGZ & Norris']
    for cid in cids:
        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        n_total = 0
        n_correct = 0
        n_skipped = 0
        # For each ATLAS object in RGZ & Norris...
        for i in atlas_test_sets[:, rgz_norris, cid.quadrant].nonzero()[0]:
            name = key_to_atlas[i]
            if name not in atlas_to_swire_norris:
                n_skipped += 1
                continue
            if name not in atlas_to_swire_predictor:
                n_skipped += 1
                continue
            n_correct += atlas_to_swire_norris[name] == atlas_to_swire_predictor[name]
            n_total += 1
        # Nothing comparable in this quadrant: report NaN rather than crash.
        if n_total:
            accuracy = n_correct / n_total
        else:
            accuracy = float('nan')
        print(cid.labeller, cid.classifier, cid.quadrant,
              '{:<20}'.format(cid.dataset_name),
              n_correct, n_total, n_skipped,
              '{:.02%}'.format(accuracy), sep='\t')
def print_table(field='cdfs', output_dir='/Users/alger/data/Crowdastro/'):
    """Write a per-SWIRE-object table of predicted probabilities.

    Predictions for the LR, CNN and RF classifiers (Norris and RGZ
    labellers) are read with ``pipeline.unserialise_predictions`` and
    matched to the SWIRE objects of the corresponding test set. The result
    has one row per SWIRE name with its coordinates, the two label columns
    rendered as 'yes'/'no', and one probability column per predictor. It is
    written twice into ``output_dir``: as CSV, and as LaTeX with the
    probability columns formatted to four decimal places.

    Predictions made by the RGZ labeller on a data set whose name contains
    'Norris' are ignored.

    Args:
        field: Field name forwarded to the ``pipeline`` loaders and used in
            the prediction and output file names. For 'cdfs' the 'RGZ' test
            set of each prediction's quadrant is used; for any other value
            test set ``[:, 0, 0]`` is used.
        output_dir: Directory the two tables are written to. Defaults to the
            path that was previously hard-coded.
    """
    import os

    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    def load_predictions(classifier_name):
        """Chain the Norris- and RGZ-labelled predictions of a classifier."""
        return itertools.chain(
            pipeline.unserialise_predictions(
                pipeline.WORKING_DIR
                + '{}_norris_{}_predictions'.format(classifier_name, field)),
            pipeline.unserialise_predictions(
                pipeline.WORKING_DIR
                + '{}_rgz_{}_predictions'.format(classifier_name, field)))

    lr_predictions = load_predictions('LogisticRegression')
    rf_predictions = load_predictions('RandomForestClassifier')
    cnn_predictions = load_predictions('CNN')

    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False, field=field)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False, field=field)
    swire_names = numpy.array(swire_names)
    swire_coords = numpy.array(swire_coords)

    predictions_map = collections.defaultdict(dict)  # SWIRE -> predictor -> probability
    swire_coords_map = {}
    swire_expert_map = {}
    swire_rgz_map = {}
    known_predictors = set()
    for classifier, predictions_ in [['LR', lr_predictions],
                                     ['CNN', cnn_predictions],
                                     ['RF', rf_predictions]]:
        for predictions in predictions_:
            dataset_name = predictions.dataset_name
            labeller = predictions.labeller
            if labeller == 'rgz' and 'Norris' in dataset_name:
                # The "RGZ N" combination is not included in the table.
                continue
            labeller = labeller.title() if labeller == 'norris' else labeller.upper()
            predictor_name = '{}({} / {})'.format(
                classifier, labeller, titlemap[dataset_name])

            # Select the SWIRE objects these predictions were made for.
            if field == 'cdfs':
                test_mask = swire_test_sets[:, pipeline.SET_NAMES['RGZ'],
                                            predictions.quadrant]
            else:
                test_mask = swire_test_sets[:, 0, 0]
            swire_names_ = swire_names[test_mask]
            swire_coords_ = swire_coords[test_mask]
            swire_labels_ = swire_labels[test_mask]
            assert predictions.probabilities.shape[0] == len(swire_names_), \
                'expected {}, got {}'.format(
                    predictions.probabilities.shape[0], len(swire_names_))

            for name, coords, prediction, label in zip(
                    swire_names_, swire_coords_, predictions.probabilities,
                    swire_labels_):
                predictions_map[name][predictor_name] = prediction
                swire_coords_map[name] = coords
                swire_expert_map[name] = label[0]
                swire_rgz_map[name] = label[1]
            known_predictors.add(predictor_name)

    known_predictors = sorted(known_predictors)
    swires = sorted(predictions_map)
    ras = []
    decs = []
    is_expert_host = []
    is_rgz_host = []
    predictor_columns = collections.defaultdict(list)
    for swire in swires:
        for predictor in known_predictors:
            # Predictors that never saw this object get an empty cell.
            predictor_columns[predictor].append(
                predictions_map[swire].get(predictor, ''))
        ras.append(swire_coords_map[swire][0])
        decs.append(swire_coords_map[swire][1])
        is_expert_host.append(['no', 'yes'][swire_expert_map[swire]])
        is_rgz_host.append(['no', 'yes'][swire_rgz_map[swire]])

    table = astropy.table.Table(
        data=[swires, ras, decs, is_expert_host, is_rgz_host]
        + [predictor_columns[p] for p in known_predictors],
        names=['SWIRE', 'RA', 'Dec', 'Expert host', 'RGZ host']
        + known_predictors)
    table.write(
        os.path.join(
            output_dir,
            'predicted_swire_table_{}_21_03_18.csv'.format(field)),
        format='csv')
    # The column format is only set after the CSV has been written.
    for p in known_predictors:
        table[p].format = '{:.4f}'
    table.write(
        os.path.join(
            output_dir,
            'predicted_swire_table_{}_21_03_18.tex'.format(field)),
        format='latex')
def main(examples=None, classifier='CNN', labeller='Norris'):
    """Plot infrared cutouts with radio contours for selected examples.

    For each example a figure is drawn showing the infrared image, radio
    contours, the candidate hosts returned by ``get_predictions`` (marker
    area scaled by the square root of the probability), the highest
    probability candidate (blue cross) and the Norris cross-identification
    (green plus). The figure is saved as PNG and PDF.

    Examples are skipped, with a message where the original printed one,
    when the radio FITS file is missing, when there are no predictions, or
    when no Norris cross-identification is known.

    Args:
        examples: Mapping indexed by ``(labeller, classifier, 'All')`` whose
            values are iterables of ``(ATLAS name, Zooniverse ID,
            component ID)`` tuples. When ``None`` it is obtained from
            ``examples_incorrect.get_examples()``.
            NOTE(review): the source formatting was lost; the indexing by
            ``(labeller, classifier, 'All')`` is assumed to apply to a
            caller-supplied mapping as well -- confirm.
        classifier: Classifier name used for the lookup, the predictor name
            and the output file name.
        labeller: Labeller name used the same way.
    """
    # Load SWIRE stuff.
    swire_names, swire_coords, swire_features = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, overwrite=False)
    swire_tree = KDTree(swire_coords)
    swire_name_to_index = {n: i for i, n in enumerate(swire_names)}

    # Load ATLAS coords.
    table = astropy.io.ascii.read(pipeline.TABLE_PATH)
    atlas_to_coords = {}
    atlas_to_swire_coords = {}
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue
        atlas_to_coords[name] = (row['Component RA (Franzen)'],
                                 row['Component DEC (Franzen)'])
        index = swire_name_to_index.get(row['Source SWIRE (Norris)'] or '')
        # Compare with None explicitly: index 0 is a valid SWIRE object and
        # a plain truthiness test would silently drop it.
        if index is not None:
            atlas_to_swire_coords[name] = swire_coords[index]

    ir_stretch = astropy.visualization.LogStretch(0.001)

    if examples is None:
        examples = examples_incorrect.get_examples()
    examples = examples[labeller, classifier, 'All']

    predictor_name = '{}_{}'.format(classifier, labeller)
    image_dir = '/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/images/examples/'

    for example in examples:
        print('Plotting {}'.format(example))
        cid = example[2]

        # Load FITS stuff.
        try:
            radio_fits = astropy.io.fits.open(CDFS_PATH + cid + '_radio.fits')
        except FileNotFoundError:
            if example[1]:  # Has Zooniverse ID
                print('{} not in RGZ'.format(cid))
            # Without a radio image there is nothing to plot; always skip so
            # an unbound or stale image is never used.
            continue

        # Both FITS files are closed when the block exits, including on the
        # early ``continue`` paths below.
        with radio_fits, astropy.io.fits.open(CDFS_PATH + cid + '_ir.fits') as ir_fits:
            radio_data = radio_fits[0].data
            wcs = astropy.wcs.WCS(radio_fits[0].header)

            # Compute info for contour levels. (also from Enno Middelberg)
            median = numpy.median(radio_data)
            mad = numpy.median(numpy.abs(radio_data - median))
            sigma = mad / mad2sigma

            # Set up the plot.
            fig = plt.figure()
            try:
                ax = astropy.visualization.wcsaxes.WCSAxes(
                    fig, [0.1, 0.1, 0.8, 0.8], wcs=wcs)
                fig.add_axes(ax)
                ax.set_title('{}'.format(example[0]))

                # Show the infrared.
                ax.imshow(ir_stretch(ir_fits[0].data), cmap='cubehelix_r',
                          origin='lower')

                # Show the radio.
                ax.contour(radio_data, colors='black',
                           levels=[nsig * sigma * sigmult ** i
                                   for i in range(15)],
                           linewidths=1, origin='lower', zorder=1)

                # Plot predictions.
                predictions = get_predictions(
                    swire_tree, swire_coords, swire_test_sets,
                    atlas_to_coords[example[0]], predictor_name)
                if not predictions:
                    print('No predictions for {}'.format(example[0]))
                    continue
                coords = [p[0] for p in predictions]
                probabilities = [p[1] for p in predictions]
                coords = wcs.all_world2pix(coords, 1)
                ax.scatter(coords[:, 0], coords[:, 1],
                           s=numpy.sqrt(numpy.array(probabilities)) * 200,
                           color='white', edgecolor='black', linewidth=1,
                           alpha=0.9, marker='o', zorder=2)
                choice = numpy.argmax(probabilities)
                ax.scatter(coords[choice, 0], coords[choice, 1],
                           s=200 / numpy.sqrt(2), color='blue', marker='x',
                           zorder=2.5)

                try:
                    norris_world = atlas_to_swire_coords[example[0]]
                except KeyError:
                    print('No Norris cross-identification for {}'.format(
                        example[0]))
                    continue
                norris_coords, = wcs.all_world2pix([norris_world], 1)
                ax.scatter(norris_coords[0], norris_coords[1], marker='+',
                           s=200, zorder=3, color='green')

                lon, lat = ax.coords
                lon.set_major_formatter('hh:mm:ss')
                lon.set_axislabel('Right Ascension')
                lat.set_axislabel('Declination')

                fn = '{}_{}_{}'.format(classifier, labeller, example[0])
                plt.savefig(image_dir + fn + '.png', bbox_inches='tight',
                            pad_inches=0)
                plt.savefig(image_dir + fn + '.pdf', bbox_inches='tight',
                            pad_inches=0)
            finally:
                # One figure is created per example; close it so figures do
                # not accumulate across the loop.
                plt.close(fig)
def plot_predictions(cut=0.95, labeller='norris', dataset_name=None, classifier=None):
    """Plot colour-colour diagram for predicted host galaxies.

    labeller in {'norris', 'rgz'}
    dataset_name in {'RGZ & Norris', ...}

    Without ``dataset_name`` the SWIRE objects labelled as hosts by
    ``labeller`` (column 0 of the SWIRE labels for 'norris', column 1 for
    'rgz') are plotted in red. With ``dataset_name`` the objects whose
    predicted probability exceeds ``cut`` are plotted, coloured by that
    probability, with a colour bar. In both cases only the 'RGZ' test set of
    each of the four quadrants is considered, and objects whose 5.8 or 8.0
    flux column equals -99 are dropped.

    Args:
        cut: Probability threshold; only used when ``dataset_name`` is given.
        labeller: 'norris' or 'rgz'.
        dataset_name: Name of the prediction data set to plot, or ``None``
            to plot the labels themselves. Only supported for 'norris'.
        classifier: Classifier name used to locate the predictions file;
            only needed when ``dataset_name`` is given.

    Raises:
        ValueError: For an unknown labeller, or for ``dataset_name``
            combined with a labeller other than 'norris'. Previously these
            combinations failed later with ``NameError``.
    """
    if labeller not in ('norris', 'rgz'):
        raise ValueError(
            "labeller must be 'norris' or 'rgz', got {!r}".format(labeller))
    if dataset_name and labeller != 'norris':
        raise ValueError(
            "plotting predictions for a data set is only supported for the "
            "'norris' labeller, got {!r}".format(labeller))

    with h5py.File(CROWDASTRO_PATH, 'r') as f:
        swire_numeric_cdfs = f['/swire/cdfs/numeric'][:, 2:2 + 4]
    f_36 = swire_numeric_cdfs[:, 0]
    f_45 = swire_numeric_cdfs[:, 1]
    f_58 = swire_numeric_cdfs[:, 2]
    f_80 = swire_numeric_cdfs[:, 3]

    # The predictions file is only consulted when a data set was requested,
    # so plotting the labels alone does not need a classifier.
    predictions = {}
    if dataset_name:
        p = pipeline.unserialise_predictions(
            pipeline.WORKING_DIR
            + '{}_{}_cdfs_predictions'.format(classifier, labeller))
        for i in p:
            predictions[i.dataset_name, i.quadrant] = i

    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field='cdfs')
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False, field='cdfs')
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False, field='cdfs')

    xs = []
    ys = []
    colours = []
    for q in range(4):
        swire_set = swire_test_sets[:, pipeline.SET_NAMES['RGZ'], q]
        if dataset_name:
            quadrant_probabilities = predictions[dataset_name, q].probabilities
            predictions_set = quadrant_probabilities > cut
            f_36_ = f_36[swire_set][predictions_set]
            f_45_ = f_45[swire_set][predictions_set]
            f_58_ = f_58[swire_set][predictions_set]
            f_80_ = f_80[swire_set][predictions_set]
            probabilities = quadrant_probabilities[predictions_set]
        else:
            label_column = 0 if labeller == 'norris' else 1
            labelled = swire_set & swire_labels[:, label_column]
            f_36_ = f_36[labelled]
            f_45_ = f_45[labelled]
            f_58_ = f_58[labelled]
            f_80_ = f_80[labelled]
            probabilities = None

        # Keep only objects detected in both of the longer bands.
        detection_all_ = (f_58_ != -99) & (f_80_ != -99)
        ratio_58_36 = numpy.log10(f_58_[detection_all_] / f_36_[detection_all_])
        ratio_80_45 = numpy.log10(f_80_[detection_all_] / f_45_[detection_all_])
        xs.extend(ratio_58_36)
        ys.extend(ratio_80_45)
        if probabilities is not None:
            colours.extend(probabilities[detection_all_])

    assert len(xs) == len(ys)
    if dataset_name:
        assert len(xs) == len(colours)

    plot_basic()
    if dataset_name:
        plt.scatter(xs, ys, s=20, marker='^', linewidth=0, alpha=0.5,
                    c=numpy.array(colours), cmap='winter')
    else:
        plt.scatter(xs, ys, s=25, c='r', marker='^', linewidth=0)
    plt.xlim((-0.75, 1.0))
    plt.ylim((-0.75, 1.0))
    plt.xlabel('$\\log_{10}(S_{5.8}/S_{3.6})$')
    plt.ylabel('$\\log_{10}(S_{8.0}/S_{4.5})$')
    plt.subplots_adjust(left=0.2, bottom=0.15, right=0.95, top=0.95)
    if dataset_name:
        # A colour bar is only meaningful when points carry probabilities.
        plt.colorbar()
    plt.show()
def plot_grid(field='cdfs'):
    """Plot balanced accuracies per classifier and write them as a table.

    Balanced accuracies of the LR, CNN and RF classifiers are read from the
    serialised predictions for ``field``. For the 'cdfs' field the balanced
    accuracy of the RGZ labels against the Norris labels is added as a
    fourth "Labels" entry (the Norris labels score 1 by definition).

    A two-panel figure ('Resolved' on top, 'All' below) is saved to
    ``../images/{field}_ba_grid.pdf`` and ``.png``; compact sets are not
    plotted. A LaTeX table with mean and standard deviation per labeller,
    classifier and set is written to ``../{field}_accuracy_table.tex``.

    The module-level names ``titlemap``, ``fullmap``,
    ``norris_labelled_sets`` and ``balanced_accuracy`` are used and must be
    defined elsewhere in the file.

    Args:
        field: Field name used in the prediction file names, forwarded to
            the ``pipeline`` loaders, and used in the output file names.
    """

    def collect_accuracies(classifier_name):
        """Return (norris, rgz) maps of set name -> four quadrant accuracies."""
        norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
        rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
        all_predictions = itertools.chain(
            pipeline.unserialise_predictions(
                pipeline.WORKING_DIR
                + '{}_norris_{}_predictions'.format(classifier_name, field)),
            pipeline.unserialise_predictions(
                pipeline.WORKING_DIR
                + '{}_rgz_{}_predictions'.format(classifier_name, field)))
        for predictions in all_predictions:
            # Dispatch on the labeller recorded in the prediction itself.
            if predictions.labeller == 'norris':
                target = norris_accuracies
            else:
                target = rgz_accuracies
            target[predictions.dataset_name][predictions.quadrant] = (
                predictions.balanced_accuracy)
        return norris_accuracies, rgz_accuracies

    # Load predictions and convert to the format we need.
    # e.g. {'RGZ' -> [acc, acc, acc, acc]}
    lr_norris_accuracies, lr_rgz_accuracies = collect_accuracies(
        'LogisticRegression')
    rf_norris_accuracies, rf_rgz_accuracies = collect_accuracies(
        'RandomForestClassifier')
    cnn_norris_accuracies, cnn_rgz_accuracies = collect_accuracies('CNN')

    if field == 'cdfs':
        # Load RGZ cross-identifications and compute a balanced accuracy with them.
        swire_names, swire_coords, _ = pipeline.generate_swire_features(
            overwrite=False, field=field)
        swire_labels = pipeline.generate_swire_labels(
            swire_names, swire_coords, overwrite=False, field=field)
        _, (_, swire_test_sets) = pipeline.generate_data_sets(
            swire_coords, swire_labels, overwrite=False, field=field)
        label_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
        label_norris_accuracies = {sstr: [1] * 4 for sstr in pipeline.SET_NAMES}  # By definition.
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                test_set = swire_test_sets[:, pipeline.SET_NAMES[dataset_name], quadrant]
                predictions = swire_labels[test_set, 1]
                trues = swire_labels[test_set, 0]
                label_rgz_accuracies[dataset_name][quadrant] = balanced_accuracy(
                    trues, predictions)

    colours = ['grey', 'magenta', 'blue', 'orange']
    markers = ['o', '^', 'x', 's']
    handles = {}
    plt.figure(figsize=(5, 5))

    accuracy_map = defaultdict(lambda: defaultdict(dict))  # For table output.
    output_sets = [
        ('LR', [lr_norris_accuracies, lr_rgz_accuracies]),
        ('CNN', [cnn_norris_accuracies, cnn_rgz_accuracies]),
        ('RF', [rf_norris_accuracies, rf_rgz_accuracies]),
    ]
    if field == 'cdfs':
        output_sets.append(
            ('Labels', [label_norris_accuracies, label_rgz_accuracies]))

    for j, (classifier_name, classifier_set) in enumerate(output_sets):
        for i, set_name in enumerate(norris_labelled_sets):
            if 'compact' not in set_name:  # Skip compact.
                ax = plt.subplot(
                    2, 1,
                    {'RGZ & Norris & resolved': 1, 'RGZ & Norris': 2}[set_name])
                ax.set_ylim((80, 100))
                ax.set_xlim((-0.5, 1.5))
                ax.set_xticks([0, 1])
                ax.set_xticklabels(['Norris', 'RGZ'], rotation='horizontal')
                if i == 2:
                    plt.xlabel('Labels')
                plt.ylabel('{}\nBalanced accuracy\n(per cent)'.format(
                    titlemap[set_name]))

                ax.title.set_fontsize(16)
                ax.xaxis.label.set_fontsize(12)
                ax.yaxis.label.set_fontsize(9)
                for tick in ax.get_xticklabels() + ax.get_yticklabels():
                    tick.set_fontsize(10)

                ax.grid(which='major', axis='y', color='#EEEEEE')
                for k in range(4):
                    if j != 3:  # !Labels
                        ax.scatter([0 + (j - 1) / 5],
                                   classifier_set[0][set_name][k] * 100,
                                   color=colours[j], marker=markers[j],
                                   linewidth=1, edgecolor='k')
                    # Four entries share the RGZ column for CDFS, three otherwise.
                    rgz_offset = ((j - 1.5) / 6) if field == 'cdfs' else (j - 1) / 5
                    handles[j] = ax.scatter(
                        [1 + rgz_offset],
                        classifier_set[1][fullmap[set_name]][k] * 100,
                        color=colours[j], marker=markers[j],
                        linewidth=1, edgecolor='k')

            # Compute for table. This also runs for the compact sets, which
            # are tabulated but not plotted. The "RGZ N" combination is
            # excluded.
            for labeller in ['Norris', 'RGZ']:
                if labeller == 'Norris':
                    accuracies = classifier_set[0][set_name]
                else:
                    accuracies = classifier_set[1][fullmap[set_name]]
                mean = numpy.mean(accuracies) * 100
                stdev = numpy.std(accuracies) * 100
                accuracy_map[labeller][classifier_name][titlemap[set_name]] = (
                    '${:.02f} \\pm {:.02f}$'.format(mean, stdev))

    # Assemble table.
    col_labeller = []
    col_classifier = []
    col_compact = []
    col_resolved = []
    col_all = []
    # The parentheses matter: without them the conditional expression applies
    # to the whole concatenation and every non-CDFS field yields an empty
    # table.
    table_classifiers = ['CNN', 'LR', 'RF'] + (
        ['Labels'] if field == 'cdfs' else [])
    for labeller in ['Norris', 'RGZ']:
        for classifier in table_classifiers:
            col_labeller.append(labeller)
            col_classifier.append(classifier)
            col_compact.append(accuracy_map[labeller][classifier]['Compact'])
            col_resolved.append(accuracy_map[labeller][classifier]['Resolved'])
            col_all.append(accuracy_map[labeller][classifier]['All'])
    out_table = astropy.table.Table(
        [col_labeller, col_classifier, col_compact, col_resolved, col_all],
        names=['Labeller', 'Classifier',
               "Mean `Compact' accuracy\\\\(per cent)",
               "Mean `Resolved' accuracy\\\\(per cent)",
               "Mean `All' accuracy\\\\(per cent)"])
    out_table.write('../{}_accuracy_table.tex'.format(field), format='latex')

    plt.figlegend([handles[j] for j in sorted(handles)],
                  ['LR', 'CNN', 'RF'] + (['Labels'] if field == 'cdfs' else []),
                  'lower center', ncol=4, fontsize=10)
    plt.subplots_adjust(bottom=0.2, hspace=0.25)
    plt.savefig('../images/{}_ba_grid.pdf'.format(field),
                bbox_inches='tight', pad_inches=0)
    plt.savefig('../images/{}_ba_grid.png'.format(field),
                bbox_inches='tight', pad_inches=0)
def get_examples():
    """Find ATLAS objects that a predictor gets wrong although labellers agree.

    Cross-identifications from ``pipeline.cross_identify_all`` are gathered
    per ATLAS object in the 'RGZ & Norris' test sets, next to the Norris
    SWIRE name from ``pipeline.TABLE_PATH`` and the RGZ SWIRE designation
    from ``pipeline.RGZ_PATH``. An object is reported for a predictor when
    the Norris and RGZ names are equal but the predictor's name differs.
    Only predictors on the "All" data set are considered, and the 'Random'
    and 'Groundtruth' classifiers are left out.

    Returns:
        A ``collections.defaultdict`` mapping ``(labeller, classifier,
        'All')`` to a set of ``(ATLAS name, Zooniverse ID, component ID)``
        tuples.
    """
    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names, overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, overwrite=False)
    cids = list(pipeline.cross_identify_all(
        swire_names, swire_coords, swire_test_sets, swire_labels[:, 0]))

    # Per-component lookups from the ATLAS table.
    atlas_to_swire_norris = {}
    key_to_atlas = {}
    atlas_to_ras = {}
    atlas_to_decs = {}
    id_to_atlas = {}
    atlas_to_zid = {}
    atlas_to_id = {}
    for entry in astropy.io.ascii.read(pipeline.TABLE_PATH):
        name = entry['Component Name (Franzen)']
        if name:
            component_id = entry['Component ID (Franzen)']
            id_to_atlas[component_id] = name
            key_to_atlas[entry['Key']] = name
            atlas_to_swire_norris[name] = entry['Source SWIRE (Norris)']
            atlas_to_id[name] = entry['Component ID (Franzen)']
            atlas_to_ras[name] = entry['Component RA (Franzen)']
            atlas_to_decs[name] = entry['Component DEC (Franzen)']
            atlas_to_zid[name] = entry['Component Zooniverse ID (RGZ)']

    # Per-component lookups from the RGZ catalogue.
    atlas_to_rgz = {}
    atlas_to_radio_consensus = {}
    atlas_to_ir_consensus = {}
    for entry in astropy.io.ascii.read(pipeline.RGZ_PATH):
        name = id_to_atlas[entry['atlas_id']]
        atlas_to_radio_consensus[name] = entry['consensus.radio_level']
        atlas_to_ir_consensus[name] = entry['consensus.ir_level']
        atlas_to_rgz[name] = entry['SWIRE.designation']

    # ATLAS -> labeller -> SWIRE
    cross_identifications = collections.defaultdict(dict)
    rgz_norris = pipeline.SET_NAMES['RGZ & Norris']
    for cid in cids:
        on_norris_set = 'Norris' in cid.dataset_name
        if cid.labeller == 'norris' and not on_norris_set:
            continue

        if cid.labeller != 'rgz':
            labeller = 'Norris'
        elif on_norris_set:
            labeller = 'RGZ N'
        else:
            labeller = 'RGZ'

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        predictor_key = (labeller, cid.classifier, titlemap[cid.dataset_name])

        # For each ATLAS object in RGZ & Norris...
        for i in atlas_test_sets[:, rgz_norris, cid.quadrant].nonzero()[0]:
            name = key_to_atlas[i]
            if (name in atlas_to_swire_norris
                    and name in atlas_to_swire_predictor):
                per_atlas = cross_identifications[name]
                per_atlas['Norris'] = atlas_to_swire_norris[name]
                per_atlas['RGZ'] = atlas_to_rgz.get(name)
                per_atlas[predictor_key] = atlas_to_swire_predictor[name]

    # For each classifier, pull out examples where RGZ and Norris agree but
    # the classifier disagrees. Only the RGZ & Norris data set ("All") is
    # included.
    classifier_to_example = collections.defaultdict(set)
    excluded_classifiers = {'Random', 'Groundtruth'}
    for atlas, cids_ in cross_identifications.items():
        norris_swire = cids_['Norris']
        if norris_swire != cids_['RGZ']:
            continue
        for predictor, swire in cids_.items():
            # The plain 'Norris' and 'RGZ' entries also fall through this
            # test, since their third character is never 'All'.
            if predictor[2] != 'All':
                continue
            if predictor[1] in excluded_classifiers:
                continue
            if swire != norris_swire:
                classifier_to_example[predictor].add(
                    (atlas, atlas_to_zid[atlas], atlas_to_id[atlas]))
    return classifier_to_example
def plot(field='cdfs'): log.debug('Getting SWIRE, ATLAS features.') swire_names, swire_coords, _ = pipeline.generate_swire_features( overwrite=False, field=field) swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False, field=field) (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets( swire_coords, swire_labels, overwrite=False, field=field) log.debug('Calling cross-identify.') cids = list( pipeline.cross_identify_all(swire_names, swire_coords, swire_labels, swire_test_sets, swire_labels[:, 0], field=field)) # Also load the nearest-neighbour cross-identifications. cids += [ pipeline.CrossIdentifications.from_hdf5( pipeline.WORKING_DIR + 'NearestNeighbour_{}_cross_ids_{}_RGZ & Norris.h5'.format( field, q)) for q in range(4 if field == 'cdfs' else 1) ] swire_tree = scipy.spatial.KDTree(swire_coords[swire_test_sets[:, 0, 0]]) failed_coords = [] if field == 'cdfs': table = astropy.io.ascii.read(pipeline.TABLE_PATH) rgzcat = astropy.io.ascii.read(pipeline.RGZ_PATH) atlas_to_swire_expert = {} atlas_to_swire_rgz = {} key_to_atlas = {} atlas_id_to_name = {} is_compact = {} for row in table: name = row['Component Name (Franzen)'] key_to_atlas[row['Key']] = name swire = row['Source SWIRE (Norris)'] if not swire or not swire.startswith('SWIRE') or not name: continue atlas_id_to_name[row['Component ID (Franzen)']] = name atlas_to_swire_expert[name] = swire is_compact[name] = pipeline.compact_test(row) for row in rgzcat: swire_name = row['SWIRE.designation'] if not swire_name or swire_name == '-99': continue name = atlas_id_to_name.get(row['atlas_id'], None) atlas_to_swire_rgz[name] = swire_name else: atlas_to_swire_expert = {} with astropy.io.fits.open( pipeline.MIDDELBERG_TABLE4_PATH) as elais_components_fits: elais_components = elais_components_fits[1].data atlas_cid_to_name = {} atlas_names = [] # Indices correspond to table 4 rows. 
atlas_name_to_compact = {} for component in elais_components: cid = component['CID'] name = component['ATELAIS'] atlas_names.append(name) atlas_cid_to_name[cid] = name row = { 'Component S (Franzen)': component['Sint'], # Fitting in with the CDFS API... 'Component S_ERR (Franzen)': component['e_Sint'], 'Component Sp (Franzen)': component['Sp'], 'Component Sp_ERR (Franzen)': component['e_Sp'] } atlas_name_to_compact[name] = pipeline.compact_test(row) with open(pipeline.MIDDELBERG_TABLE5_PATH) as elais_file: # Took this code from pipeline.py, probably should make it a function lines = [line.split('|') for line in elais_file] for line in lines: if 'ATELAISJ' not in line[0]: continue line_cids = line[1] if 'C0' not in line_cids and 'C1' not in line_cids: continue line_cids = [cid.strip() for cid in line_cids.split(',')] swire_coord_re = re.search( r'SWIRE4J(\d\d)(\d\d)(\d\d\.\d\d)(-\d\d)(\d\d)(\d\d\.\d)', line[2]) if not swire_coord_re: continue swire_coord_list = swire_coord_re.groups() coord = astropy.coordinates.SkyCoord( ra='{} {} {}'.format(*swire_coord_list[:3]), dec='{} {} {}'.format(*swire_coord_list[3:]), unit=('hourangle', 'deg')) coord = (coord.ra.deg, coord.dec.deg) # Nearest SWIRE... dist, nearest = swire_tree.query(coord) if dist > 5 / 60 / 60: logging.debug( 'No SWIRE match found for Middelberg cross-identification {}' .format(line[0])) logging.debug('Nearest is {} ({:.01f} arcsec)'.format( numpy.array(swire_names)[swire_test_sets[:, 0, 0]][nearest], dist * 60 * 60)) logging.debug('Middelberg: {}'.format( swire_coord_re.group())) failed_coords.append(coord) continue name = numpy.array(swire_names)[swire_test_sets[:, 0, 0]][nearest] for cid in line_cids: atlas_to_swire_expert[atlas_cid_to_name[cid]] = name labeller_classifier_to_accuracies = collections.defaultdict(list) # Augment the CIDs by duplicating the "resolved" cross-ids to make the "all" set. 
resolved_cids_copy = [ copy.copy(cid) for cid in cids if 'resolved' in cid.dataset_name ] for cid in resolved_cids_copy: cid.dataset_name = cid.dataset_name.replace(' & resolved', '') cids.extend(resolved_cids_copy) for cid in cids: if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name: continue if cid.classifier in {'Groundtruth', 'Random', 'NearestNeighbour'}: # Deal with these later as they are special. continue atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names)) n_total = 0 n_correct = 0 n_skipped = 0 n_compact = 0 if field == 'cdfs': atlas_keys = atlas_test_sets[:, pipeline. SET_NAMES[whatset[cid.dataset_name]], cid.quadrant].nonzero()[0] # For each ATLAS object in RGZ & Norris... for i in atlas_keys: name = key_to_atlas[i] if name not in atlas_to_swire_expert: n_skipped += 1 continue if name not in atlas_to_swire_predictor: n_skipped += 1 continue swire_norris = atlas_to_swire_expert[name] swire_predictor = atlas_to_swire_predictor[name] n_correct += swire_norris == swire_predictor n_total += 1 else: # Only one test set for ELAIS. atlas_indices = atlas_test_sets[:, 0, 0].nonzero()[0] assert atlas_test_sets.shape[0] == len(atlas_names) for index in atlas_indices: # Screen resolved here. 
atlas_name = atlas_names[index] if atlas_name not in atlas_to_swire_expert: n_skipped += 1 continue if atlas_name not in atlas_to_swire_predictor: n_skipped += 1 continue if 'resolved' in cid.dataset_name and atlas_name_to_compact[ atlas_name]: n_compact += 1 continue swire_middelberg = atlas_to_swire_expert[atlas_name] swire_predictor = atlas_to_swire_predictor[atlas_name] n_correct += swire_middelberg == swire_predictor n_total += 1 # print('Compact: {:.02%}'.format(n_compact / (n_total + n_compact))) if 'Norris' in cid.dataset_name and cid.labeller == 'rgz': labeller = 'RGZ N' elif cid.labeller == 'rgz': labeller = 'RGZ' else: labeller = 'Norris' labeller_classifier_to_accuracies[labeller, cid.classifier, titlemap[cid.dataset_name]].append( n_correct / n_total) # Groundtruth, random, and NN classifiers exist only for the RGZ & Norris set, but we want to test on all subsets. # This section duplicates the classifiers and evaluates them on all subsets. for cid in cids: if cid.classifier not in {'Groundtruth', 'Random', 'NearestNeighbour'}: continue for dataset_name in [ 'RGZ & Norris', 'RGZ & Norris & resolved', 'RGZ & Norris & compact' ]: atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names)) n_total = 0 n_correct = 0 n_skipped = 0 if field == 'cdfs': # For each ATLAS object in RGZ & Norris... 
atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES[dataset_name], cid.quadrant].nonzero()[0] for i in atlas_keys: name = key_to_atlas[i] if name not in atlas_to_swire_expert: n_skipped += 1 continue if name not in atlas_to_swire_predictor: n_skipped += 1 continue swire_norris = atlas_to_swire_expert[name] swire_predictor = atlas_to_swire_predictor[name] n_correct += swire_norris == swire_predictor if cid.classifier == 'NearestNeighbour' and swire_norris != swire_predictor: pass n_total += 1 else: atlas_indices = atlas_test_sets[:, 0, 0].nonzero()[0] assert atlas_test_sets.shape[0] == len(atlas_names) for index in atlas_indices: # Screen resolved here (because the test sets aren't useful for that for ELAIS) atlas_name = atlas_names[index] if 'resolved' in dataset_name and atlas_name_to_compact[ atlas_name]: continue if atlas_name not in atlas_to_swire_expert: n_skipped += 1 continue if atlas_name not in atlas_to_swire_predictor: n_skipped += 1 continue swire_middelberg = atlas_to_swire_expert[atlas_name] swire_predictor = atlas_to_swire_predictor[atlas_name] n_correct += swire_middelberg == swire_predictor n_total += 1 if 'Norris' in cid.dataset_name and cid.labeller == 'rgz': labeller = 'RGZ N' elif cid.labeller == 'rgz': labeller = 'RGZ' else: labeller = 'Norris' print(labeller, cid.classifier, titlemap[dataset_name], n_correct, n_total, n_correct / n_total) labeller_classifier_to_accuracies[labeller, cid.classifier, titlemap[dataset_name]].append( n_correct / n_total) if field == 'cdfs': # Compute accuracy for RGZ. for dataset_name in pipeline.SET_NAMES: for quadrant in range(4): # N.B. Disabled using the pipeline for RGZ. # # Compact objects are cross-identified in a separate pipeline, which is slow so I don't want to reproduce it here. # # So I'll read the compact object cross-identifications from the LR(RGZ) cross-identification set, since it ought # # to be the same. 
# corresponding_set, = [cid for cid in cids if cid.quadrant == quadrant # and cid.dataset_name == dataset_name # and cid.labeller == 'rgz' # and cid.classifier == 'LogisticRegression'] # atlas_to_swire_lr = dict(zip(corresponding_set.radio_names, corresponding_set.ir_names)) n_total = 0 n_correct = 0 n_skipped = 0 n_compact = 0 atlas_keys = atlas_test_sets[:, pipeline. SET_NAMES[whatset[dataset_name]], quadrant].nonzero()[0] # For each ATLAS object in RGZ & Norris... for i in atlas_keys: name = key_to_atlas[i] if name not in atlas_to_swire_expert: n_skipped += 1 continue if name not in atlas_to_swire_rgz: # or name not in atlas_to_swire_lr: n_skipped += 1 continue if False and is_compact[name]: swire_predictor = atlas_to_swire_lr[name] else: swire_predictor = atlas_to_swire_rgz[name] swire_norris = atlas_to_swire_expert[name] n_correct += swire_norris == swire_predictor n_total += 1 labeller_classifier_to_accuracies[ 'RGZ', 'Label', titlemap[dataset_name]].append(n_correct / n_total) labeller_classifier_to_accuracy = {} labeller_classifier_to_stdev = {} for key, accuracies in labeller_classifier_to_accuracies.items(): print('Best {}:'.format(key), max(accuracies)) labeller_classifier_to_accuracy[key] = numpy.mean(accuracies) labeller_classifier_to_stdev[key] = numpy.std(accuracies) random_acc = { k[2]: v * 100 for k, v in labeller_classifier_to_accuracy.items() if k[1] == 'Random' } random_stdev = { k[2]: v * 100 for k, v in labeller_classifier_to_stdev.items() if k[1] == 'Random' } best_acc = { k[2]: v * 100 for k, v in labeller_classifier_to_accuracy.items() if k[1] == 'Groundtruth' } best_stdev = { k[2]: v * 100 for k, v in labeller_classifier_to_stdev.items() if k[1] == 'Groundtruth' } print('Best: {} +- {}'.format(best_acc, best_stdev)) print('Random: {} +- {}'.format(random_acc, random_stdev)) plt.figure() colours = ['grey', 'magenta', 'blue', 'orange', 'grey'] markers = ['o', '^', 'x', 's', '*'] handles = {} print('Data set & Labeller & Classifier & Mean 
accuracy (\\%)\\\\') for k, set_name in enumerate(norris_labelled_sets[1:]): if 'resolved' in set_name: # https://github.com/MatthewJA/radio/issues/22 continue k -= 1 print_set_name = titlemap[set_name] ax = plt.subplot(1, 1, 1 + k) # 22 print('{} & Norris & Perfect & ${:.02f} \\pm {:.02f}$\\\\'.format( print_set_name, best_acc[titlemap[set_name]], best_stdev[titlemap[set_name]])) print('{} & Norris & Random & ${:.02f} \\pm {:.02f}$\\\\'.format( print_set_name, random_acc[titlemap[set_name]], random_stdev[titlemap[set_name]])) plt.hlines(best_acc[titlemap[set_name]], -0.5, 2.5, linestyles='solid', colors='green', linewidth=1, zorder=1) plt.fill_between( [-1, 2], [best_acc[titlemap[set_name]] - best_stdev[titlemap[set_name]]] * 2, [best_acc[titlemap[set_name]] + best_stdev[titlemap[set_name]]] * 2, linestyle='dashed', color='green', alpha=0.2, linewidth=1, zorder=1) plt.hlines(random_acc[titlemap[set_name]], -0.5, 2.5, linestyles='solid', colors='blue', linewidth=1, zorder=1, alpha=0.7) plt.fill_between([-1, 2], [ random_acc[titlemap[set_name]] - random_stdev[titlemap[set_name]] ] * 2, [ random_acc[titlemap[set_name]] + random_stdev[titlemap[set_name]] ] * 2, linestyle='dashed', color='blue', alpha=0.2, linewidth=1, zorder=1) for i, labeller in enumerate(['Norris', 'RGZ']): for j, classifier in enumerate( ['LogisticRegression', 'CNN', 'RandomForestClassifier'] + (['Label', 'NearestNeighbour'] if field == 'cdfs' else ['NearestNeighbour'])): ys = numpy.array(labeller_classifier_to_accuracies[ labeller, classifier, titlemap[set_name]]) * 100 if classifier != 'NearestNeighbour': x_offset = i + ( j - 1 ) / 5 if labeller == 'Norris' or field == 'elais' else i + ( j - 1.5) / 6 else: # NN plt.axhline(numpy.mean(ys), color='grey', linestyle='-.', linewidth=1) if field == 'cdfs': plt.fill_between([-1, 2], [numpy.mean(ys) - numpy.std(ys)] * 2, [numpy.mean(ys) + numpy.std(ys)] * 2, color='grey', linestyle='-.', alpha=0.2, linewidth=1) x_offset = 2 if classifier == 'Label' and 
labeller == 'RGZ': plt.annotate('{:.1%}'.format(numpy.mean(ys) / 100), (x_offset, 72.5), ha='center', va='bottom') plt.arrow(x_offset, 72.5, 0, -1.5, head_width=0.05, head_length=1, ec='k', fc='k') xs = [x_offset] * len(ys) print('{} & {} & {} & ${:.02f} \\pm {:.02f}$\\\\'.format( print_set_name, labeller, classifier, numpy.mean(ys), numpy.std(ys))) ax.set_xlim((-0.5, 1.5)) ax.set_ylim((70, 100)) ax.set_xticks([0, 1]) ax.set_xticklabels(['Norris', 'RGZ']) ax.set_yticklabels( ['{}\%'.format(x) for x in range(70, 101, 5)]) handles[j] = plt.scatter(xs, ys, color=colours[j], marker=markers[j], zorder=2, edgecolor='k', linewidth=1) # if k == 0: # 22 # plt.xlabel('Labels') plt.ylabel('Cross-identification\naccuracy (per cent)'.format( titlemap[set_name])) # ax.title.set_fontsize(16) # ax.xaxis.label.set_fontsize(12) # ax.yaxis.label.set_fontsize(9) # for tick in ax.get_xticklabels() + ax.get_yticklabels(): # tick.set_fontsize(10) ax.grid(which='major', axis='y', color='#DDDDDD') # Print the table. 
print('\\hline') print('Labeller & Classifier & Mean `Resolved\' & Mean `All\'\\') print('&& accuracy (per cent) & accuracy (per cent)\\') print('\\hline') for labeller in ['Norris', 'RGZ']: for classifier in [ 'CNN', 'LogisticRegression', 'RandomForestClassifier', 'Groundtruth', 'Random', 'Label', 'NearestNeighbour' ]: if labeller == 'RGZ' and classifier in { 'Groundtruth', 'Random', 'NearestNeighbour' }: continue if labeller == 'Norris' and classifier == 'Label': continue print( '{} & {} & ${:.1f} \\pm {:.1f}$ & ${:.1f} \\pm {:.1f}$'.format( labeller, classifier, numpy.array( labeller_classifier_to_accuracies[labeller, classifier, 'Resolved']).mean() * 100, numpy.array( labeller_classifier_to_accuracies[labeller, classifier, 'Resolved']).std() * 100, numpy.array( labeller_classifier_to_accuracies[labeller, classifier, 'All']).mean() * 100, numpy.array( labeller_classifier_to_accuracies[labeller, classifier, 'All']).std() * 100)) plt.gca().tick_params(axis='both', which='major', direction='out', length=5) plt.gca().tick_params(axis='y', which='minor', direction='out', length=3) plt.gca().minorticks_on() plt.figlegend([handles[j] for j in sorted(handles)], ['LR', 'CNN', 'RF'] + (['Labels'] if field == 'cdfs' else []), 'lower center', ncol=4) plt.subplots_adjust(bottom=0.25, hspace=0.25, left=0.3) plt.savefig('../images/{}_cross_identification_grid.pdf'.format(field)) plt.savefig('../images/{}_cross_identification_grid.png'.format(field))
def main(classifier='CNN', labeller='Norris'):
    """Scatter ATLAS compactness against prediction ambiguity and show it.

    For every row of the table at ``pipeline.TABLE_PATH`` that has both a
    component name and a Zooniverse ID, candidate SWIRE hosts and their
    scores are fetched with ``get_predictions`` for the predictor named
    ``'<classifier>_<labeller>'``. Two ambiguity measures are derived from
    the scores:

    * entropy of the softmax of the scores, and
    * one minus the gap between the two highest raw scores.

    Both are 0 when there is a single candidate. Each measure is plotted
    against ``pipeline.compactness(row)`` in its own panel. Points are black
    where the Norris column holds no SWIRE name, blue where the top-scoring
    candidate equals the Norris host, and magenta otherwise.

    Args:
        classifier: Classifier part of the predictor name.
        labeller: Labeller part of the predictor name.
    """
    # Load SWIRE stuff.
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    # The labels are not used below; the call is kept because it was made
    # originally and may have side effects. NOTE(review): plot() and
    # print_table() also pass swire_labels to generate_data_sets -- confirm
    # this call form still matches the pipeline API.
    pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, overwrite=False)
    swire_tree = KDTree(swire_coords)

    # Loop invariants, built once instead of once per ATLAS component.
    swire_names_array = numpy.array(swire_names)
    predictor_name = '{}_{}'.format(classifier, labeller)

    # (name, compactness, (ra, dec), Norris SWIRE host) per usable component.
    components = []
    for row in astropy.io.ascii.read(pipeline.TABLE_PATH):
        name = row['Component Name (Franzen)']
        if not name or not row['Component Zooniverse ID (RGZ)']:
            continue
        components.append((
            name,
            pipeline.compactness(row),
            (row['Component RA (Franzen)'], row['Component DEC (Franzen)']),
            row['Source SWIRE (Norris)'],
        ))

    ys = []
    xs_entropy = []
    xs_margin = []
    no_groundtruth = []
    correct = []
    for name, compactness, coords, swire in components:
        predictions = get_predictions(
            swire_tree, swire_coords, swire_names_array, swire_test_sets,
            coords, predictor_name)
        if not predictions:
            print('No predictions for {}'.format(name))
            continue

        scores = [p for _, p in predictions]
        chosen_swire = predictions[numpy.argmax(scores)][0]

        # Softmax with the denominator computed once rather than per element.
        exponentials = [numpy.exp(p) for p in scores]
        normaliser = sum(exponentials)
        softmax = [e / normaliser for e in exponentials]

        if len(softmax) == 1:
            entropy_ambiguity = 0
            margin_ambiguity = 0
        else:
            entropy_ambiguity = -sum(p * numpy.log(p) for p in softmax if p)
            scores.sort()
            # The margin uses the raw scores, not the softmax values.
            margin_ambiguity = 1 - (scores[-1] - scores[-2])

        ys.append(compactness)
        xs_entropy.append(entropy_ambiguity)
        xs_margin.append(margin_ambiguity)
        no_groundtruth.append(not swire or not swire.startswith('SWIRE'))
        correct.append(swire == chosen_swire)

    ys = numpy.array(ys)
    xs_margin = numpy.array(xs_margin)
    xs_entropy = numpy.array(xs_entropy)
    no_groundtruth = numpy.array(no_groundtruth, dtype=bool)
    correct = numpy.array(correct, dtype=bool)

    print(sum(1 for y in ys if y <= 1))

    # (mask, colour, alpha) for the three point groups drawn in each panel.
    groups = (
        (no_groundtruth, 'black', 0.05),
        (~no_groundtruth & correct, 'blue', 0.7),
        (~no_groundtruth & ~correct, 'magenta', 0.7),
    )

    plt.subplot(1, 2, 1)
    for mask, colour, alpha in groups:
        plt.scatter(xs_margin[mask], ys[mask], marker='x', color=colour,
                    alpha=alpha)
    plt.title('Margin')
    plt.xlabel('1 - margin')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    # hlines takes its x-extent in data coordinates. axhline's xmin/xmax are
    # axes fractions in [0, 1], so passing data values there was wrong.
    plt.hlines(1, min(xs_margin), max(xs_margin))

    plt.subplot(1, 2, 2)
    for mask, colour, alpha in groups:
        plt.scatter(xs_entropy[mask], ys[mask], marker='x', color=colour,
                    alpha=alpha)
    plt.title('Entropy')
    plt.xlabel('Entropy')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    plt.hlines(1, min(xs_entropy), max(xs_entropy), zorder=-100,
               linestyles='--', colors='black')
    plt.show()
def _labeller_title(cid):
    """Return the display name of the label source behind a cross-id set."""
    if cid.labeller != 'rgz':
        return 'Norris'
    return 'RGZ N' if 'Norris' in cid.dataset_name else 'RGZ'


def _count_agreement(atlas_names, expert, predicted):
    """Count how often ``predicted`` matches ``expert`` over ``atlas_names``.

    Names missing from either mapping are ignored.

    Returns:
        ``(n_correct, n_total)`` over the names present in both mappings.
    """
    n_correct = 0
    n_total = 0
    for name in atlas_names:
        if name not in expert or name not in predicted:
            continue
        n_correct += expert[name] == predicted[name]
        n_total += 1
    return n_correct, n_total


def _elais_test_names(atlas_test_sets, atlas_names, atlas_name_to_compact,
                      dataset_name):
    """List the ATLAS test objects of the single non-CDFS test set.

    Compact objects are screened out here when ``dataset_name`` contains
    ``'resolved'``, because only one test set is used for this field.
    """
    assert atlas_test_sets.shape[0] == len(atlas_names)
    names = [atlas_names[index]
             for index in atlas_test_sets[:, 0, 0].nonzero()[0]]
    if 'resolved' in dataset_name:
        names = [name for name in names if not atlas_name_to_compact[name]]
    return names


def plot(field='cdfs'):
    """Plot and tabulate cross-identification accuracies for one field.

    Steps:

    1. Load the pipeline cross-identifications plus the stored
       nearest-neighbour ones.
    2. Score each set against the expert cross-identifications: the Norris
       column of the table at ``pipeline.TABLE_PATH`` when ``field`` is
       ``'cdfs'``, otherwise the Middelberg tables.
    3. Print LaTeX-style table rows to stdout.
    4. Save the accuracy figure as
       ``../images/<field>_cross_identification_grid.pdf`` and ``.png``.

    Relies on the module-level names ``log``, ``whatset``, ``titlemap`` and
    ``norris_labelled_sets``.

    Args:
        field: ``'cdfs'`` selects the CDFS code paths; any other value takes
            the Middelberg (ELAIS) code paths.
    """
    log.debug('Getting SWIRE, ATLAS features.')
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False, field=field)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False, field=field)
    log.debug('Calling cross-identify.')
    cids = list(pipeline.cross_identify_all(
        swire_names, swire_coords, swire_labels, swire_test_sets,
        swire_labels[:, 0], field=field))

    # Also load the nearest-neighbour cross-identifications.
    cids += [
        pipeline.CrossIdentifications.from_hdf5(
            pipeline.WORKING_DIR
            + 'NearestNeighbour_{}_cross_ids_{}_RGZ & Norris.h5'.format(
                field, q))
        for q in range(4 if field == 'cdfs' else 1)]

    atlas_to_swire_expert = {}
    if field == 'cdfs':
        table = astropy.io.ascii.read(pipeline.TABLE_PATH)
        rgzcat = astropy.io.ascii.read(pipeline.RGZ_PATH)

        atlas_to_swire_rgz = {}
        key_to_atlas = {}
        atlas_id_to_name = {}
        for row in table:
            name = row['Component Name (Franzen)']
            key_to_atlas[row['Key']] = name
            swire = row['Source SWIRE (Norris)']
            if not swire or not swire.startswith('SWIRE') or not name:
                continue
            atlas_id_to_name[row['Component ID (Franzen)']] = name
            atlas_to_swire_expert[name] = swire

        for row in rgzcat:
            swire_name = row['SWIRE.designation']
            if not swire_name or swire_name == '-99':
                continue
            name = atlas_id_to_name.get(row['atlas_id'])
            if name is None:
                # No expert-labelled ATLAS object for this RGZ row. It was
                # previously stored under a None key, which nothing reads.
                continue
            atlas_to_swire_rgz[name] = swire_name
    else:
        # The KD-tree and the matching name array cover only the SWIRE
        # objects in test set (0, 0); both are needed only in this branch.
        in_test_set = swire_test_sets[:, 0, 0]
        swire_tree = scipy.spatial.KDTree(swire_coords[in_test_set])
        test_swire_names = numpy.array(swire_names)[in_test_set]

        atlas_cid_to_name = {}
        atlas_names = []  # Indices correspond to table 4 rows.
        atlas_name_to_compact = {}
        with astropy.io.fits.open(
                pipeline.MIDDELBERG_TABLE4_PATH) as elais_components_fits:
            elais_components = elais_components_fits[1].data
            for component in elais_components:
                name = component['ATELAIS']
                atlas_names.append(name)
                atlas_cid_to_name[component['CID']] = name
                # Fitting in with the CDFS API...
                row = {'Component S (Franzen)': component['Sint'],
                       'Component S_ERR (Franzen)': component['e_Sint'],
                       'Component Sp (Franzen)': component['Sp'],
                       'Component Sp_ERR (Franzen)': component['e_Sp']}
                atlas_name_to_compact[name] = pipeline.compact_test(row)

        # Took this code from pipeline.py, probably should make it a function
        with open(pipeline.MIDDELBERG_TABLE5_PATH) as elais_file:
            lines = [line.split('|') for line in elais_file]

        swire_coord_pattern = re.compile(
            r'SWIRE4J(\d\d)(\d\d)(\d\d\.\d\d)(-\d\d)(\d\d)(\d\d\.\d)')
        for line in lines:
            if 'ATELAISJ' not in line[0]:
                continue
            line_cids = line[1]
            if 'C0' not in line_cids and 'C1' not in line_cids:
                continue
            line_cids = [cid.strip() for cid in line_cids.split(',')]
            swire_coord_re = swire_coord_pattern.search(line[2])
            if not swire_coord_re:
                continue
            swire_coord_list = swire_coord_re.groups()
            coord = astropy.coordinates.SkyCoord(
                ra='{} {} {}'.format(*swire_coord_list[:3]),
                dec='{} {} {}'.format(*swire_coord_list[3:]),
                unit=('hourangle', 'deg'))
            coord = (coord.ra.deg, coord.dec.deg)
            # Nearest SWIRE...
            dist, nearest = swire_tree.query(coord)
            if dist > 5 / 60 / 60:
                # Nearest neighbour is further away than 5 / 3600; skip it.
                logging.debug(
                    'No SWIRE match found for Middelberg '
                    'cross-identification {}'.format(line[0]))
                logging.debug('Nearest is {} ({:.01f} arcsec)'.format(
                    test_swire_names[nearest], dist * 60 * 60))
                logging.debug('Middelberg: {}'.format(swire_coord_re.group()))
                continue
            name = test_swire_names[nearest]
            for cid in line_cids:
                atlas_to_swire_expert[atlas_cid_to_name[cid]] = name

    # (labeller, classifier, set title) -> list of accuracies.
    labeller_classifier_to_accuracies = collections.defaultdict(list)

    # Augment the CIDs by duplicating the "resolved" cross-ids to make the
    # "all" set.
    resolved_cids_copy = [copy.copy(cid) for cid in cids
                          if 'resolved' in cid.dataset_name]
    for cid in resolved_cids_copy:
        cid.dataset_name = cid.dataset_name.replace(' & resolved', '')
    cids.extend(resolved_cids_copy)

    special_classifiers = {'Groundtruth', 'Random', 'NearestNeighbour'}

    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue
        if cid.classifier in special_classifiers:
            # Deal with these later as they are special.
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        if field == 'cdfs':
            atlas_keys = atlas_test_sets[
                :, pipeline.SET_NAMES[whatset[cid.dataset_name]],
                cid.quadrant].nonzero()[0]
            names = [key_to_atlas[i] for i in atlas_keys]
        else:
            names = _elais_test_names(
                atlas_test_sets, atlas_names, atlas_name_to_compact,
                cid.dataset_name)
        n_correct, n_total = _count_agreement(
            names, atlas_to_swire_expert, atlas_to_swire_predictor)
        labeller_classifier_to_accuracies[
            _labeller_title(cid), cid.classifier,
            titlemap[cid.dataset_name]].append(n_correct / n_total)

    # Groundtruth, random, and NN classifiers exist only for the RGZ & Norris
    # set, but we want to test on all subsets. This section duplicates the
    # classifiers and evaluates them on all subsets.
    for cid in cids:
        if cid.classifier not in special_classifiers:
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        labeller = _labeller_title(cid)
        for dataset_name in ['RGZ & Norris', 'RGZ & Norris & resolved',
                             'RGZ & Norris & compact']:
            if field == 'cdfs':
                atlas_keys = atlas_test_sets[
                    :, pipeline.SET_NAMES[dataset_name],
                    cid.quadrant].nonzero()[0]
                names = [key_to_atlas[i] for i in atlas_keys]
            else:
                names = _elais_test_names(
                    atlas_test_sets, atlas_names, atlas_name_to_compact,
                    dataset_name)
            n_correct, n_total = _count_agreement(
                names, atlas_to_swire_expert, atlas_to_swire_predictor)
            print(labeller, cid.classifier, titlemap[dataset_name],
                  n_correct, n_total, n_correct / n_total)
            labeller_classifier_to_accuracies[
                labeller, cid.classifier,
                titlemap[dataset_name]].append(n_correct / n_total)

    if field == 'cdfs':
        # Compute accuracy for RGZ.
        # N.B. Using the pipeline for RGZ is disabled: compact objects are
        # cross-identified in a separate, slow pipeline, so the RGZ catalogue
        # labels are scored directly for every object.
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                atlas_keys = atlas_test_sets[
                    :, pipeline.SET_NAMES[whatset[dataset_name]],
                    quadrant].nonzero()[0]
                n_correct, n_total = _count_agreement(
                    [key_to_atlas[i] for i in atlas_keys],
                    atlas_to_swire_expert, atlas_to_swire_rgz)
                labeller_classifier_to_accuracies[
                    'RGZ', 'Label',
                    titlemap[dataset_name]].append(n_correct / n_total)

    labeller_classifier_to_accuracy = {}
    labeller_classifier_to_stdev = {}
    for key, accuracies in labeller_classifier_to_accuracies.items():
        print('Best {}:'.format(key), max(accuracies))
        labeller_classifier_to_accuracy[key] = numpy.mean(accuracies)
        labeller_classifier_to_stdev[key] = numpy.std(accuracies)

    # Per-set percentages for the two reference "classifiers".
    random_acc = {k[2]: v * 100
                  for k, v in labeller_classifier_to_accuracy.items()
                  if k[1] == 'Random'}
    random_stdev = {k[2]: v * 100
                    for k, v in labeller_classifier_to_stdev.items()
                    if k[1] == 'Random'}
    best_acc = {k[2]: v * 100
                for k, v in labeller_classifier_to_accuracy.items()
                if k[1] == 'Groundtruth'}
    best_stdev = {k[2]: v * 100
                  for k, v in labeller_classifier_to_stdev.items()
                  if k[1] == 'Groundtruth'}
    print('Best: {} +- {}'.format(best_acc, best_stdev))
    print('Random: {} +- {}'.format(random_acc, random_stdev))

    plt.figure()
    colours = ['grey', 'magenta', 'blue', 'orange', 'grey']
    markers = ['o', '^', 'x', 's', '*']
    handles = {}
    classifiers = ['LogisticRegression', 'CNN', 'RandomForestClassifier'] + (
        ['Label', 'NearestNeighbour'] if field == 'cdfs'
        else ['NearestNeighbour'])
    print('Data set & Labeller & Classifier & Mean accuracy (\\%)\\\\')
    for k, set_name in enumerate(norris_labelled_sets[1:]):
        if 'resolved' in set_name:
            # https://github.com/MatthewJA/radio/issues/22
            continue
        title = titlemap[set_name]
        # Issue 22 (see above): 1x1 grid; equals the original 1 + (k - 1).
        ax = plt.subplot(1, 1, k)

        print('{} & Norris & Perfect & ${:.02f} \\pm {:.02f}$\\\\'.format(
            title, best_acc[title], best_stdev[title]))
        print('{} & Norris & Random & ${:.02f} \\pm {:.02f}$\\\\'.format(
            title, random_acc[title], random_stdev[title]))

        # Reference bands: mean line plus a +-1 standard deviation fill.
        plt.hlines(best_acc[title], -0.5, 2.5, linestyles='solid',
                   colors='green', linewidth=1, zorder=1)
        plt.fill_between(
            [-1, 2],
            [best_acc[title] - best_stdev[title]] * 2,
            [best_acc[title] + best_stdev[title]] * 2,
            linestyle='dashed', color='green', alpha=0.2, linewidth=1,
            zorder=1)
        plt.hlines(random_acc[title], -0.5, 2.5, linestyles='solid',
                   colors='blue', linewidth=1, zorder=1, alpha=0.7)
        plt.fill_between(
            [-1, 2],
            [random_acc[title] - random_stdev[title]] * 2,
            [random_acc[title] + random_stdev[title]] * 2,
            linestyle='dashed', color='blue', alpha=0.2, linewidth=1,
            zorder=1)

        # Axis setup does not depend on the labeller or classifier, so it is
        # done once per panel rather than on every inner iteration.
        ax.set_xlim((-0.5, 1.5))
        ax.set_ylim((70, 100))
        ax.set_xticks([0, 1])
        ax.set_xticklabels(['Norris', 'RGZ'])
        # '\\%' is the same runtime text as the former '\%', without the
        # invalid escape sequence.
        ax.set_yticklabels(['{}\\%'.format(x) for x in range(70, 101, 5)])

        for i, labeller in enumerate(['Norris', 'RGZ']):
            for j, classifier in enumerate(classifiers):
                ys = numpy.array(labeller_classifier_to_accuracies[
                    labeller, classifier, title]) * 100
                if classifier != 'NearestNeighbour':
                    if labeller == 'Norris' or field == 'elais':
                        x_offset = i + (j - 1) / 5
                    else:
                        x_offset = i + (j - 1.5) / 6
                else:
                    # NN: drawn as a horizontal line; its scatter points sit
                    # at x = 2, outside the x-limits set above.
                    plt.axhline(numpy.mean(ys), color='grey', linestyle='-.',
                                linewidth=1)
                    if field == 'cdfs':
                        plt.fill_between(
                            [-1, 2],
                            [numpy.mean(ys) - numpy.std(ys)] * 2,
                            [numpy.mean(ys) + numpy.std(ys)] * 2,
                            color='grey', linestyle='-.', alpha=0.2,
                            linewidth=1)
                    x_offset = 2

                if classifier == 'Label' and labeller == 'RGZ':
                    # Annotate the mean as text with a downward arrow.
                    plt.annotate('{:.1%}'.format(numpy.mean(ys) / 100),
                                 (x_offset, 72.5), ha='center', va='bottom')
                    plt.arrow(x_offset, 72.5, 0, -1.5, head_width=0.05,
                              head_length=1, ec='k', fc='k')

                xs = [x_offset] * len(ys)
                print('{} & {} & {} & ${:.02f} \\pm {:.02f}$\\\\'.format(
                    title, labeller, classifier, numpy.mean(ys),
                    numpy.std(ys)))
                handles[j] = plt.scatter(
                    xs, ys, color=colours[j], marker=markers[j], zorder=2,
                    edgecolor='k', linewidth=1)

        plt.ylabel('Cross-identification\naccuracy (per cent)')
        ax.grid(which='major', axis='y', color='#DDDDDD')

    # Print the table.
    # NOTE(review): the two header rows end in a single backslash while the
    # rows printed earlier end in a double one -- confirm which is intended.
    print('\\hline')
    print('Labeller & Classifier & Mean `Resolved\' & Mean `All\'\\')
    print('&& accuracy (per cent) & accuracy (per cent)\\')
    print('\\hline')
    for labeller in ['Norris', 'RGZ']:
        for classifier in ['CNN', 'LogisticRegression',
                           'RandomForestClassifier', 'Groundtruth', 'Random',
                           'Label', 'NearestNeighbour']:
            if labeller == 'RGZ' and classifier in special_classifiers:
                continue
            if labeller == 'Norris' and classifier == 'Label':
                continue
            resolved = numpy.array(labeller_classifier_to_accuracies[
                labeller, classifier, 'Resolved'])
            everything = numpy.array(labeller_classifier_to_accuracies[
                labeller, classifier, 'All'])
            print('{} & {} & ${:.1f} \\pm {:.1f}$ & ${:.1f} \\pm {:.1f}$'
                  .format(labeller, classifier,
                          resolved.mean() * 100, resolved.std() * 100,
                          everything.mean() * 100, everything.std() * 100))

    plt.gca().tick_params(axis='both', which='major', direction='out',
                          length=5)
    plt.gca().tick_params(axis='y', which='minor', direction='out', length=3)
    plt.gca().minorticks_on()
    # loc is passed by keyword, the documented form, not as a third
    # positional argument.
    plt.figlegend([handles[j] for j in sorted(handles)],
                  ['LR', 'CNN', 'RF'] + (['Labels'] if field == 'cdfs'
                                         else []),
                  loc='lower center', ncol=4)
    plt.subplots_adjust(bottom=0.25, hspace=0.25, left=0.3)
    plt.savefig('../images/{}_cross_identification_grid.pdf'.format(field))
    plt.savefig('../images/{}_cross_identification_grid.png'.format(field))
def print_table(
        field='cdfs',
        output_prefix='/Users/alger/data/Crowdastro/'
                      'predicted_cross_ids_table_21_03_18_'):
    """Write a per-ATLAS-object table of all cross-identifications.

    Each row describes one ATLAS test object: its position and IDs, the
    expert host (the Norris column for ``'cdfs'``, otherwise Middelberg), the
    RGZ host and consensus levels (filled in for ``'cdfs'`` only), and the
    host chosen by every LR / CNN / RF predictor. Each SWIRE host is followed
    by its coordinates, or ``None`` when the name is unknown.

    The table is written as ``<output_prefix><field>.csv`` and
    ``<output_prefix><field>.tex``.

    Args:
        field: ``'cdfs'`` selects the CDFS code paths; any other value takes
            the Middelberg (ELAIS) code paths.
        output_prefix: Path prefix of the two output files. The default
            reproduces the previously hard-coded location.
    """
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False, field=field)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False, field=field)
    cids = list(pipeline.cross_identify_all(
        swire_names, swire_coords, swire_labels, swire_test_sets,
        swire_labels[:, 0], field=field))

    atlas_to_swire = collections.defaultdict(dict)  # ATLAS -> predictor -> SWIRE
    swire_name_to_coord = dict(zip(swire_names, swire_coords))

    def swire_position(swire_name):
        """Return the coordinate of a SWIRE name, or (None, None)."""
        return swire_name_to_coord.get(swire_name, (None, None))

    atlas_to_swire_expert = {}
    key_to_atlas = {}
    atlas_to_ras = {}
    atlas_to_decs = {}
    id_to_atlas = {}
    atlas_to_id = {}
    atlas_to_zooniverse_id = {}
    if field == 'cdfs':
        for row in astropy.io.ascii.read(pipeline.TABLE_PATH):
            name = row['Component Name (Franzen)']
            if not name:
                continue
            id_to_atlas[row['Component ID (Franzen)']] = name
            atlas_to_id[name] = row['Component ID (Franzen)']
            atlas_to_zooniverse_id[name] = row['Component Zooniverse ID (RGZ)']
            key_to_atlas[row['Key']] = name
            atlas_to_swire_expert[name] = row['Source SWIRE (Norris)']
            atlas_to_ras[name] = row['Component RA (Franzen)']
            atlas_to_decs[name] = row['Component DEC (Franzen)']
    else:
        swire_scoords = astropy.coordinates.SkyCoord(
            ra=swire_coords[:, 0], dec=swire_coords[:, 1], unit='deg')
        with astropy.io.fits.open(
                pipeline.MIDDELBERG_TABLE4_PATH) as elais_components_fits:
            elais_components = elais_components_fits[1].data
            for i, component in enumerate(elais_components):
                name = component['ATELAIS']
                id_to_atlas[component['CID']] = name
                atlas_to_id[name] = component['CID']
                atlas_to_zooniverse_id[name] = ''
                # The row index plays the role of the CDFS 'Key' column.
                key_to_atlas[i] = name
                coord = astropy.coordinates.SkyCoord(
                    ra='{} {} {}'.format(component['RAh'], component['RAm'],
                                         component['RAs']),
                    dec='-{} {} {}'.format(component['DEd'], component['DEm'],
                                           component['DEs']),
                    unit=('hourangle', 'deg'))
                atlas_to_ras[name] = coord.ra.deg
                atlas_to_decs[name] = coord.dec.deg

        # Load SWIRE cross-identification from Table 5.
        with open(pipeline.MIDDELBERG_TABLE5_PATH) as elais_file:
            lines = [line.split('|') for line in elais_file]

        swire_coord_pattern = re.compile(
            r'SWIRE4J(\d\d)(\d\d)(\d\d\.\d\d)(-\d\d)(\d\d)(\d\d\.\d)')
        for line in lines:
            if 'ATELAISJ' not in line[0]:
                continue
            line_cids = line[1]
            if 'C0' not in line_cids and 'C1' not in line_cids:
                continue
            line_cids = [cid.strip() for cid in line_cids.split(',')]
            swire_coord_re = swire_coord_pattern.search(line[2])
            if not swire_coord_re:
                continue
            swire_coord_list = swire_coord_re.groups()
            coord = astropy.coordinates.SkyCoord(
                ra='{} {} {}'.format(*swire_coord_list[:3]),
                dec='{} {} {}'.format(*swire_coord_list[3:]),
                unit=('hourangle', 'deg'))
            # Nearest SWIRE...
            seps = coord.separation(swire_scoords)
            nearest = numpy.argmin(seps)
            if seps[nearest].deg > 5 / 60 / 60:
                continue
            name = swire_names[nearest]
            for cid in line_cids:
                atlas_to_swire_expert[id_to_atlas[cid]] = name

    atlas_to_rgz = {}
    atlas_to_radio_consensus = {}
    atlas_to_ir_consensus = {}
    if field == 'cdfs':
        for row in astropy.io.ascii.read(pipeline.RGZ_PATH):
            name = id_to_atlas[row['atlas_id']]
            atlas_to_radio_consensus[name] = row['consensus.radio_level']
            atlas_to_ir_consensus[name] = row['consensus.ir_level']
            atlas_to_rgz[name] = row['SWIRE.designation']

    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }
    classifier_abbreviations = {
        'LogisticRegression': 'LR',
        'CNN': 'CNN',
        'RandomForestClassifier': 'RF',
    }

    known_predictors = set()
    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue
        if cid.classifier in {'Groundtruth', 'Random'}:
            continue
        if cid.labeller == 'rgz':
            if 'Norris' in cid.dataset_name:
                # RGZ-trained predictors on the Norris-labelled sets
                # ("RGZ N") are not tabulated.
                continue
            labeller = 'RGZ'
        else:
            labeller = 'Norris'

        if field == 'cdfs':
            atlas_keys = atlas_test_sets[
                :, pipeline.SET_NAMES['RGZ & Norris'],
                cid.quadrant].nonzero()[0]
        else:
            atlas_keys = atlas_test_sets[:, 0, 0].nonzero()[0]
        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))

        predictor_name = '{}({} / {})'.format(
            classifier_abbreviations[cid.classifier], labeller,
            titlemap[cid.dataset_name])
        known_predictors.add(predictor_name)
        for i in atlas_keys:
            name = key_to_atlas[i]
            atlas_to_swire[name][predictor_name] = (
                atlas_to_swire_predictor.get(name, ''))

    known_predictors = sorted(known_predictors)
    atlases = sorted(atlas_to_swire)

    ras = []
    decs = []
    expert_xids = []
    expert_xid_ras = []
    expert_xid_decs = []
    rgzs = []
    rgz_ras = []
    rgz_decs = []
    rcs = []
    ircs = []
    # Named component_ids so the cross-identification list `cids` is no
    # longer rebound to something unrelated.
    component_ids = []
    zids = []
    predictor_columns = collections.defaultdict(list)
    predictor_ras = collections.defaultdict(list)
    predictor_decs = collections.defaultdict(list)
    for atlas in atlases:
        for predictor in known_predictors:
            predicted_swire = atlas_to_swire[atlas].get(predictor, '')
            position = swire_position(predicted_swire)
            predictor_columns[predictor].append(predicted_swire)
            predictor_ras[predictor].append(position[0])
            predictor_decs[predictor].append(position[1])

        ras.append(atlas_to_ras[atlas])
        decs.append(atlas_to_decs[atlas])
        component_ids.append(atlas_to_id[atlas])
        zids.append(atlas_to_zooniverse_id[atlas])

        rgz_swire = atlas_to_rgz.get(atlas, '')
        rgz_position = swire_position(rgz_swire)
        rgzs.append(rgz_swire)
        rgz_ras.append(rgz_position[0])
        rgz_decs.append(rgz_position[1])
        rcs.append(atlas_to_radio_consensus.get(atlas, 0.0))
        ircs.append(atlas_to_ir_consensus.get(atlas, 0.0))

        expert_swire = atlas_to_swire_expert.get(atlas, '')
        expert_position = swire_position(expert_swire)
        expert_xids.append(expert_swire)
        expert_xid_ras.append(expert_position[0])
        expert_xid_decs.append(expert_position[1])

    expert = 'Norris' if field == 'cdfs' else 'Middelberg'
    table = astropy.table.Table(
        data=[atlases, ras, decs, component_ids, zids,
              expert_xids, expert_xid_ras, expert_xid_decs,
              rgzs, rgz_ras, rgz_decs, rcs, ircs]
             + [k for p in known_predictors
                for k in (predictor_columns[p], predictor_ras[p],
                          predictor_decs[p])],
        names=['ATLAS', 'RA', 'Dec', 'CID', 'Zooniverse ID',
               expert, expert + ' RA', expert + ' Dec',
               'RGZ', 'RGZ RA', 'RGZ Dec',
               'RGZ radio consensus', 'RGZ IR consensus']
              + [k for p in known_predictors
                 for k in (p, p + ' RA', p + ' Dec')])
    table['RGZ radio consensus'].format = '{:.4f}'
    table['RGZ IR consensus'].format = '{:.4f}'
    table.write('{}{}.csv'.format(output_prefix, field), format='csv')
    table.write('{}{}.tex'.format(output_prefix, field), format='latex')
def main(classifier='CNN', labeller='Norris'):
    """Scatter ATLAS compactness against prediction ambiguity and show it.

    For every row of the table at ``pipeline.TABLE_PATH`` that has both a
    component name and a Zooniverse ID, candidate SWIRE hosts and their
    scores are fetched with ``get_predictions`` for the predictor named
    ``'<classifier>_<labeller>'``. Two ambiguity measures are derived from
    the scores:

    * entropy of the softmax of the scores, and
    * one minus the gap between the two highest raw scores.

    Both are 0 when there is a single candidate. Each measure is plotted
    against ``pipeline.compactness(row)`` in its own panel. Points are black
    where the Norris column holds no SWIRE name, blue where the top-scoring
    candidate equals the Norris host, and magenta otherwise.

    Args:
        classifier: Classifier part of the predictor name.
        labeller: Labeller part of the predictor name.
    """
    # Load SWIRE stuff.
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    # The labels are not used below; the call is kept because it was made
    # originally and may have side effects. NOTE(review): plot() and
    # print_table() also pass swire_labels to generate_data_sets -- confirm
    # this call form still matches the pipeline API.
    pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, overwrite=False)
    swire_tree = KDTree(swire_coords)

    # Loop invariants, built once instead of once per ATLAS component.
    swire_names_array = numpy.array(swire_names)
    predictor_name = '{}_{}'.format(classifier, labeller)

    # (name, compactness, (ra, dec), Norris SWIRE host) per usable component.
    components = []
    for row in astropy.io.ascii.read(pipeline.TABLE_PATH):
        name = row['Component Name (Franzen)']
        if not name or not row['Component Zooniverse ID (RGZ)']:
            continue
        components.append((
            name,
            pipeline.compactness(row),
            (row['Component RA (Franzen)'], row['Component DEC (Franzen)']),
            row['Source SWIRE (Norris)'],
        ))

    ys = []
    xs_entropy = []
    xs_margin = []
    no_groundtruth = []
    correct = []
    for name, compactness, coords, swire in components:
        predictions = get_predictions(
            swire_tree, swire_coords, swire_names_array, swire_test_sets,
            coords, predictor_name)
        if not predictions:
            print('No predictions for {}'.format(name))
            continue

        scores = [p for _, p in predictions]
        chosen_swire = predictions[numpy.argmax(scores)][0]

        # Softmax with the denominator computed once rather than per element.
        exponentials = [numpy.exp(p) for p in scores]
        normaliser = sum(exponentials)
        softmax = [e / normaliser for e in exponentials]

        if len(softmax) == 1:
            entropy_ambiguity = 0
            margin_ambiguity = 0
        else:
            entropy_ambiguity = -sum(p * numpy.log(p) for p in softmax if p)
            scores.sort()
            # The margin uses the raw scores, not the softmax values.
            margin_ambiguity = 1 - (scores[-1] - scores[-2])

        ys.append(compactness)
        xs_entropy.append(entropy_ambiguity)
        xs_margin.append(margin_ambiguity)
        no_groundtruth.append(not swire or not swire.startswith('SWIRE'))
        correct.append(swire == chosen_swire)

    ys = numpy.array(ys)
    xs_margin = numpy.array(xs_margin)
    xs_entropy = numpy.array(xs_entropy)
    no_groundtruth = numpy.array(no_groundtruth, dtype=bool)
    correct = numpy.array(correct, dtype=bool)

    print(sum(1 for y in ys if y <= 1))

    # (mask, colour, alpha) for the three point groups drawn in each panel.
    groups = (
        (no_groundtruth, 'black', 0.05),
        (~no_groundtruth & correct, 'blue', 0.7),
        (~no_groundtruth & ~correct, 'magenta', 0.7),
    )

    plt.subplot(1, 2, 1)
    for mask, colour, alpha in groups:
        plt.scatter(xs_margin[mask], ys[mask], marker='x', color=colour,
                    alpha=alpha)
    plt.title('Margin')
    plt.xlabel('1 - margin')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    # hlines takes its x-extent in data coordinates. axhline's xmin/xmax are
    # axes fractions in [0, 1], so passing data values there was wrong.
    plt.hlines(1, min(xs_margin), max(xs_margin))

    plt.subplot(1, 2, 2)
    for mask, colour, alpha in groups:
        plt.scatter(xs_entropy[mask], ys[mask], marker='x', color=colour,
                    alpha=alpha)
    plt.title('Entropy')
    plt.xlabel('Entropy')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    plt.hlines(1, min(xs_entropy), max(xs_entropy), zorder=-100,
               linestyles='--', colors='black')
    plt.show()
def get_examples():
    """Collect ATLAS objects that a classifier cross-identifies incorrectly.

    Only objects whose Norris and RGZ cross-identifications agree are
    considered, and only predictors evaluated on the "All" data set that are
    not ``'Random'`` or ``'Groundtruth'``.

    Returns a ``collections.defaultdict(set)`` mapping
    ``(labeller, classifier, 'All')`` to a set of
    ``(ATLAS name, Zooniverse ID, ATLAS component ID)`` tuples for which that
    predictor's SWIRE host differs from the Norris host.
    """
    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names,
                                                  overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, overwrite=False)
    cids = list(pipeline.cross_identify_all(swire_names, swire_coords,
                                            swire_test_sets,
                                            swire_labels[:, 0]))

    # Lookup tables keyed on (or resolving to) the ATLAS component name.
    atlas_to_swire_norris = {}
    key_to_atlas = {}
    id_to_atlas = {}
    atlas_to_zid = {}
    atlas_to_id = {}
    for row in astropy.io.ascii.read(pipeline.TABLE_PATH):
        name = row['Component Name (Franzen)']
        if not name:
            continue
        component_id = row['Component ID (Franzen)']
        id_to_atlas[component_id] = name
        key_to_atlas[row['Key']] = name
        atlas_to_swire_norris[name] = row['Source SWIRE (Norris)']
        atlas_to_id[name] = component_id
        atlas_to_zid[name] = row['Component Zooniverse ID (RGZ)']

    atlas_to_rgz = {}
    for row in astropy.io.ascii.read(pipeline.RGZ_PATH):
        atlas_to_rgz[id_to_atlas[row['atlas_id']]] = row['SWIRE.designation']

    # ATLAS -> labeller -> SWIRE. Each inner dict holds the plain string
    # keys 'Norris' and 'RGZ' plus one (labeller, classifier, title) tuple
    # key per predictor.
    cross_identifications = collections.defaultdict(dict)
    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        # For each ATLAS object in RGZ & Norris...
        atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES['RGZ & Norris'],
                                     cid.quadrant].nonzero()[0]

        if cid.labeller != 'rgz':
            labeller = 'Norris'
        elif 'Norris' in cid.dataset_name:
            labeller = 'RGZ N'
        else:
            labeller = 'RGZ'
        predictor_key = (labeller, cid.classifier,
                         titlemap[cid.dataset_name])

        for i in atlas_keys:
            name = key_to_atlas[i]
            if (name not in atlas_to_swire_norris
                    or name not in atlas_to_swire_predictor):
                continue
            entry = cross_identifications[name]
            entry['Norris'] = atlas_to_swire_norris[name]
            entry['RGZ'] = atlas_to_rgz.get(name, None)
            entry[predictor_key] = atlas_to_swire_predictor[name]

    # For each classifier, pull out examples where RGZ and Norris agree but
    # the classifier disagrees. Only the RGZ & Norris data set ("All") is
    # included.
    classifier_to_example = collections.defaultdict(set)
    for atlas, cids_ in cross_identifications.items():
        if cids_['Norris'] != cids_['RGZ']:
            continue
        for classifier, swire in cids_.items():
            # Skip the 'Norris' / 'RGZ' reference entries explicitly rather
            # than relying on indexing into their key strings.
            if not isinstance(classifier, tuple):
                continue
            if classifier[2] != 'All':
                continue
            if classifier[1] in {'Random', 'Groundtruth'}:
                continue
            if swire != cids_['Norris']:
                classifier_to_example[classifier].add(
                    (atlas, atlas_to_zid[atlas], atlas_to_id[atlas]))
    return classifier_to_example
def plot_predictions(cut=0.95, labeller='norris', dataset_name=None,
                     classifier=None):
    """Plot colour-colour diagram for predicted host galaxies.

    labeller in {'norris', 'rgz'}
    dataset_name in {'RGZ & Norris', ...}

    With ``dataset_name`` set, the serialised predictions named
    ``'{classifier}_{labeller}_cdfs_predictions'`` are loaded and, per
    quadrant, objects of the 'RGZ' SWIRE test set whose predicted probability
    exceeds ``cut`` are plotted, coloured by that probability.

    Without ``dataset_name``, the objects labelled as hosts by ``labeller``
    (column 0 of the SWIRE labels for 'norris', column 1 for 'rgz') are
    plotted in a single colour; no predictions file is read and no colour
    bar is drawn.

    Objects whose 5.8 or 8.0 flux equals the -99 sentinel are dropped.

    Raises ValueError if ``dataset_name`` is not given and ``labeller`` is
    not one of 'norris' or 'rgz'.
    """
    with h5py.File(CROWDASTRO_PATH, 'r') as f:
        swire_numeric_cdfs = f['/swire/cdfs/numeric'][:, 2:2 + 4]
    bands = [swire_numeric_cdfs[:, column] for column in range(4)]

    predictions = {}
    label_column = None
    if dataset_name:
        # Predictions are only needed (and only loaded) in this mode.
        # Previously the file was read unconditionally, even with
        # classifier=None.
        serialised = pipeline.unserialise_predictions(
            pipeline.WORKING_DIR
            + '{}_{}_cdfs_predictions'.format(classifier, labeller))
        for prediction in serialised:
            predictions[prediction.dataset_name,
                        prediction.quadrant] = prediction
    elif labeller == 'norris':
        label_column = 0
    elif labeller == 'rgz':
        label_column = 1
    else:
        raise ValueError(
            "labeller must be 'norris' or 'rgz' when dataset_name is not "
            'given; got {!r}'.format(labeller))

    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field='cdfs')
    swire_labels = pipeline.generate_swire_labels(
        swire_names, swire_coords, overwrite=False, field='cdfs')
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False, field='cdfs')

    xs = []
    ys = []
    colours = []
    for q in range(4):
        swire_set = swire_test_sets[:, pipeline.SET_NAMES['RGZ'], q]
        if dataset_name:
            # NOTE(review): assumes the probabilities line up with the 'RGZ'
            # test set for this quadrant, as the original indexing did.
            probabilities = predictions[dataset_name, q].probabilities
            above_cut = probabilities > cut
            f_36_, f_45_, f_58_, f_80_ = (
                band[swire_set][above_cut] for band in bands)
            probabilities = probabilities[above_cut]
        else:
            labelled = swire_set & swire_labels[:, label_column]
            f_36_, f_45_, f_58_, f_80_ = (band[labelled] for band in bands)
            # Nothing to colour by; this name used to be left unbound here,
            # which crashed the function a few lines further down.
            probabilities = None

        # -99 marks a non-detection in the 5.8 and 8.0 bands.
        detected = (f_58_ != -99) & (f_80_ != -99)
        xs.extend(numpy.log10(f_58_[detected] / f_36_[detected]))
        ys.extend(numpy.log10(f_80_[detected] / f_45_[detected]))
        if probabilities is not None:
            colours.extend(probabilities[detected])

    assert len(xs) == len(ys)
    if dataset_name:
        assert len(xs) == len(colours)

    plot_basic()
    if dataset_name:
        plt.scatter(xs, ys, s=20, marker='^', linewidth=0, alpha=0.5,
                    c=numpy.array(colours), cmap='winter')
    else:
        plt.scatter(xs, ys, s=25, c='r', marker='^', linewidth=0)
    plt.xlim((-0.75, 1.0))
    plt.ylim((-0.75, 1.0))
    plt.xlabel('$\\log_{10}(S_{5.8}/S_{3.6})$')
    plt.ylabel('$\\log_{10}(S_{8.0}/S_{4.5})$')
    plt.subplots_adjust(left=0.2, bottom=0.15, right=0.95, top=0.95)
    if dataset_name:
        # A colour bar needs colour-mapped data; the single-colour scatter
        # above provides none.
        plt.colorbar()
    plt.show()
def _load_balanced_accuracies(classifier_name, field):
    """Read per-quadrant balanced accuracies for one classifier.

    Loads the Norris- and RGZ-labelled prediction files for
    ``classifier_name`` and returns ``(norris, rgz)``, each a dict mapping
    data set name to a list of four accuracies (one per quadrant, 0 where no
    prediction was found).
    """
    norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    for file_labeller in ('norris', 'rgz'):
        path = pipeline.WORKING_DIR + '{}_{}_{}_predictions'.format(
            classifier_name, file_labeller, field)
        for predictions in pipeline.unserialise_predictions(path):
            # Route on the labeller recorded in the prediction itself.
            if predictions.labeller == 'norris':
                target = norris_accuracies
            else:
                target = rgz_accuracies
            target[predictions.dataset_name][predictions.quadrant] = (
                predictions.balanced_accuracy)
    return norris_accuracies, rgz_accuracies


def plot_grid(field='cdfs'):
    """Plot balanced accuracies per classifier and write them to a table.

    Draws two stacked panels ('RGZ & Norris & resolved' on top,
    'RGZ & Norris' below), with one point per quadrant for each classifier
    and labeller. Also writes the mean and standard deviation over quadrants
    for every data set in ``norris_labelled_sets`` as a LaTeX table.

    For ``field == 'cdfs'`` an extra 'Labels' pseudo-classifier is added,
    scoring the RGZ labels against the Norris labels.

    Outputs: ``../{field}_accuracy_table.tex``,
    ``../images/{field}_ba_grid.pdf`` and ``../images/{field}_ba_grid.png``.
    """
    # Load predictions and convert to the format we need,
    # e.g. {'RGZ' -> [acc, acc, acc, acc]}.
    output_sets = [
        ('LR', list(_load_balanced_accuracies('LogisticRegression', field))),
        ('CNN', list(_load_balanced_accuracies('CNN', field))),
        ('RF', list(_load_balanced_accuracies('RandomForestClassifier',
                                              field))),
    ]

    if field == 'cdfs':
        # Load RGZ cross-identifications and compute a balanced accuracy
        # with them.
        swire_names, swire_coords, _ = pipeline.generate_swire_features(
            overwrite=False, field=field)
        swire_labels = pipeline.generate_swire_labels(
            swire_names, swire_coords, overwrite=False, field=field)
        _, (_, swire_test_sets) = pipeline.generate_data_sets(
            swire_coords, swire_labels, overwrite=False, field=field)
        label_rgz_accuracies = {
            sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
        # By definition.
        label_norris_accuracies = {
            sstr: [1] * 4 for sstr in pipeline.SET_NAMES}
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                test_set = swire_test_sets[
                    :, pipeline.SET_NAMES[dataset_name], quadrant]
                label_rgz_accuracies[dataset_name][quadrant] = (
                    balanced_accuracy(swire_labels[test_set, 0],
                                      swire_labels[test_set, 1]))
        output_sets.append(
            ('Labels', [label_norris_accuracies, label_rgz_accuracies]))

    colours = ['grey', 'magenta', 'blue', 'orange']
    markers = ['o', '^', 'x', 's']
    panel_rows = {'RGZ & Norris & resolved': 1, 'RGZ & Norris': 2}
    handles = {}
    plt.figure(figsize=(5, 5))

    accuracy_map = defaultdict(lambda: defaultdict(dict))  # For table output.
    for j, (classifier_name, classifier_set) in enumerate(output_sets):
        norris_accuracies, rgz_accuracies = classifier_set
        for i, set_name in enumerate(norris_labelled_sets):
            set_norris = norris_accuracies[set_name]
            set_rgz = rgz_accuracies[fullmap[set_name]]

            if 'compact' not in set_name:  # Compact sets are not plotted.
                ax = plt.subplot(2, 1, panel_rows[set_name])
                ax.set_ylim((80, 100))
                ax.set_xlim((-0.5, 1.5))
                ax.set_xticks([0, 1])
                ax.set_xticklabels(['Norris', 'RGZ'], rotation='horizontal')
                if i == 2:
                    ax.set_xlabel('Labels')
                ax.set_ylabel('{}\nBalanced accuracy\n(per cent)'.format(
                    titlemap[set_name]))
                ax.title.set_fontsize(16)
                ax.xaxis.label.set_fontsize(12)
                ax.yaxis.label.set_fontsize(9)
                for tick in ax.get_xticklabels() + ax.get_yticklabels():
                    tick.set_fontsize(10)
                ax.grid(which='major', axis='y', color='#EEEEEE')

                # Four classifiers share the RGZ column on CDFS (three
                # elsewhere), hence the different horizontal spacing.
                if field == 'cdfs':
                    rgz_offset = (j - 1.5) / 6
                else:
                    rgz_offset = (j - 1) / 5
                for k in range(4):
                    if j != 3:  # 'Labels' has no Norris-trained point.
                        ax.scatter([0 + (j - 1) / 5], set_norris[k] * 100,
                                   color=colours[j], marker=markers[j],
                                   linewidth=1, edgecolor='k')
                    handles[j] = ax.scatter(
                        [1 + rgz_offset], set_rgz[k] * 100,
                        color=colours[j], marker=markers[j], linewidth=1,
                        edgecolor='k')

            # Compute for table (all sets, including the compact ones).
            for labeller, accuracies in (('Norris', set_norris),
                                         ('RGZ', set_rgz)):
                mean = numpy.mean(accuracies) * 100
                stdev = numpy.std(accuracies) * 100
                accuracy_map[labeller][classifier_name][
                    titlemap[set_name]] = '${:.02f} \\pm {:.02f}$'.format(
                        mean, stdev)

    # Assemble table.
    # The parentheses matter: without them the conditional expression applies
    # to the whole concatenation, so any non-CDFS field iterated over an
    # empty list and produced an empty table.
    table_classifiers = ['CNN', 'LR', 'RF'] + (
        ['Labels'] if field == 'cdfs' else [])
    col_labeller = []
    col_classifier = []
    col_compact = []
    col_resolved = []
    col_all = []
    for labeller in ['Norris', 'RGZ']:
        for classifier in table_classifiers:
            row = accuracy_map[labeller][classifier]
            col_labeller.append(labeller)
            col_classifier.append(classifier)
            col_compact.append(row['Compact'])
            col_resolved.append(row['Resolved'])
            col_all.append(row['All'])
    out_table = astropy.table.Table(
        [col_labeller, col_classifier, col_compact, col_resolved, col_all],
        names=[
            'Labeller', 'Classifier',
            "Mean `Compact' accuracy\\\\(per cent)",
            "Mean `Resolved' accuracy\\\\(per cent)",
            "Mean `All' accuracy\\\\(per cent)",
        ])
    out_table.write('../{}_accuracy_table.tex'.format(field), format='latex')

    plt.figlegend([handles[j] for j in sorted(handles)],
                  [name for name, _ in output_sets],
                  loc='lower center', ncol=4, fontsize=10)
    plt.subplots_adjust(bottom=0.2, hspace=0.25)
    plt.savefig('../images/{}_ba_grid.pdf'.format(field),
                bbox_inches='tight', pad_inches=0)
    plt.savefig('../images/{}_ba_grid.png'.format(field),
                bbox_inches='tight', pad_inches=0)
def main(examples=None, classifier='CNN', labeller='Norris',
         output_dir='/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/'
                    'images/examples/'):
    """Plot each example with its candidate hosts over IR and radio images.

    For every example, the infrared image is shown with radio contours on
    top. Each candidate host from ``get_predictions`` is drawn as a white
    circle sized by the square root of its probability, the most probable
    candidate is marked with a blue cross and the Norris host with a green
    plus. The figure is saved as PNG and PDF named
    ``'{classifier}_{labeller}_{ATLAS name}'``.

    Examples are skipped (with a message, except for a missing radio image
    without a Zooniverse ID) when the radio FITS file is missing, when there
    are no predictions, or when there is no Norris cross-identification.

    examples: mapping from ``(labeller, classifier, 'All')`` to an iterable
        of ``(ATLAS name, Zooniverse ID, ATLAS component ID)`` tuples. If
        None, ``examples_incorrect.get_examples()`` is used.
    classifier, labeller: select the examples and name the predictor.
    output_dir: directory the images are written to. Defaults to the
        location that used to be hard-coded.
    """
    import os

    # Load SWIRE stuff.
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    # The labels are not used below; the call is retained unchanged in case
    # it has side effects inside ``pipeline`` -- TODO confirm and drop.
    pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords,
                                                          overwrite=False)
    swire_tree = KDTree(swire_coords)
    swire_name_to_index = {n: i for i, n in enumerate(swire_names)}

    # Load ATLAS coords.
    atlas_to_coords = {}
    atlas_to_swire_coords = {}
    for row in astropy.io.ascii.read(pipeline.TABLE_PATH):
        name = row['Component Name (Franzen)']
        if not name:
            continue
        atlas_to_coords[name] = (row['Component RA (Franzen)'],
                                 row['Component DEC (Franzen)'])
        index = swire_name_to_index.get(row['Source SWIRE (Norris)'] or '')
        # Compare against None: index 0 is a valid SWIRE object and was
        # silently dropped by the previous truthiness test.
        if index is not None:
            atlas_to_swire_coords[name] = swire_coords[index]

    ir_stretch = astropy.visualization.LogStretch(0.001)

    if examples is None:
        examples = examples_incorrect.get_examples()
    examples = examples[labeller, classifier, 'All']

    predictor_name = '{}_{}'.format(classifier, labeller)
    for example in examples:
        print('Plotting {}'.format(example))
        atlas_name, zooniverse_id, cid = example[0], example[1], example[2]

        # Load FITS stuff.
        try:
            radio_fits = astropy.io.fits.open(CDFS_PATH + cid + '_radio.fits')
        except FileNotFoundError:
            if zooniverse_id:  # Has Zooniverse ID
                print('{} not in RGZ'.format(cid))
            continue

        # Both FITS files are closed when the block exits, including on the
        # early ``continue`` paths below.
        with radio_fits, astropy.io.fits.open(
                CDFS_PATH + cid + '_ir.fits') as ir_fits:
            radio_data = radio_fits[0].data
            wcs = astropy.wcs.WCS(radio_fits[0].header)

            # Compute info for contour levels. (also from Enno Middelberg)
            median = numpy.median(radio_data)
            mad = numpy.median(numpy.abs(radio_data - median))
            sigma = mad / mad2sigma

            # Set up the plot.
            fig = plt.figure()
            try:
                ax = astropy.visualization.wcsaxes.WCSAxes(
                    fig, [0.1, 0.1, 0.8, 0.8], wcs=wcs)
                fig.add_axes(ax)
                ax.set_title('{}'.format(atlas_name))

                # Show the infrared.
                ax.imshow(ir_stretch(ir_fits[0].data), cmap='cubehelix_r',
                          origin='lower')

                # Show the radio.
                ax.contour(radio_data, colors='black',
                           levels=[nsig * sigma * sigmult**i
                                   for i in range(15)],
                           linewidths=1, origin='lower', zorder=1)

                # Plot predictions.
                predictions = get_predictions(
                    swire_tree, swire_coords, swire_test_sets,
                    atlas_to_coords[atlas_name], predictor_name)
                if not predictions:
                    print('No predictions for {}'.format(atlas_name))
                    continue
                probabilities = [p[1] for p in predictions]
                coords = wcs.all_world2pix([p[0] for p in predictions], 1)
                ax.scatter(coords[:, 0], coords[:, 1],
                           s=numpy.sqrt(numpy.array(probabilities)) * 200,
                           color='white', edgecolor='black', linewidth=1,
                           alpha=0.9, marker='o', zorder=2)
                choice = numpy.argmax(probabilities)
                ax.scatter(coords[choice, 0], coords[choice, 1],
                           s=200 / numpy.sqrt(2), color='blue', marker='x',
                           zorder=2.5)

                try:
                    norris_world = atlas_to_swire_coords[atlas_name]
                except KeyError:
                    print('No Norris cross-identification for {}'.format(
                        atlas_name))
                    continue
                norris_coords, = wcs.all_world2pix([norris_world], 1)
                ax.scatter(norris_coords[0], norris_coords[1], marker='+',
                           s=200, zorder=3, color='green')

                lon, lat = ax.coords
                lon.set_major_formatter('hh:mm:ss')
                lon.set_axislabel('Right Ascension')
                lat.set_axislabel('Declination')

                fn = '{}_{}_{}'.format(classifier, labeller, atlas_name)
                fig.savefig(os.path.join(output_dir, fn + '.png'),
                            bbox_inches='tight', pad_inches=0)
                fig.savefig(os.path.join(output_dir, fn + '.pdf'),
                            bbox_inches='tight', pad_inches=0)
            finally:
                # Close rather than clear: one figure is created per example,
                # and skipped examples previously left theirs open.
                plt.close(fig)