def main():
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names,
                                                  swire_coords,
                                                  overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False)
    cids = list(
        pipeline.cross_identify_all(swire_names,
                                    swire_coords,
                                    swire_labels,
                                    swire_test_sets,
                                    swire_labels[:, 0],
                                    field='cdfs'))
    table = astropy.io.ascii.read(pipeline.TABLE_PATH)

    atlas_to_swire_norris = {}
    key_to_atlas = {}
    for row in table:
        name = row['Component Name (Franzen)']
        key_to_atlas[row['Key']] = name
        swire = row['Source SWIRE (Norris)']
        if not swire or not swire.startswith('SWIRE') or not name:
            continue
        atlas_to_swire_norris[name] = swire

    print(
        'Labeller\tClassifier\tQuadrant\tDataset\tn_correct\tn_total\tn_skipped\tAccuracy'
    )
    for cid in cids:
        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        # For each ATLAS object in RGZ & Norris...
        atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES['RGZ & Norris'],
                                     cid.quadrant].nonzero()[0]
        n_total = 0
        n_correct = 0
        n_skipped = 0
        for i in atlas_keys:
            name = key_to_atlas[i]
            if name not in atlas_to_swire_norris:
                n_skipped += 1
                continue
            if name not in atlas_to_swire_predictor:
                n_skipped += 1
                continue
            swire_norris = atlas_to_swire_norris[name]
            swire_predictor = atlas_to_swire_predictor[name]
            n_correct += swire_norris == swire_predictor
            n_total += 1
        print(cid.labeller,
              cid.classifier,
              cid.quadrant,
              '{:<20}'.format(cid.dataset_name),
              n_correct,
              n_total,
              n_skipped,
              '{:.02%}'.format(n_correct / n_total),
              sep='\t')
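# The snippets in this listing are excerpts from larger scripts, so the
# module-level imports and constants (e.g. CDFS_PATH, CROWDASTRO_PATH,
# titlemap, norris_labelled_sets, fullmap, whatset, nsig, sigmult) are not
# shown. A minimal sketch of the imports the snippets appear to assume;
# `pipeline` and `examples_incorrect` are project-local modules of the
# crowdastro ATLAS-CDFS code, and this exact set is an assumption rather than
# the original header.
import collections
from collections import defaultdict
import copy
import itertools
import logging
import re

import astropy.coordinates
import astropy.io.ascii
import astropy.io.fits
import astropy.table
import astropy.visualization
import astropy.visualization.wcsaxes
import astropy.wcs
import h5py
import matplotlib.pyplot as plt
import numpy
import scipy.spatial
from scipy.spatial import KDTree

import examples_incorrect  # project-local (assumption)
import pipeline            # crowdastro ATLAS-CDFS pipeline module (project-local)

log = logging.getLogger(__name__)  # assumption: the `log` used in later snippets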
Example #3
def print_table(field='cdfs'):
    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    lr_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'LogisticRegression_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'LogisticRegression_rgz_{}_predictions'.format(field)))
    rf_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'RandomForestClassifier_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'RandomForestClassifier_rgz_{}_predictions'.format(field)))
    cnn_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'CNN_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'CNN_rgz_{}_predictions'.format(field)))

    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False, field=field)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False, field=field)

    swire_names = numpy.array(swire_names)
    swire_coords = numpy.array(swire_coords)

    predictions_map = collections.defaultdict(dict) # SWIRE -> predictor -> probability
    swire_coords_map = {}
    swire_expert_map = {}
    swire_rgz_map = {}
    known_predictors = set()

    for classifier, predictions_ in [['LR', lr_predictions], ['CNN', cnn_predictions], ['RF', rf_predictions]]:
        for predictions in predictions_:
            dataset_name = predictions.dataset_name
            labeller = predictions.labeller
            if labeller == 'rgz' and 'Norris' in dataset_name:
                # Skip the 'RGZ N' combination (RGZ labels on Norris-labelled sets).
                continue
            labeller = labeller.title() if labeller == 'norris' else labeller.upper()
            predictor_name = '{}({} / {})'.format(classifier, labeller, titlemap[dataset_name])
            if field == 'cdfs':
                swire_names_ = swire_names[swire_test_sets[:, pipeline.SET_NAMES['RGZ'], predictions.quadrant]]
                swire_coords_ = swire_coords[swire_test_sets[:, pipeline.SET_NAMES['RGZ'], predictions.quadrant]]
                swire_labels_ = swire_labels[swire_test_sets[:, pipeline.SET_NAMES['RGZ'], predictions.quadrant]]
            else:
                swire_names_ = swire_names[swire_test_sets[:, 0, 0]]
                swire_coords_ = swire_coords[swire_test_sets[:, 0, 0]]
                swire_labels_ = swire_labels[swire_test_sets[:, 0, 0]]
            assert predictions.probabilities.shape[0] == len(swire_names_), \
                'expected {}, got {}'.format(predictions.probabilities.shape[0], len(swire_names_))
            for name, coords, prediction, label in zip(swire_names_, swire_coords_, predictions.probabilities, swire_labels_):
                predictions_map[name][predictor_name] = prediction
                swire_coords_map[name] = coords
                swire_expert_map[name] = label[0]
                swire_rgz_map[name] = label[1]
            known_predictors.add(predictor_name)

    known_predictors = sorted(known_predictors)

    swires = sorted(predictions_map)
    ras = []
    decs = []
    is_expert_host = []
    is_rgz_host = []
    predictor_columns = collections.defaultdict(list)
    for swire in swires:
        for predictor in known_predictors:
            predictor_columns[predictor].append(predictions_map[swire].get(predictor, ''))
        ras.append(swire_coords_map[swire][0])
        decs.append(swire_coords_map[swire][1])
        is_expert_host.append(['no', 'yes'][swire_expert_map[swire]])
        is_rgz_host.append(['no', 'yes'][swire_rgz_map[swire]])

    table = astropy.table.Table(
        data=[swires, ras, decs, is_expert_host, is_rgz_host] + [predictor_columns[p] for p in known_predictors],
        names=['SWIRE', 'RA', 'Dec', 'Expert host', 'RGZ host'] + known_predictors)
    table.write('/Users/alger/data/Crowdastro/predicted_swire_table_{}_21_03_18.csv'.format(field), format='csv')
    for p in known_predictors:
        table[p].format = '{:.4f}'
    table.write('/Users/alger/data/Crowdastro/predicted_swire_table_{}_21_03_18.tex'.format(field), format='latex')
def main(examples=None, classifier='CNN', labeller='Norris'):
    # Load SWIRE stuff.
    swire_names, swire_coords, swire_features = pipeline.generate_swire_features(overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False)
    swire_tree = KDTree(swire_coords)
    swire_name_to_index = {n: i for i, n in enumerate(swire_names)}
    # Load ATLAS coords.
    table = astropy.io.ascii.read(pipeline.TABLE_PATH)
    atlas_to_coords = {}
    atlas_to_swire_coords = {}
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue

        atlas_to_coords[name] = row['Component RA (Franzen)'], row['Component DEC (Franzen)']
        index = swire_name_to_index.get(row['Source SWIRE (Norris)'] or '')
        if index is not None:  # Index 0 is a valid SWIRE index.
            atlas_to_swire_coords[name] = swire_coords[index]

    ir_stretch = astropy.visualization.LogStretch(0.001)
    if examples is None:
        examples = examples_incorrect.get_examples()
        examples = examples[labeller, classifier, 'All']
    for example in examples:
        print('Plotting {}'.format(example))
        predictor_name = '{}_{}'.format(classifier, labeller)
        cid = example[2]
        # Load FITS stuff.
        try:
            radio_fits = astropy.io.fits.open(CDFS_PATH + cid + '_radio.fits')
        except FileNotFoundError:
            if example[1]:  # Has Zooniverse ID
                print('{} not in RGZ'.format(cid))
            continue
        ir_fits = astropy.io.fits.open(CDFS_PATH + cid + '_ir.fits')
        wcs = astropy.wcs.WCS(radio_fits[0].header)
        # Compute info for contour levels. (also from Enno Middelberg)
        median = numpy.median(radio_fits[0].data)
        mad = numpy.median(numpy.abs(radio_fits[0].data - median))
        sigma = mad / mad2sigma
        # Set up the plot.
        fig = plt.figure()
        ax = astropy.visualization.wcsaxes.WCSAxes(
            fig, [0.1, 0.1, 0.8, 0.8], wcs=wcs)
        fig.add_axes(ax)
        ax.set_title('{}'.format(example[0]))
        # Show the infrared.
        ax.imshow(ir_stretch(ir_fits[0].data), cmap='cubehelix_r',
                  origin='lower')
        # Show the radio.
        ax.contour(radio_fits[0].data, colors='black',
                    levels=[nsig * sigma * sigmult ** i for i in range(15)],
                    linewidths=1, origin='lower', zorder=1)
        # Plot predictions.
        predictions = get_predictions(swire_tree, swire_coords, swire_test_sets, atlas_to_coords[example[0]], predictor_name)
        if not predictions:
            print('No predictions for {}'.format(example[0]))
            continue
        coords = [p[0] for p in predictions]
        probabilities = [p[1] for p in predictions]
        coords = wcs.all_world2pix(coords, 1)
        ax.scatter(coords[:, 0], coords[:, 1], s=numpy.sqrt(numpy.array(probabilities)) * 200, color='white', edgecolor='black', linewidth=1, alpha=0.9, marker='o', zorder=2)
        choice = numpy.argmax(probabilities)
        ax.scatter(coords[choice, 0], coords[choice, 1], s=200 / numpy.sqrt(2), color='blue', marker='x', zorder=2.5)
        try:
            norris_coords, = wcs.all_world2pix([atlas_to_swire_coords[example[0]]], 1)
        except KeyError:
            print('No Norris cross-identification for {}'.format(example[0]))
            continue
        ax.scatter(norris_coords[0], norris_coords[1], marker='+', s=200, zorder=3, color='green')
        lon, lat = ax.coords
        lon.set_major_formatter('hh:mm:ss')
        lon.set_axislabel('Right Ascension')
        lat.set_axislabel('Declination')
        fn = '{}_{}_{}'.format(classifier, labeller, example[0])
        plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/images/examples/' + fn + '.png',
            bbox_inches='tight', pad_inches=0)
        plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/images/examples/' + fn + '.pdf',
            bbox_inches='tight', pad_inches=0)
        plt.clf()
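# The plotting code above references module-level constants `mad2sigma`,
# `nsig` and `sigmult` that are not shown in the excerpt. A minimal sketch of
# plausible definitions, assuming the usual MAD-to-standard-deviation
# conversion and root-two contour spacing; the specific values of `nsig` and
# `sigmult` are illustrative assumptions, not taken from the original script.
import numpy
import scipy.special

# MAD = sigma * sqrt(2) * erfinv(0.5), so sigma = mad / mad2sigma (~ mad / 0.6745).
mad2sigma = numpy.sqrt(2) * scipy.special.erfinv(2 * 0.75 - 1)
nsig = 4            # first radio contour at roughly 4 sigma (assumption)
sigmult = 2 ** 0.5  # each contour a factor sqrt(2) above the previous (assumption)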
def plot_predictions(cut=0.95,
                     labeller='norris',
                     dataset_name=None,
                     classifier=None):
    """Plot colour-colour diagram for predicted host galaxies.

    labeller in {'norris', 'rgz'}
    dataset_name in {'RGZ & Norris', ...}
    """

    with h5py.File(CROWDASTRO_PATH, 'r') as f:
        swire_numeric_cdfs = f['/swire/cdfs/numeric'][:, 2:2 + 4]

    f_36 = swire_numeric_cdfs[:, 0]
    f_45 = swire_numeric_cdfs[:, 1]
    f_58 = swire_numeric_cdfs[:, 2]
    f_80 = swire_numeric_cdfs[:, 3]
    detection_58 = (f_58 != -99)
    detection_80 = (f_80 != -99)

    p = pipeline.unserialise_predictions(
        pipeline.WORKING_DIR +
        '{}_{}_cdfs_predictions'.format(classifier, labeller))
    predictions = {}
    for i in p:
        predictions[i.dataset_name, i.quadrant] = i

    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field='cdfs')
    swire_labels = pipeline.generate_swire_labels(swire_names,
                                                  swire_coords,
                                                  overwrite=False,
                                                  field='cdfs')
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords,
                                                          swire_labels,
                                                          overwrite=False,
                                                          field='cdfs')

    xs = []
    ys = []
    colours = []
    for q in range(4):
        swire_set = swire_test_sets[:, pipeline.SET_NAMES['RGZ'], q]
        if labeller == 'norris' and not dataset_name:
            # predictions_set = predictions['RGZ & Norris', q].probabilities > cut
            f_36_ = f_36[swire_set & swire_labels[:, 0]]  #[predictions_set]
            f_45_ = f_45[swire_set & swire_labels[:, 0]]  #[predictions_set]
            f_58_ = f_58[swire_set & swire_labels[:, 0]]  #[predictions_set]
            f_80_ = f_80[swire_set & swire_labels[:, 0]]  #[predictions_set]
            # No classifier probabilities in this branch; use placeholder ones so
            # the colour array below keeps the same length as the fluxes.
            probabilities = numpy.ones_like(f_36_)
        elif labeller == 'rgz' and not dataset_name:
            f_36_ = f_36[swire_set & swire_labels[:, 1]]
            f_45_ = f_45[swire_set & swire_labels[:, 1]]
            f_58_ = f_58[swire_set & swire_labels[:, 1]]
            f_80_ = f_80[swire_set & swire_labels[:, 1]]
            probabilities = numpy.ones_like(f_36_)
        if labeller == 'norris' and dataset_name:
            predictions_set = predictions[dataset_name, q].probabilities > cut
            f_36_ = f_36[swire_set][predictions_set]
            f_45_ = f_45[swire_set][predictions_set]
            f_58_ = f_58[swire_set][predictions_set]
            f_80_ = f_80[swire_set][predictions_set]
            probabilities = predictions[dataset_name,
                                        q].probabilities[predictions_set]
        detection_58_ = (f_58_ != -99)
        detection_80_ = (f_80_ != -99)
        detection_all_ = detection_58_ & detection_80_

        ratio_58_36 = numpy.log10(f_58_[detection_all_] /
                                  f_36_[detection_all_])
        ratio_80_45 = numpy.log10(f_80_[detection_all_] /
                                  f_45_[detection_all_])
        probabilities = probabilities[detection_all_]
        xs.extend(ratio_58_36)
        ys.extend(ratio_80_45)
        colours.extend(probabilities)

    assert len(xs) == len(ys)
    assert len(xs) == len(colours)

    plot_basic()
    if dataset_name:
        plt.scatter(xs,
                    ys,
                    s=20,
                    marker='^',
                    linewidth=0,
                    alpha=0.5,
                    c=numpy.array(colours),
                    cmap='winter')
    else:
        plt.scatter(xs, ys, s=25, c='r', marker='^', linewidth=0)
    plt.xlim((-0.75, 1.0))
    plt.ylim((-0.75, 1.0))
    plt.xlabel('$\\log_{10}(S_{5.8}/S_{3.6})$')
    plt.ylabel('$\\log_{10}(S_{8.0}/S_{4.5})$')
    plt.subplots_adjust(left=0.2, bottom=0.15, right=0.95, top=0.95)
    plt.colorbar()
    plt.show()
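# Hedged usage example for plot_predictions, following its docstring. The
# classifier string matches the prediction-file prefixes used elsewhere in
# this listing ('LogisticRegression', 'CNN', 'RandomForestClassifier'); the
# exact call below is illustrative.
plot_predictions(cut=0.95, labeller='norris', dataset_name='RGZ & Norris',
                 classifier='LogisticRegression')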
Example #6
def plot_grid(field='cdfs'):
    # Load predictions.
    lr_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'LogisticRegression_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'LogisticRegression_rgz_{}_predictions'.format(field)))
    rf_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'RandomForestClassifier_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'RandomForestClassifier_rgz_{}_predictions'.format(field)))
    cnn_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'CNN_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'CNN_rgz_{}_predictions'.format(field)))

    # Convert to the format we need. e.g. {'RGZ' -> [acc, acc, acc, acc]}
    lr_norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    lr_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    rf_norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    rf_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    cnn_norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    cnn_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    for predictions in lr_predictions:
        dataset_name = predictions.dataset_name
        if predictions.labeller == 'norris':
            lr_norris_accuracies[dataset_name][predictions.quadrant] = predictions.balanced_accuracy
        else:
            lr_rgz_accuracies[dataset_name][predictions.quadrant] = predictions.balanced_accuracy
    for predictions in rf_predictions:
        dataset_name = predictions.dataset_name
        if predictions.labeller == 'norris':
            rf_norris_accuracies[dataset_name][predictions.quadrant] = predictions.balanced_accuracy
        else:
            rf_rgz_accuracies[dataset_name][predictions.quadrant] = predictions.balanced_accuracy
    for predictions in cnn_predictions:
        dataset_name = predictions.dataset_name
        if predictions.labeller == 'norris':
            cnn_norris_accuracies[dataset_name][predictions.quadrant] = predictions.balanced_accuracy
        else:
            cnn_rgz_accuracies[dataset_name][predictions.quadrant] = predictions.balanced_accuracy

    if field == 'cdfs':
        # Load RGZ cross-identifications and compute a balanced accuracy with them.
        swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False, field=field)
        swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False, field=field)
        (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False, field=field)
        label_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
        label_norris_accuracies = {sstr: [1] * 4 for sstr in pipeline.SET_NAMES}  # By definition.
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                test_set = swire_test_sets[:, pipeline.SET_NAMES[dataset_name], quadrant]
                predictions = swire_labels[test_set, 1]
                trues = swire_labels[test_set, 0]
                ba = balanced_accuracy(trues, predictions)
                label_rgz_accuracies[dataset_name][quadrant] = ba

    colours = ['grey', 'magenta', 'blue', 'orange']
    markers = ['o', '^', 'x', 's']
    handles = {}
    plt.figure(figsize=(5, 5))

    accuracy_map = defaultdict(lambda: defaultdict(dict))  # For table output.
    output_sets = [
        ('LR', [lr_norris_accuracies, lr_rgz_accuracies]),
        ('CNN', [cnn_norris_accuracies, cnn_rgz_accuracies]),
        ('RF', [rf_norris_accuracies, rf_rgz_accuracies]),
    ]
    if field == 'cdfs':
        output_sets.append(('Labels', [label_norris_accuracies, label_rgz_accuracies]))
    for j, (classifier_name, classifier_set) in enumerate(output_sets):
        for i, set_name in enumerate(norris_labelled_sets):
            if 'compact' not in set_name:  # Skip compact.
                ax = plt.subplot(2, 1, {'RGZ & Norris & resolved': 1, 'RGZ & Norris': 2}[set_name])
                ax.set_ylim((80, 100))
                ax.set_xlim((-0.5, 1.5))
                ax.set_xticks([0, 1])#, 2])
                ax.set_xticklabels(['Norris',
                                    # 'RGZ N',
                                    'RGZ',
                                   ], rotation='horizontal')
                if i == 2:
                    plt.xlabel('Labels')
                plt.ylabel('{}\nBalanced accuracy\n(per cent)'.format(titlemap[set_name]))

                ax.title.set_fontsize(16)
                ax.xaxis.label.set_fontsize(12)
                ax.yaxis.label.set_fontsize(9)
                for tick in ax.get_xticklabels() + ax.get_yticklabels():
                    tick.set_fontsize(10)

                ax.grid(which='major', axis='y', color='#EEEEEE')
            for k in range(4):
                if 'compact' in set_name:
                    continue
                if j != 3:  # !Labels
                    ax.scatter([0 + (j - 1) / 5], classifier_set[0][set_name][k] * 100,
                                color=colours[j], marker=markers[j], linewidth=1, edgecolor='k')
                rgz_offset = ((j - 1.5) / 6) if field == 'cdfs' else (j - 1) / 5
                handles[j] = ax.scatter([1 + rgz_offset],
                           classifier_set[1][fullmap[set_name]][k] * 100,
                           color=colours[j], marker=markers[j], linewidth=1, edgecolor='k')
                # ax.scatter([1 + (j - 1) / 5], classifier_set[1][set_name][k] * 100,
                #            color=colours[j], marker=markers[j], linewidth=1, edgecolor='k')
            # Compute for table.
            for labeller in ['Norris', 'RGZ N', 'RGZ']:
                if labeller == 'Norris':
                    mean = numpy.mean(classifier_set[0][set_name]) * 100
                    stdev = numpy.std(classifier_set[0][set_name]) * 100
                elif labeller == 'RGZ N':
                    continue
                    # mean = numpy.mean(classifier_set[1][set_name]) * 100
                    # stdev = numpy.std(classifier_set[1][set_name]) * 100
                elif labeller == 'RGZ':
                    mean = numpy.mean(classifier_set[1][fullmap[set_name]]) * 100
                    stdev = numpy.std(classifier_set[1][fullmap[set_name]]) * 100
                accuracy_map[labeller][classifier_name][titlemap[set_name]] = '${:.02f} \\pm {:.02f}$'.format(mean, stdev)

    # Assemble table.
    col_labeller = []
    col_classifier = []
    col_compact = []
    col_resolved = []
    col_all = []
    for labeller in ['Norris', 'RGZ N', 'RGZ']:
        if labeller == 'RGZ N':
            continue

        for classifier in ['CNN', 'LR', 'RF'] + (['Labels'] if field == 'cdfs' else []):
            col_labeller.append(labeller)
            col_classifier.append(classifier)
            col_compact.append(accuracy_map[labeller][classifier]['Compact'])
            col_resolved.append(accuracy_map[labeller][classifier]['Resolved'])
            col_all.append(accuracy_map[labeller][classifier]['All'])
    out_table = astropy.table.Table([col_labeller, col_classifier, col_compact, col_resolved, col_all],
                                    names=['Labeller', 'Classifier', "Mean `Compact' accuracy\\\\(per cent)",
                                           "Mean `Resolved' accuracy\\\\(per cent)",
                                           "Mean `All' accuracy\\\\(per cent)"])
    out_table.write('../{}_accuracy_table.tex'.format(field), format='latex')

    plt.figlegend([handles[j] for j in sorted(handles)], ['LR', 'CNN', 'RF'] + (['Labels'] if field == 'cdfs' else []), 'lower center', ncol=4, fontsize=10)
    plt.subplots_adjust(bottom=0.2, hspace=0.25)
    plt.savefig('../images/{}_ba_grid.pdf'.format(field),
                bbox_inches='tight', pad_inches=0)
    plt.savefig('../images/{}_ba_grid.png'.format(field),
                bbox_inches='tight', pad_inches=0)
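# plot_grid above calls a balanced_accuracy helper that is defined elsewhere
# in the project. A minimal sketch of what it presumably computes -- the mean
# of the true positive and true negative rates for binary labels; this is an
# assumption about the helper, not its original implementation.
import numpy

def balanced_accuracy(trues, predictions):
    """Mean of true-positive rate and true-negative rate for binary labels (sketch)."""
    trues = numpy.asarray(trues, dtype=bool)
    predictions = numpy.asarray(predictions, dtype=bool)
    if trues.sum() == 0 or (~trues).sum() == 0:
        return float('nan')  # Undefined when only one class is present.
    tpr = (trues & predictions).sum() / trues.sum()
    tnr = (~trues & ~predictions).sum() / (~trues).sum()
    return (tpr + tnr) / 2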
Example #7
def get_examples():
    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords,
                                                  overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False)
    cids = list(
        pipeline.cross_identify_all(swire_names, swire_coords, swire_labels,
                                    swire_test_sets, swire_labels[:, 0]))
    table = astropy.io.ascii.read(pipeline.TABLE_PATH)

    atlas_to_swire_norris = {}
    key_to_atlas = {}
    atlas_to_ras = {}
    atlas_to_decs = {}
    id_to_atlas = {}
    atlas_to_zid = {}
    atlas_to_id = {}
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue
        id_to_atlas[row['Component ID (Franzen)']] = name
        key_to_atlas[row['Key']] = name
        swire = row['Source SWIRE (Norris)']
        atlas_to_swire_norris[name] = swire
        atlas_to_id[name] = row['Component ID (Franzen)']
        atlas_to_ras[name] = row['Component RA (Franzen)']
        atlas_to_decs[name] = row['Component DEC (Franzen)']
        atlas_to_zid[name] = row['Component Zooniverse ID (RGZ)']

    atlas_to_rgz = {}
    atlas_to_radio_consensus = {}
    atlas_to_ir_consensus = {}
    for row in astropy.io.ascii.read(pipeline.RGZ_PATH):
        name = id_to_atlas[row['atlas_id']]
        atlas_to_radio_consensus[name] = row['consensus.radio_level']
        atlas_to_ir_consensus[name] = row['consensus.ir_level']
        atlas_to_rgz[name] = row['SWIRE.designation']

    cross_identifications = collections.defaultdict(
        dict)  # ATLAS -> labeller -> SWIRE

    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        # For each ATLAS object in RGZ & Norris...
        atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES['RGZ & Norris'],
                                     cid.quadrant].nonzero()[0]
        if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
            labeller = 'RGZ N'
        elif cid.labeller == 'rgz':
            labeller = 'RGZ'
        else:
            labeller = 'Norris'
        for i in atlas_keys:
            name = key_to_atlas[i]
            if name not in atlas_to_swire_norris:
                continue
            if name not in atlas_to_swire_predictor:
                continue
            cross_identifications[name]['Norris'] = atlas_to_swire_norris[name]
            cross_identifications[name]['RGZ'] = atlas_to_rgz.get(name, None)
            cross_identifications[name][
                labeller, cid.classifier,
                titlemap[cid.dataset_name]] = atlas_to_swire_predictor[name]
    """
    For each classifier, pull out examples where:
    - RGZ and Norris agree, but the classifier disagrees.

    Only include RGZ & Norris dataset ("All").
    """
    classifier_to_example = collections.defaultdict(set)
    for atlas, cids_ in cross_identifications.items():
        if cids_['Norris'] != cids_['RGZ']:
            continue

        for classifier, swire in cids_.items():
            # Keys here are either plain strings ('Norris', 'RGZ') or
            # (labeller, classifier, subset) tuples; only tuples for the
            # 'All' subset with a real classifier pass this test.
            if classifier[2] != 'All' or classifier[1] in {
                    'Random', 'Groundtruth'
            }:
                continue

            if swire != cids_['Norris']:
                classifier_to_example[classifier].add(
                    (atlas, atlas_to_zid[atlas], atlas_to_id[atlas]))

    return classifier_to_example
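# Hedged usage sketch for get_examples: the returned mapping is keyed by
# (labeller, classifier, subset title) tuples and holds sets of
# (ATLAS name, Zooniverse ID, component ID) triples, as built above. The key
# ('Norris', 'CNN', 'All') follows the defaults used by the plotting main()
# earlier in this listing; the call is illustrative.
examples = get_examples()
for atlas_name, zooniverse_id, component_id in examples['Norris', 'CNN', 'All']:
    print(atlas_name, zooniverse_id, component_id)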
Example #8
def plot(field='cdfs'):
    log.debug('Getting SWIRE, ATLAS features.')
    swire_names, swire_coords, _ = pipeline.generate_swire_features(
        overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(swire_names,
                                                  swire_coords,
                                                  overwrite=False,
                                                  field=field)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(
        swire_coords, swire_labels, overwrite=False, field=field)
    log.debug('Calling cross-identify.')
    cids = list(
        pipeline.cross_identify_all(swire_names,
                                    swire_coords,
                                    swire_labels,
                                    swire_test_sets,
                                    swire_labels[:, 0],
                                    field=field))

    # Also load the nearest-neighbour cross-identifications.
    cids += [
        pipeline.CrossIdentifications.from_hdf5(
            pipeline.WORKING_DIR +
            'NearestNeighbour_{}_cross_ids_{}_RGZ & Norris.h5'.format(
                field, q)) for q in range(4 if field == 'cdfs' else 1)
    ]

    swire_tree = scipy.spatial.KDTree(swire_coords[swire_test_sets[:, 0, 0]])

    failed_coords = []

    if field == 'cdfs':
        table = astropy.io.ascii.read(pipeline.TABLE_PATH)
        rgzcat = astropy.io.ascii.read(pipeline.RGZ_PATH)

        atlas_to_swire_expert = {}
        atlas_to_swire_rgz = {}
        key_to_atlas = {}
        atlas_id_to_name = {}
        is_compact = {}
        for row in table:
            name = row['Component Name (Franzen)']
            key_to_atlas[row['Key']] = name
            swire = row['Source SWIRE (Norris)']
            if not swire or not swire.startswith('SWIRE') or not name:
                continue
            atlas_id_to_name[row['Component ID (Franzen)']] = name
            atlas_to_swire_expert[name] = swire
            is_compact[name] = pipeline.compact_test(row)
        for row in rgzcat:
            swire_name = row['SWIRE.designation']
            if not swire_name or swire_name == '-99':
                continue
            name = atlas_id_to_name.get(row['atlas_id'], None)
            atlas_to_swire_rgz[name] = swire_name
    else:
        atlas_to_swire_expert = {}
        with astropy.io.fits.open(
                pipeline.MIDDELBERG_TABLE4_PATH) as elais_components_fits:
            elais_components = elais_components_fits[1].data
            atlas_cid_to_name = {}
            atlas_names = []  # Indices correspond to table 4 rows.
            atlas_name_to_compact = {}
            for component in elais_components:
                cid = component['CID']
                name = component['ATELAIS']
                atlas_names.append(name)
                atlas_cid_to_name[cid] = name
                row = {
                    'Component S (Franzen)':
                    component['Sint'],  # Fitting in with the CDFS API...
                    'Component S_ERR (Franzen)': component['e_Sint'],
                    'Component Sp (Franzen)': component['Sp'],
                    'Component Sp_ERR (Franzen)': component['e_Sp']
                }
                atlas_name_to_compact[name] = pipeline.compact_test(row)
        with open(pipeline.MIDDELBERG_TABLE5_PATH) as elais_file:
            # Took this code from pipeline.py, probably should make it a function
            lines = [line.split('|') for line in elais_file]
            for line in lines:
                if 'ATELAISJ' not in line[0]:
                    continue

                line_cids = line[1]
                if 'C0' not in line_cids and 'C1' not in line_cids:
                    continue

                line_cids = [cid.strip() for cid in line_cids.split(',')]
                swire_coord_re = re.search(
                    r'SWIRE4J(\d\d)(\d\d)(\d\d\.\d\d)(-\d\d)(\d\d)(\d\d\.\d)',
                    line[2])
                if not swire_coord_re:
                    continue
                swire_coord_list = swire_coord_re.groups()
                coord = astropy.coordinates.SkyCoord(
                    ra='{} {} {}'.format(*swire_coord_list[:3]),
                    dec='{} {} {}'.format(*swire_coord_list[3:]),
                    unit=('hourangle', 'deg'))
                coord = (coord.ra.deg, coord.dec.deg)
                # Nearest SWIRE...
                dist, nearest = swire_tree.query(coord)
                if dist > 5 / 60 / 60:
                    logging.debug(
                        'No SWIRE match found for Middelberg cross-identification {}'
                        .format(line[0]))
                    logging.debug('Nearest is {} ({:.01f} arcsec)'.format(
                        numpy.array(swire_names)[swire_test_sets[:, 0,
                                                                 0]][nearest],
                        dist * 60 * 60))
                    logging.debug('Middelberg: {}'.format(
                        swire_coord_re.group()))
                    failed_coords.append(coord)
                    continue
                name = numpy.array(swire_names)[swire_test_sets[:, 0,
                                                                0]][nearest]
                for cid in line_cids:
                    atlas_to_swire_expert[atlas_cid_to_name[cid]] = name

    labeller_classifier_to_accuracies = collections.defaultdict(list)

    # Augment the CIDs by duplicating the "resolved" cross-ids to make the "all" set.
    resolved_cids_copy = [
        copy.copy(cid) for cid in cids if 'resolved' in cid.dataset_name
    ]
    for cid in resolved_cids_copy:
        cid.dataset_name = cid.dataset_name.replace(' & resolved', '')
    cids.extend(resolved_cids_copy)

    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue

        if cid.classifier in {'Groundtruth', 'Random', 'NearestNeighbour'}:
            # Deal with these later as they are special.
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        n_total = 0
        n_correct = 0
        n_skipped = 0
        n_compact = 0
        if field == 'cdfs':
            atlas_keys = atlas_test_sets[:, pipeline.
                                         SET_NAMES[whatset[cid.dataset_name]],
                                         cid.quadrant].nonzero()[0]
            # For each ATLAS object in RGZ & Norris...
            for i in atlas_keys:
                name = key_to_atlas[i]
                if name not in atlas_to_swire_expert:
                    n_skipped += 1
                    continue
                if name not in atlas_to_swire_predictor:
                    n_skipped += 1
                    continue
                swire_norris = atlas_to_swire_expert[name]
                swire_predictor = atlas_to_swire_predictor[name]
                n_correct += swire_norris == swire_predictor
                n_total += 1
        else:
            # Only one test set for ELAIS.
            atlas_indices = atlas_test_sets[:, 0, 0].nonzero()[0]
            assert atlas_test_sets.shape[0] == len(atlas_names)
            for index in atlas_indices:
                # Screen resolved here.
                atlas_name = atlas_names[index]
                if atlas_name not in atlas_to_swire_expert:
                    n_skipped += 1
                    continue
                if atlas_name not in atlas_to_swire_predictor:
                    n_skipped += 1
                    continue
                if 'resolved' in cid.dataset_name and atlas_name_to_compact[
                        atlas_name]:
                    n_compact += 1
                    continue
                swire_middelberg = atlas_to_swire_expert[atlas_name]
                swire_predictor = atlas_to_swire_predictor[atlas_name]
                n_correct += swire_middelberg == swire_predictor
                n_total += 1
            # print('Compact: {:.02%}'.format(n_compact / (n_total + n_compact)))
        if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
            labeller = 'RGZ N'
        elif cid.labeller == 'rgz':
            labeller = 'RGZ'
        else:
            labeller = 'Norris'
        labeller_classifier_to_accuracies[labeller, cid.classifier,
                                          titlemap[cid.dataset_name]].append(
                                              n_correct / n_total)

    # Groundtruth, random, and NN classifiers exist only for the RGZ & Norris set, but we want to test on all subsets.
    # This section duplicates the classifiers and evaluates them on all subsets.
    for cid in cids:
        if cid.classifier not in {'Groundtruth', 'Random', 'NearestNeighbour'}:
            continue

        for dataset_name in [
                'RGZ & Norris', 'RGZ & Norris & resolved',
                'RGZ & Norris & compact'
        ]:
            atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
            n_total = 0
            n_correct = 0
            n_skipped = 0
            if field == 'cdfs':
                # For each ATLAS object in RGZ & Norris...
                atlas_keys = atlas_test_sets[:,
                                             pipeline.SET_NAMES[dataset_name],
                                             cid.quadrant].nonzero()[0]
                for i in atlas_keys:
                    name = key_to_atlas[i]
                    if name not in atlas_to_swire_expert:
                        n_skipped += 1
                        continue
                    if name not in atlas_to_swire_predictor:
                        n_skipped += 1
                        continue
                    swire_norris = atlas_to_swire_expert[name]
                    swire_predictor = atlas_to_swire_predictor[name]
                    n_correct += swire_norris == swire_predictor
                    if cid.classifier == 'NearestNeighbour' and swire_norris != swire_predictor:
                        pass
                    n_total += 1
            else:
                atlas_indices = atlas_test_sets[:, 0, 0].nonzero()[0]
                assert atlas_test_sets.shape[0] == len(atlas_names)
                for index in atlas_indices:
                    # Screen resolved here (because the test sets aren't useful for that for ELAIS)
                    atlas_name = atlas_names[index]
                    if 'resolved' in dataset_name and atlas_name_to_compact[
                            atlas_name]:
                        continue
                    if atlas_name not in atlas_to_swire_expert:
                        n_skipped += 1
                        continue
                    if atlas_name not in atlas_to_swire_predictor:
                        n_skipped += 1
                        continue
                    swire_middelberg = atlas_to_swire_expert[atlas_name]
                    swire_predictor = atlas_to_swire_predictor[atlas_name]
                    n_correct += swire_middelberg == swire_predictor
                    n_total += 1

            if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
                labeller = 'RGZ N'
            elif cid.labeller == 'rgz':
                labeller = 'RGZ'
            else:
                labeller = 'Norris'
            print(labeller, cid.classifier, titlemap[dataset_name], n_correct,
                  n_total, n_correct / n_total)
            labeller_classifier_to_accuracies[labeller, cid.classifier,
                                              titlemap[dataset_name]].append(
                                                  n_correct / n_total)

    if field == 'cdfs':
        # Compute accuracy for RGZ.
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                # N.B. Disabled using the pipeline for RGZ.
                # # Compact objects are cross-identified in a separate pipeline, which is slow so I don't want to reproduce it here.
                # # So I'll read the compact object cross-identifications from the LR(RGZ) cross-identification set, since it ought
                # # to be the same.
                # corresponding_set, = [cid for cid in cids if cid.quadrant == quadrant
                #                                              and cid.dataset_name == dataset_name
                #                                              and cid.labeller == 'rgz'
                #                                              and cid.classifier == 'LogisticRegression']
                # atlas_to_swire_lr = dict(zip(corresponding_set.radio_names, corresponding_set.ir_names))
                n_total = 0
                n_correct = 0
                n_skipped = 0
                n_compact = 0
                atlas_keys = atlas_test_sets[:, pipeline.
                                             SET_NAMES[whatset[dataset_name]],
                                             quadrant].nonzero()[0]
                # For each ATLAS object in RGZ & Norris...
                for i in atlas_keys:
                    name = key_to_atlas[i]
                    if name not in atlas_to_swire_expert:
                        n_skipped += 1
                        continue
                    if name not in atlas_to_swire_rgz:  # or name not in atlas_to_swire_lr:
                        n_skipped += 1
                        continue
                    if False and is_compact[name]:
                        swire_predictor = atlas_to_swire_lr[name]
                    else:
                        swire_predictor = atlas_to_swire_rgz[name]
                    swire_norris = atlas_to_swire_expert[name]
                    n_correct += swire_norris == swire_predictor
                    n_total += 1
                labeller_classifier_to_accuracies[
                    'RGZ', 'Label',
                    titlemap[dataset_name]].append(n_correct / n_total)

    labeller_classifier_to_accuracy = {}
    labeller_classifier_to_stdev = {}
    for key, accuracies in labeller_classifier_to_accuracies.items():
        print('Best {}:'.format(key), max(accuracies))
        labeller_classifier_to_accuracy[key] = numpy.mean(accuracies)
        labeller_classifier_to_stdev[key] = numpy.std(accuracies)

    random_acc = {
        k[2]: v * 100
        for k, v in labeller_classifier_to_accuracy.items() if k[1] == 'Random'
    }
    random_stdev = {
        k[2]: v * 100
        for k, v in labeller_classifier_to_stdev.items() if k[1] == 'Random'
    }
    best_acc = {
        k[2]: v * 100
        for k, v in labeller_classifier_to_accuracy.items()
        if k[1] == 'Groundtruth'
    }
    best_stdev = {
        k[2]: v * 100
        for k, v in labeller_classifier_to_stdev.items()
        if k[1] == 'Groundtruth'
    }

    print('Best: {} +- {}'.format(best_acc, best_stdev))
    print('Random: {} +- {}'.format(random_acc, random_stdev))

    plt.figure()
    colours = ['grey', 'magenta', 'blue', 'orange', 'grey']
    markers = ['o', '^', 'x', 's', '*']
    handles = {}
    print('Data set & Labeller & Classifier & Mean accuracy (\\%)\\\\')
    for k, set_name in enumerate(norris_labelled_sets[1:]):
        if 'resolved' in set_name:
            # https://github.com/MatthewJA/radio/issues/22
            continue

        k -= 1

        print_set_name = titlemap[set_name]
        ax = plt.subplot(1, 1, 1 + k)  # 22
        print('{} & Norris & Perfect & ${:.02f} \\pm {:.02f}$\\\\'.format(
            print_set_name, best_acc[titlemap[set_name]],
            best_stdev[titlemap[set_name]]))
        print('{} & Norris & Random & ${:.02f} \\pm {:.02f}$\\\\'.format(
            print_set_name, random_acc[titlemap[set_name]],
            random_stdev[titlemap[set_name]]))
        plt.hlines(best_acc[titlemap[set_name]],
                   -0.5,
                   2.5,
                   linestyles='solid',
                   colors='green',
                   linewidth=1,
                   zorder=1)
        plt.fill_between(
            [-1, 2],
            [best_acc[titlemap[set_name]] - best_stdev[titlemap[set_name]]] *
            2,
            [best_acc[titlemap[set_name]] + best_stdev[titlemap[set_name]]] *
            2,
            linestyle='dashed',
            color='green',
            alpha=0.2,
            linewidth=1,
            zorder=1)
        plt.hlines(random_acc[titlemap[set_name]],
                   -0.5,
                   2.5,
                   linestyles='solid',
                   colors='blue',
                   linewidth=1,
                   zorder=1,
                   alpha=0.7)
        plt.fill_between([-1, 2],
                         [
                             random_acc[titlemap[set_name]] -
                             random_stdev[titlemap[set_name]]
                         ] * 2,
                         [
                             random_acc[titlemap[set_name]] +
                             random_stdev[titlemap[set_name]]
                         ] * 2,
                         linestyle='dashed',
                         color='blue',
                         alpha=0.2,
                         linewidth=1,
                         zorder=1)
        for i, labeller in enumerate(['Norris', 'RGZ']):
            for j, classifier in enumerate(
                ['LogisticRegression', 'CNN', 'RandomForestClassifier'] +
                (['Label', 'NearestNeighbour'] if field ==
                 'cdfs' else ['NearestNeighbour'])):

                ys = numpy.array(labeller_classifier_to_accuracies[
                    labeller, classifier, titlemap[set_name]]) * 100
                if classifier != 'NearestNeighbour':
                    x_offset = i + (
                        j - 1
                    ) / 5 if labeller == 'Norris' or field == 'elais' else i + (
                        j - 1.5) / 6
                else:
                    # NN
                    plt.axhline(numpy.mean(ys),
                                color='grey',
                                linestyle='-.',
                                linewidth=1)
                    if field == 'cdfs':
                        plt.fill_between([-1, 2],
                                         [numpy.mean(ys) - numpy.std(ys)] * 2,
                                         [numpy.mean(ys) + numpy.std(ys)] * 2,
                                         color='grey',
                                         linestyle='-.',
                                         alpha=0.2,
                                         linewidth=1)
                    x_offset = 2

                if classifier == 'Label' and labeller == 'RGZ':
                    plt.annotate('{:.1%}'.format(numpy.mean(ys) / 100),
                                 (x_offset, 72.5),
                                 ha='center',
                                 va='bottom')
                    plt.arrow(x_offset,
                              72.5,
                              0,
                              -1.5,
                              head_width=0.05,
                              head_length=1,
                              ec='k',
                              fc='k')

                xs = [x_offset] * len(ys)
                print('{} & {} & {} & ${:.02f} \\pm {:.02f}$\\\\'.format(
                    print_set_name, labeller, classifier, numpy.mean(ys),
                    numpy.std(ys)))
                ax.set_xlim((-0.5, 1.5))
                ax.set_ylim((70, 100))
                ax.set_xticks([0, 1])
                ax.set_xticklabels(['Norris', 'RGZ'])
                ax.set_yticklabels(
                    ['{}\%'.format(x) for x in range(70, 101, 5)])
                handles[j] = plt.scatter(xs,
                                         ys,
                                         color=colours[j],
                                         marker=markers[j],
                                         zorder=2,
                                         edgecolor='k',
                                         linewidth=1)
            # if k == 0:  # 22
            #     plt.xlabel('Labels')
            plt.ylabel('Cross-identification\naccuracy (per cent)')

            # ax.title.set_fontsize(16)
            # ax.xaxis.label.set_fontsize(12)
            # ax.yaxis.label.set_fontsize(9)
            # for tick in ax.get_xticklabels() + ax.get_yticklabels():
            #     tick.set_fontsize(10)

            ax.grid(which='major', axis='y', color='#DDDDDD')

    # Print the table.
    print('\\hline')
    print('Labeller & Classifier & Mean `Resolved\' & Mean `All\'\\\\')
    print('&& accuracy (per cent) & accuracy (per cent)\\\\')
    print('\\hline')
    for labeller in ['Norris', 'RGZ']:
        for classifier in [
                'CNN', 'LogisticRegression', 'RandomForestClassifier',
                'Groundtruth', 'Random', 'Label', 'NearestNeighbour'
        ]:
            if labeller == 'RGZ' and classifier in {
                    'Groundtruth', 'Random', 'NearestNeighbour'
            }:
                continue

            if labeller == 'Norris' and classifier == 'Label':
                continue

            print(
                '{} & {} & ${:.1f} \\pm {:.1f}$ & ${:.1f} \\pm {:.1f}$'.format(
                    labeller, classifier,
                    numpy.array(
                        labeller_classifier_to_accuracies[labeller, classifier,
                                                          'Resolved']).mean() *
                    100,
                    numpy.array(
                        labeller_classifier_to_accuracies[labeller, classifier,
                                                          'Resolved']).std() *
                    100,
                    numpy.array(
                        labeller_classifier_to_accuracies[labeller, classifier,
                                                          'All']).mean() * 100,
                    numpy.array(
                        labeller_classifier_to_accuracies[labeller, classifier,
                                                          'All']).std() * 100))
    plt.gca().tick_params(axis='both',
                          which='major',
                          direction='out',
                          length=5)
    plt.gca().tick_params(axis='y', which='minor', direction='out', length=3)
    plt.gca().minorticks_on()

    plt.figlegend([handles[j] for j in sorted(handles)], ['LR', 'CNN', 'RF'] +
                  (['Labels'] if field == 'cdfs' else []),
                  'lower center',
                  ncol=4)
    plt.subplots_adjust(bottom=0.25, hspace=0.25, left=0.3)
    plt.savefig('../images/{}_cross_identification_grid.pdf'.format(field))
    plt.savefig('../images/{}_cross_identification_grid.png'.format(field))
Example #9
def main(classifier='CNN', labeller='Norris'):
    # Load SWIRE stuff.
    swire_names, swire_coords, swire_features = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names,
                                                  swire_coords,
                                                  overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords,
                                                          swire_labels,
                                                          overwrite=False)
    swire_tree = KDTree(swire_coords)
    swire_name_to_index = {n: i for i, n in enumerate(swire_names)}

    atlas_names = []
    atlas_compactnesses = []
    atlas_coords = []
    atlas_norris_swire = []

    table = astropy.io.ascii.read(pipeline.TABLE_PATH)
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue

        if not row['Component Zooniverse ID (RGZ)']:
            continue

        compactness = pipeline.compactness(row)
        atlas_names.append(name)
        atlas_compactnesses.append(compactness)
        atlas_coords.append(
            (row['Component RA (Franzen)'], row['Component DEC (Franzen)']))
        atlas_norris_swire.append(row['Source SWIRE (Norris)'])

    ys = []
    xs_entropy = []
    xs_margin = []
    no_groundtruth = []
    correct = []

    for name, compactness, coords, swire in zip(atlas_names,
                                                atlas_compactnesses,
                                                atlas_coords,
                                                atlas_norris_swire):
        predictor_name = '{}_{}'.format(classifier, labeller)
        predictions = get_predictions(swire_tree, swire_coords,
                                      numpy.array(swire_names),
                                      swire_test_sets, coords, predictor_name)

        if not predictions:
            print('No predictions for {}'.format(name))
            continue

        chosen_swire = predictions[numpy.argmax([p
                                                 for _, p in predictions])][0]
        predictions = [p for _, p in predictions]

        predictions_softmax = [
            numpy.exp(p) / sum(numpy.exp(p) for p in predictions)
            for p in predictions
        ]
        if len(predictions_softmax) == 1:
            entropy_ambiguity = 0
            margin_ambiguity = 0
        else:
            entropy_ambiguity = -sum(p * numpy.log(p)
                                     for p in predictions_softmax if p)
            predictions.sort()
            margin_ambiguity = 1 - (predictions[-1] - predictions[-2])

        ys.append(compactness)
        xs_entropy.append(entropy_ambiguity)
        xs_margin.append(margin_ambiguity)
        no_groundtruth.append(not swire or not swire.startswith('SWIRE'))
        correct.append(swire == chosen_swire)

    ys = numpy.array(ys)
    xs_margin = numpy.array(xs_margin)
    xs_entropy = numpy.array(xs_entropy)
    no_groundtruth = numpy.array(no_groundtruth, dtype=bool)
    correct = numpy.array(correct, dtype=bool)

    print(sum(1 for y in ys if y <= 1))

    plt.subplot(1, 2, 1)
    plt.scatter(xs_margin[no_groundtruth],
                ys[no_groundtruth],
                marker='x',
                color='black',
                alpha=0.05)
    plt.scatter(xs_margin[~no_groundtruth & correct],
                ys[~no_groundtruth & correct],
                marker='x',
                color='blue',
                alpha=0.7)
    plt.scatter(xs_margin[~no_groundtruth & ~correct],
                ys[~no_groundtruth & ~correct],
                marker='x',
                color='magenta',
                alpha=0.7)
    plt.title('Margin')
    plt.xlabel('1 - margin')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    plt.axhline(1, min(xs_margin), max(xs_margin))
    plt.subplot(1, 2, 2)
    plt.scatter(xs_entropy[no_groundtruth],
                ys[no_groundtruth],
                marker='x',
                color='black',
                alpha=0.05)
    plt.scatter(xs_entropy[~no_groundtruth & correct],
                ys[~no_groundtruth & correct],
                marker='x',
                color='blue',
                alpha=0.7)
    plt.scatter(xs_entropy[~no_groundtruth & ~correct],
                ys[~no_groundtruth & ~correct],
                marker='x',
                color='magenta',
                alpha=0.7)
    plt.title('Entropy')
    plt.xlabel('Entropy')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    plt.axhline(1,
                min(xs_entropy),
                max(xs_entropy),
                zorder=-100,
                linestyle='--',
                color='black')
    plt.show()
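# The loop above scores every candidate SWIRE host for an ATLAS component and
# quantifies how ambiguous the best choice is in two ways: the margin between
# the two highest raw scores, and the entropy of a softmax over all scores.
# A minimal, self-contained sketch of those two measures (an illustrative
# helper, not part of the crowdastro pipeline):
def softmax_ambiguities(scores):
    """Return (margin_ambiguity, entropy_ambiguity) for a list of scores."""
    import numpy
    scores = numpy.sort(numpy.asarray(scores, dtype=float))
    if scores.size < 2:
        return 0.0, 0.0
    margin_ambiguity = 1.0 - (scores[-1] - scores[-2])
    exps = numpy.exp(scores - scores.max())  # Shift by the max for stability.
    softmax = exps / exps.sum()
    nonzero = softmax > 0
    entropy_ambiguity = -float(numpy.sum(softmax[nonzero] * numpy.log(softmax[nonzero])))
    return margin_ambiguity, entropy_ambiguity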
def plot(field='cdfs'):
    log.debug('Getting SWIRE, ATLAS features.')
    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False, field=field)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False, field=field)
    log.debug('Calling cross-identify.')
    cids = list(pipeline.cross_identify_all(swire_names, swire_coords, swire_labels, swire_test_sets, swire_labels[:, 0], field=field))

    # Also load the nearest-neighbour cross-identifications.
    cids += [pipeline.CrossIdentifications.from_hdf5(
        pipeline.WORKING_DIR + 'NearestNeighbour_{}_cross_ids_{}_RGZ & Norris.h5'.format(field, q)) for q in range(4 if field == 'cdfs' else 1)]

    swire_tree = scipy.spatial.KDTree(swire_coords[swire_test_sets[:, 0, 0]])
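    # N.B. The KDTree treats (RA, Dec) as Euclidean coordinates; this is
    # adequate for the arcsecond-scale matches queried below, well away from
    # the poles and the RA wrap.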

    failed_coords = []

    if field == 'cdfs':
        table = astropy.io.ascii.read(pipeline.TABLE_PATH)
        rgzcat = astropy.io.ascii.read(pipeline.RGZ_PATH)

        atlas_to_swire_expert = {}
        atlas_to_swire_rgz = {}
        key_to_atlas = {}
        atlas_id_to_name = {}
        is_compact = {}
        for row in table:
            name = row['Component Name (Franzen)']
            key_to_atlas[row['Key']] = name
            swire = row['Source SWIRE (Norris)']
            if not swire or not swire.startswith('SWIRE') or not name:
                continue
            atlas_id_to_name[row['Component ID (Franzen)']] = name
            atlas_to_swire_expert[name] = swire
            is_compact[name] = pipeline.compact_test(row)
        for row in rgzcat:
            swire_name = row['SWIRE.designation']
            if not swire_name or swire_name == '-99':
                continue
            name = atlas_id_to_name.get(row['atlas_id'])
            if name is None:
                # RGZ entry with no matching Franzen component.
                continue
            atlas_to_swire_rgz[name] = swire_name
    else:
        atlas_to_swire_expert = {}
        with astropy.io.fits.open(pipeline.MIDDELBERG_TABLE4_PATH) as elais_components_fits:
            elais_components = elais_components_fits[1].data
            atlas_cid_to_name = {}
            atlas_names = []  # Indices correspond to table 4 rows.
            atlas_name_to_compact = {}
            for component in elais_components:
                cid = component['CID']
                name = component['ATELAIS']
                atlas_names.append(name)
                atlas_cid_to_name[cid] = name
                row = {'Component S (Franzen)': component['Sint'],  # Fitting in with the CDFS API...
                       'Component S_ERR (Franzen)': component['e_Sint'],
                       'Component Sp (Franzen)': component['Sp'],
                       'Component Sp_ERR (Franzen)': component['e_Sp']}
                atlas_name_to_compact[name] = pipeline.compact_test(row)
        with open(pipeline.MIDDELBERG_TABLE5_PATH) as elais_file:
            # Adapted from pipeline.py; this parsing should probably be factored into a shared function.
            lines = [line.split('|') for line in elais_file]
            for line in lines:
                if 'ATELAISJ' not in line[0]:
                    continue

                line_cids = line[1]
                if 'C0' not in line_cids and 'C1' not in line_cids:
                    continue

                line_cids = [cid.strip() for cid in line_cids.split(',')]
                swire_coord_re = re.search(r'SWIRE4J(\d\d)(\d\d)(\d\d\.\d\d)(-\d\d)(\d\d)(\d\d\.\d)', line[2])
                if not swire_coord_re:
                    continue
                swire_coord_list = swire_coord_re.groups()
                coord = astropy.coordinates.SkyCoord(
                    ra='{} {} {}'.format(*swire_coord_list[:3]),
                    dec='{} {} {}'.format(*swire_coord_list[3:]),
                    unit=('hourangle', 'deg'))
                coord = (coord.ra.deg, coord.dec.deg)
                # Nearest SWIRE...
                dist, nearest = swire_tree.query(coord)
                if dist > 5 / 60 / 60:
                    log.debug('No SWIRE match found for Middelberg cross-identification {}'.format(line[0]))
                    log.debug('Nearest is {} ({:.01f} arcsec)'.format(numpy.array(swire_names)[swire_test_sets[:, 0, 0]][nearest], dist * 60 * 60))
                    log.debug('Middelberg: {}'.format(swire_coord_re.group()))
                    failed_coords.append(coord)
                    continue
                name = numpy.array(swire_names)[swire_test_sets[:, 0, 0]][nearest]
                for cid in line_cids:
                    atlas_to_swire_expert[atlas_cid_to_name[cid]] = name

    labeller_classifier_to_accuracies = collections.defaultdict(list)

    # Augment the CIDs by duplicating the "resolved" cross-ids to make the "all" set.
    resolved_cids_copy = [copy.copy(cid) for cid in cids if 'resolved' in cid.dataset_name]
    for cid in resolved_cids_copy:
        cid.dataset_name = cid.dataset_name.replace(' & resolved', '')
    cids.extend(resolved_cids_copy)

    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue

        if cid.classifier in {'Groundtruth', 'Random', 'NearestNeighbour'}:
            # Deal with these later as they are special.
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        n_total = 0
        n_correct = 0
        n_skipped = 0
        n_compact = 0
        if field == 'cdfs':
            atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES[whatset[cid.dataset_name]], cid.quadrant].nonzero()[0]
            # For each ATLAS object in RGZ & Norris...
            for i in atlas_keys:
                name = key_to_atlas[i]
                if name not in atlas_to_swire_expert:
                    n_skipped += 1
                    continue
                if name not in atlas_to_swire_predictor:
                    n_skipped += 1
                    continue
                swire_norris = atlas_to_swire_expert[name]
                swire_predictor = atlas_to_swire_predictor[name]
                n_correct += swire_norris == swire_predictor
                n_total += 1
        else:
            # Only one test set for ELAIS.
            atlas_indices = atlas_test_sets[:, 0, 0].nonzero()[0]
            assert atlas_test_sets.shape[0] == len(atlas_names)
            for index in atlas_indices:
                # Screen resolved here.
                atlas_name = atlas_names[index]
                if atlas_name not in atlas_to_swire_expert:
                    n_skipped += 1
                    continue
                if atlas_name not in atlas_to_swire_predictor:
                    n_skipped += 1
                    continue
                if 'resolved' in cid.dataset_name and atlas_name_to_compact[atlas_name]:
                    n_compact += 1
                    continue
                swire_middelberg = atlas_to_swire_expert[atlas_name]
                swire_predictor = atlas_to_swire_predictor[atlas_name]
                n_correct += swire_middelberg == swire_predictor
                n_total += 1
            # print('Compact: {:.02%}'.format(n_compact / (n_total + n_compact)))
        if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
            labeller = 'RGZ N'
        elif cid.labeller == 'rgz':
            labeller = 'RGZ'
        else:
            labeller = 'Norris'
        labeller_classifier_to_accuracies[labeller, cid.classifier, titlemap[cid.dataset_name]].append(n_correct / n_total)

    # Groundtruth, random, and NN classifiers exist only for the RGZ & Norris set, but we want to test on all subsets.
    # This section duplicates the classifiers and evaluates them on all subsets.
    for cid in cids:
        if cid.classifier not in {'Groundtruth', 'Random', 'NearestNeighbour'}:
            continue

        for dataset_name in ['RGZ & Norris', 'RGZ & Norris & resolved', 'RGZ & Norris & compact']:
            atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
            n_total = 0
            n_correct = 0
            n_skipped = 0
            if field == 'cdfs':
                # For each ATLAS object in RGZ & Norris...
                atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES[dataset_name], cid.quadrant].nonzero()[0]
                for i in atlas_keys:
                    name = key_to_atlas[i]
                    if name not in atlas_to_swire_expert:
                        n_skipped += 1
                        continue
                    if name not in atlas_to_swire_predictor:
                        n_skipped += 1
                        continue
                    swire_norris = atlas_to_swire_expert[name]
                    swire_predictor = atlas_to_swire_predictor[name]
                    n_correct += swire_norris == swire_predictor
                    n_total += 1
            else:
                atlas_indices = atlas_test_sets[:, 0, 0].nonzero()[0]
                assert atlas_test_sets.shape[0] == len(atlas_names)
                for index in atlas_indices:
                    # Screen resolved here (because the test sets aren't useful for that for ELAIS)
                    atlas_name = atlas_names[index]
                    if 'resolved' in dataset_name and atlas_name_to_compact[atlas_name]:
                        continue
                    if atlas_name not in atlas_to_swire_expert:
                        n_skipped += 1
                        continue
                    if atlas_name not in atlas_to_swire_predictor:
                        n_skipped += 1
                        continue
                    swire_middelberg = atlas_to_swire_expert[atlas_name]
                    swire_predictor = atlas_to_swire_predictor[atlas_name]
                    n_correct += swire_middelberg == swire_predictor
                    n_total += 1

            if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
                labeller = 'RGZ N'
            elif cid.labeller == 'rgz':
                labeller = 'RGZ'
            else:
                labeller = 'Norris'
            print(labeller, cid.classifier, titlemap[dataset_name], n_correct, n_total, n_correct / n_total)
            labeller_classifier_to_accuracies[labeller, cid.classifier, titlemap[dataset_name]].append(n_correct / n_total)

    if field == 'cdfs':
        # Compute accuracy for RGZ.
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                # N.B. Disabled using the pipeline for RGZ.
                # # Compact objects are cross-identified in a separate pipeline, which is slow so I don't want to reproduce it here.
                # # So I'll read the compact object cross-identifications from the LR(RGZ) cross-identification set, since it ought
                # # to be the same.
                # corresponding_set, = [cid for cid in cids if cid.quadrant == quadrant
                #                                              and cid.dataset_name == dataset_name
                #                                              and cid.labeller == 'rgz'
                #                                              and cid.classifier == 'LogisticRegression']
                # atlas_to_swire_lr = dict(zip(corresponding_set.radio_names, corresponding_set.ir_names))
                n_total = 0
                n_correct = 0
                n_skipped = 0
                n_compact = 0
                atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES[whatset[dataset_name]], quadrant].nonzero()[0]
                # For each ATLAS object in RGZ & Norris...
                for i in atlas_keys:
                    name = key_to_atlas[i]
                    if name not in atlas_to_swire_expert:
                        n_skipped += 1
                        continue
                    if name not in atlas_to_swire_rgz:  # or name not in atlas_to_swire_lr:
                        n_skipped += 1
                        continue
                    if False and is_compact[name]:
                        # Disabled branch: would fall back to the LR
                        # cross-identification for compact objects (see N.B. above).
                        swire_predictor = atlas_to_swire_lr[name]
                    else:
                        swire_predictor = atlas_to_swire_rgz[name]
                    swire_norris = atlas_to_swire_expert[name]
                    n_correct += swire_norris == swire_predictor
                    n_total += 1
                labeller_classifier_to_accuracies['RGZ', 'Label', titlemap[dataset_name]].append(n_correct / n_total)

    labeller_classifier_to_accuracy = {}
    labeller_classifier_to_stdev = {}
    for key, accuracies in labeller_classifier_to_accuracies.items():
        print('Best {}:'.format(key), max(accuracies))
        labeller_classifier_to_accuracy[key] = numpy.mean(accuracies)
        labeller_classifier_to_stdev[key] = numpy.std(accuracies)

    random_acc = {k[2]: v * 100
                  for k, v in labeller_classifier_to_accuracy.items()
                  if k[1] == 'Random'}
    random_stdev = {k[2]: v * 100
                    for k, v in labeller_classifier_to_stdev.items()
                    if k[1] == 'Random'}
    best_acc = {k[2]: v * 100
                for k, v in labeller_classifier_to_accuracy.items()
                if k[1] == 'Groundtruth'}
    best_stdev = {k[2]: v * 100
                  for k, v in labeller_classifier_to_stdev.items()
                  if k[1] == 'Groundtruth'}

    print('Best: {} +- {}'.format(best_acc, best_stdev))
    print('Random: {} +- {}'.format(random_acc, random_stdev))

    plt.figure()
    colours = ['grey', 'magenta', 'blue', 'orange', 'grey']
    markers = ['o', '^', 'x', 's', '*']
    handles = {}
    print('Data set & Labeller & Classifier & Mean accuracy (\\%)\\\\')
    for k, set_name in enumerate(norris_labelled_sets[1:]):
        if 'resolved' in set_name:
            # https://github.com/MatthewJA/radio/issues/22
            continue

        k -= 1

        print_set_name = titlemap[set_name]
        ax = plt.subplot(1, 1, 1 + k)  # 22
        print('{} & Norris & Perfect & ${:.02f} \\pm {:.02f}$\\\\'.format(print_set_name, best_acc[titlemap[set_name]], best_stdev[titlemap[set_name]]))
        print('{} & Norris & Random & ${:.02f} \\pm {:.02f}$\\\\'.format(print_set_name, random_acc[titlemap[set_name]], random_stdev[titlemap[set_name]]))
        plt.hlines(best_acc[titlemap[set_name]], -0.5, 2.5, linestyles='solid', colors='green', linewidth=1, zorder=1)
        plt.fill_between([-1, 2],
            [best_acc[titlemap[set_name]] - best_stdev[titlemap[set_name]]] * 2,
            [best_acc[titlemap[set_name]] + best_stdev[titlemap[set_name]]] * 2,
            linestyle='dashed', color='green', alpha=0.2, linewidth=1, zorder=1)
        plt.hlines(random_acc[titlemap[set_name]], -0.5, 2.5, linestyles='solid', colors='blue', linewidth=1, zorder=1, alpha=0.7)
        plt.fill_between([-1, 2],
            [random_acc[titlemap[set_name]] - random_stdev[titlemap[set_name]]] * 2,
            [random_acc[titlemap[set_name]] + random_stdev[titlemap[set_name]]] * 2,
            linestyle='dashed', color='blue', alpha=0.2, linewidth=1, zorder=1)
        for i, labeller in enumerate(['Norris', 'RGZ']):
            for j, classifier in enumerate(['LogisticRegression', 'CNN', 'RandomForestClassifier'] + (['Label', 'NearestNeighbour'] if field == 'cdfs' else ['NearestNeighbour'])):

                ys = numpy.array(labeller_classifier_to_accuracies[labeller, classifier, titlemap[set_name]]) * 100
                if classifier != 'NearestNeighbour':
                    x_offset = i + (j - 1) / 5 if labeller == 'Norris' or field == 'elais' else i + (j - 1.5) / 6
                else:
                    # NN
                    plt.axhline(numpy.mean(ys), color='grey', linestyle='-.', linewidth=1)
                    if field == 'cdfs':
                        plt.fill_between([-1, 2], [numpy.mean(ys) - numpy.std(ys)] * 2, [numpy.mean(ys) + numpy.std(ys)] * 2, color='grey', linestyle='-.', alpha=0.2, linewidth=1)
                    x_offset = 2

                if classifier == 'Label' and labeller == 'RGZ':
                    plt.annotate('{:.1%}'.format(numpy.mean(ys) / 100), (x_offset, 72.5), ha='center', va='bottom')
                    plt.arrow(x_offset, 72.5, 0, -1.5, head_width=0.05, head_length=1, ec='k', fc='k')

                xs = [x_offset] * len(ys)
                print('{} & {} & {} & ${:.02f} \\pm {:.02f}$\\\\'.format(print_set_name, labeller, classifier, numpy.mean(ys), numpy.std(ys)))
                ax.set_xlim((-0.5, 1.5))
                ax.set_ylim((70, 100))
                ax.set_xticks([0, 1])
                ax.set_xticklabels(['Norris', 'RGZ'])
                ax.set_yticklabels(['{}\\%'.format(x) for x in range(70, 101, 5)])
                handles[j] = plt.scatter(xs, ys, color=colours[j], marker=markers[j], zorder=2, edgecolor='k', linewidth=1)
            # if k == 0:  # 22
            #     plt.xlabel('Labels')
            plt.ylabel('Cross-identification\naccuracy (per cent)')

            # ax.title.set_fontsize(16)
            # ax.xaxis.label.set_fontsize(12)
            # ax.yaxis.label.set_fontsize(9)
            # for tick in ax.get_xticklabels() + ax.get_yticklabels():
            #     tick.set_fontsize(10)

            ax.grid(which='major', axis='y', color='#DDDDDD')

    # Print the table.
    print('\\hline')
    print("Labeller & Classifier & Mean `Resolved' & Mean `All'\\\\")
    print('&& accuracy (per cent) & accuracy (per cent)\\\\')
    print('\\hline')
    for labeller in ['Norris', 'RGZ']:
        for classifier in ['CNN', 'LogisticRegression', 'RandomForestClassifier', 'Groundtruth', 'Random', 'Label', 'NearestNeighbour']:
            if labeller == 'RGZ' and classifier in {'Groundtruth', 'Random', 'NearestNeighbour'}:
                continue

            if labeller == 'Norris' and classifier == 'Label':
                continue

            print('{} & {} & ${:.1f} \\pm {:.1f}$ & ${:.1f} \\pm {:.1f}$\\\\'.format(
                labeller, classifier,
                numpy.array(
                    labeller_classifier_to_accuracies[labeller, classifier, 'Resolved']).mean() * 100,
                numpy.array(
                    labeller_classifier_to_accuracies[labeller, classifier, 'Resolved']).std() * 100,
                numpy.array(
                    labeller_classifier_to_accuracies[labeller, classifier, 'All']).mean() * 100,
                numpy.array(
                    labeller_classifier_to_accuracies[labeller, classifier, 'All']).std() * 100))
    plt.gca().tick_params(axis='both', which='major', direction='out', length=5)
    plt.gca().tick_params(axis='y', which='minor', direction='out', length=3)
    plt.gca().minorticks_on()

    plt.figlegend([handles[j] for j in sorted(handles)], ['LR', 'CNN', 'RF'] + (['Labels'] if field == 'cdfs' else []), 'lower center', ncol=4)
    plt.subplots_adjust(bottom=0.25, hspace=0.25, left=0.3)
    plt.savefig('../images/{}_cross_identification_grid.pdf'.format(field))
    plt.savefig('../images/{}_cross_identification_grid.png'.format(field))
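# The accuracy bookkeeping in plot() follows one pattern throughout: collect a
# per-quadrant accuracy for each (labeller, classifier, subset) key, then
# report the mean and standard deviation across quadrants. A minimal sketch of
# that aggregation (an illustrative helper, not part of the crowdastro
# pipeline):
def summarise_accuracies(accuracies_by_key):
    """Map {key: [accuracy, ...]} to {key: (mean, stdev)}."""
    import numpy
    return {key: (float(numpy.mean(accs)), float(numpy.std(accs)))
            for key, accs in accuracies_by_key.items()}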
def print_table(field='cdfs'):
    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False, field=field)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False, field=field)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False, field=field)
    cids = list(pipeline.cross_identify_all(swire_names, swire_coords, swire_labels, swire_test_sets, swire_labels[:, 0], field=field))

    atlas_to_swire = collections.defaultdict(dict)  # ATLAS -> predictor -> SWIRE

    swire_name_to_coord = {}
    for name, coord in zip(swire_names, swire_coords):
        swire_name_to_coord[name] = coord

    atlas_to_swire_expert = {}
    key_to_atlas = {}
    atlas_to_ras = {}
    atlas_to_decs = {}
    id_to_atlas = {}
    atlas_to_id = {}
    atlas_to_zooniverse_id = {}
    if field == 'cdfs':
        table = astropy.io.ascii.read(pipeline.TABLE_PATH)
        for row in table:
            name = row['Component Name (Franzen)']
            if not name:
                continue
            id_to_atlas[row['Component ID (Franzen)']] = name
            atlas_to_id[name] = row['Component ID (Franzen)']
            atlas_to_zooniverse_id[name] = row['Component Zooniverse ID (RGZ)']
            key_to_atlas[row['Key']] = name
            swire = row['Source SWIRE (Norris)']
            atlas_to_swire_expert[name] = swire
            atlas_to_ras[name] = row['Component RA (Franzen)']
            atlas_to_decs[name] = row['Component DEC (Franzen)']
    else:
        swire_scoords = astropy.coordinates.SkyCoord(ra=swire_coords[:, 0],
                                                     dec=swire_coords[:, 1],
                                                     unit='deg')
        with astropy.io.fits.open(pipeline.MIDDELBERG_TABLE4_PATH) as elais_components_fits:
            elais_components = elais_components_fits[1].data
            component_to_name = {}
            for i, component in enumerate(elais_components):
                name = component['ATELAIS']
                id_to_atlas[component['CID']] = name
                atlas_to_id[name] = component['CID']
                atlas_to_zooniverse_id[name] = ''
                key_to_atlas[i] = name
                coord = astropy.coordinates.SkyCoord(
                    ra='{} {} {}'.format(component['RAh'], component['RAm'], component['RAs']),
                    dec='-{} {} {}'.format(component['DEd'], component['DEm'], component['DEs']),
                    unit=('hourangle', 'deg'))
                coord = (coord.ra.deg, coord.dec.deg)
                atlas_to_ras[name] = coord[0]
                atlas_to_decs[name] = coord[1]
        # Load SWIRE cross-identification from Table 5.
        with open(pipeline.MIDDELBERG_TABLE5_PATH) as elais_file:
            lines = [line.split('|') for line in elais_file]
            for line in lines:
                if 'ATELAISJ' not in line[0]:
                    continue

                line_cids = line[1]
                if 'C0' not in line_cids and 'C1' not in line_cids:
                    continue

                line_cids = [cid.strip() for cid in line_cids.split(',')]
                swire_coord_re = re.search(r'SWIRE4J(\d\d)(\d\d)(\d\d\.\d\d)(-\d\d)(\d\d)(\d\d\.\d)', line[2])
                if not swire_coord_re:
                    continue
                swire_coord_list = swire_coord_re.groups()
                coord = astropy.coordinates.SkyCoord(
                    ra='{} {} {}'.format(*swire_coord_list[:3]),
                    dec='{} {} {}'.format(*swire_coord_list[3:]),
                    unit=('hourangle', 'deg'))
                # Nearest SWIRE...
                seps = coord.separation(swire_scoords)
                nearest = numpy.argmin(seps)
                dist = seps[nearest]
                if dist.deg > 5 / 60 / 60:
                    continue
                name = swire_names[nearest]
                for cid in line_cids:
                    atlas_to_swire_expert[id_to_atlas[cid]] = name

    atlas_to_rgz = {}
    atlas_to_radio_consensus = {}
    atlas_to_ir_consensus = {}
    if field == 'cdfs':
        for row in astropy.io.ascii.read(pipeline.RGZ_PATH):
            name = id_to_atlas[row['atlas_id']]
            atlas_to_radio_consensus[name] = row['consensus.radio_level']
            atlas_to_ir_consensus[name] = row['consensus.ir_level']
            atlas_to_rgz[name] = row['SWIRE.designation']

    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    known_predictors = set()

    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue

        if cid.classifier in {'Groundtruth', 'Random'}:
            continue

        if field == 'cdfs':
            atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES['RGZ & Norris'], cid.quadrant].nonzero()[0]
        else:
            atlas_keys = atlas_test_sets[:, 0, 0].nonzero()[0]

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        n_total = 0
        n_correct = 0
        n_skipped = 0
        if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
            labeller = 'RGZ N'
            continue
        elif cid.labeller == 'rgz':
            labeller = 'RGZ'
        else:
            labeller = 'Norris'
        predictor_name = '{}({} / {})'.format(
            {'LogisticRegression': 'LR', 'CNN': 'CNN', 'RandomForestClassifier': 'RF'}[cid.classifier],
            labeller, titlemap[cid.dataset_name])
        known_predictors.add(predictor_name)
        for i in atlas_keys:
            name = key_to_atlas[i]
            swire_predictor = atlas_to_swire_predictor.get(name, '')
            atlas_to_swire[name][predictor_name] = swire_predictor

    known_predictors = sorted(known_predictors)

    atlases = sorted(atlas_to_swire)
    ras = []
    decs = []
    expert_xids = []
    expert_xid_ras = []
    expert_xid_decs = []
    rgzs = []
    rgz_ras = []
    rgz_decs = []
    rcs = []
    ircs = []
    cids = []
    zids = []
    predictor_columns = collections.defaultdict(list)
    predictor_ras = collections.defaultdict(list)
    predictor_decs = collections.defaultdict(list)
    for atlas in atlases:
        for predictor in known_predictors:
            predictor_columns[predictor].append(atlas_to_swire[atlas].get(predictor, ''))
            predictor_ras[predictor].append(swire_name_to_coord.get(atlas_to_swire[atlas].get(predictor, ''), (None, None))[0])
            predictor_decs[predictor].append(swire_name_to_coord.get(atlas_to_swire[atlas].get(predictor, ''), (None, None))[1])
        ras.append(atlas_to_ras[atlas])
        decs.append(atlas_to_decs[atlas])
        rgzs.append(atlas_to_rgz.get(atlas, ''))
        cids.append(atlas_to_id[atlas])
        zids.append(atlas_to_zooniverse_id[atlas])
        rgz_ras.append(swire_name_to_coord.get(atlas_to_rgz.get(atlas, ''), (None, None))[0])
        rgz_decs.append(swire_name_to_coord.get(atlas_to_rgz.get(atlas, ''), (None, None))[1])
        rcs.append(atlas_to_radio_consensus.get(atlas, 0.0))
        ircs.append(atlas_to_ir_consensus.get(atlas, 0.0))
        expert_xids.append(atlas_to_swire_expert.get(atlas, ''))
        expert_xid_ras.append(swire_name_to_coord.get(atlas_to_swire_expert.get(atlas, ''), (None, None))[0])
        expert_xid_decs.append(swire_name_to_coord.get(atlas_to_swire_expert.get(atlas, ''), (None, None))[1])

    expert = 'Norris' if field == 'cdfs' else 'Middelberg'
    table = astropy.table.Table(
        data=[atlases, ras, decs,
              cids, zids,
              expert_xids, expert_xid_ras, expert_xid_decs,
              rgzs, rgz_ras, rgz_decs,
              rcs, ircs] + [k for p in known_predictors for k in (predictor_columns[p], predictor_ras[p], predictor_decs[p])],
        names=['ATLAS', 'RA', 'Dec',
               'CID', 'Zooniverse ID',
               expert, expert + ' RA', expert + ' Dec',
               'RGZ', 'RGZ RA', 'RGZ Dec',
               'RGZ radio consensus', 'RGZ IR consensus'] + [k for p in known_predictors for k in (p, p + ' RA', p + ' Dec')])
    table['RGZ radio consensus'].format = '{:.4f}'
    table['RGZ IR consensus'].format = '{:.4f}'
    table.write('/Users/alger/data/Crowdastro/predicted_cross_ids_table_21_03_18_{}.csv'.format(field), format='csv')
    table.write('/Users/alger/data/Crowdastro/predicted_cross_ids_table_21_03_18_{}.tex'.format(field), format='latex')
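# Note: the nearest-SWIRE searches above (the KDTree query in plot() and the
# brute-force separation argmin in print_table()) could also be written with
# astropy's catalogue matching, e.g. (illustrative sketch, assuming `coord` is
# a scalar SkyCoord and `swire_scoords` is a SkyCoord array):
#     idx, sep2d, _ = coord.match_to_catalog_sky(swire_scoords)
#     if sep2d.arcsec < 5:
#         name = swire_names[int(idx)]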
def main(classifier='CNN', labeller='Norris'):
    # Load SWIRE stuff.
    swire_names, swire_coords, swire_features = pipeline.generate_swire_features(overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False)
    swire_tree = KDTree(swire_coords)
    swire_name_to_index = {n: i for i, n in enumerate(swire_names)}

    atlas_names = []
    atlas_compactnesses = []
    atlas_coords = []
    atlas_norris_swire = []

    table = astropy.io.ascii.read(pipeline.TABLE_PATH)
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue

        if not row['Component Zooniverse ID (RGZ)']:
            continue

        compactness = pipeline.compactness(row)
        atlas_names.append(name)
        atlas_compactnesses.append(compactness)
        atlas_coords.append((row['Component RA (Franzen)'], row['Component DEC (Franzen)']))
        atlas_norris_swire.append(row['Source SWIRE (Norris)'])

    ys = []
    xs_entropy = []
    xs_margin = []
    no_groundtruth = []
    correct = []

    for name, compactness, coords, swire in zip(atlas_names, atlas_compactnesses, atlas_coords, atlas_norris_swire):
        predictor_name = '{}_{}'.format(classifier, labeller)
        predictions = get_predictions(swire_tree, swire_coords, numpy.array(swire_names), swire_test_sets, coords, predictor_name)

        if not predictions:
            print('No predictions for {}'.format(name))
            continue

        chosen_swire = predictions[numpy.argmax([p for _, p in predictions])][0]
        predictions = [p for _, p in predictions]

        # Normalise the classifier scores with a softmax over all candidates.
        exp_predictions = [numpy.exp(p) for p in predictions]
        predictions_softmax = [e / sum(exp_predictions) for e in exp_predictions]
        if len(predictions_softmax) == 1:
            entropy_ambiguity = 0
            margin_ambiguity = 0
        else:
            entropy_ambiguity = -sum(p * numpy.log(p) for p in predictions_softmax if p)
            predictions.sort()
            margin_ambiguity = 1 - (predictions[-1] - predictions[-2])

        ys.append(compactness)
        xs_entropy.append(entropy_ambiguity)
        xs_margin.append(margin_ambiguity)
        no_groundtruth.append(not swire or not swire.startswith('SWIRE'))
        correct.append(swire == chosen_swire)

    ys = numpy.array(ys)
    xs_margin = numpy.array(xs_margin)
    xs_entropy = numpy.array(xs_entropy)
    no_groundtruth = numpy.array(no_groundtruth, dtype=bool)
    correct = numpy.array(correct, dtype=bool)

    print(sum(1 for y in ys if y <= 1))

    plt.subplot(1, 2, 1)
    plt.scatter(xs_margin[no_groundtruth], ys[no_groundtruth], marker='x', color='black', alpha=0.05)
    plt.scatter(xs_margin[~no_groundtruth & correct], ys[~no_groundtruth & correct], marker='x', color='blue', alpha=0.7)
    plt.scatter(xs_margin[~no_groundtruth & ~correct], ys[~no_groundtruth & ~correct], marker='x', color='magenta', alpha=0.7)
    plt.title('Margin')
    plt.xlabel('1 - margin')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    plt.axhline(1, min(xs_margin), max(xs_margin))
    plt.subplot(1, 2, 2)
    plt.scatter(xs_entropy[no_groundtruth], ys[no_groundtruth], marker='x', color='black', alpha=0.05)
    plt.scatter(xs_entropy[~no_groundtruth & correct], ys[~no_groundtruth & correct], marker='x', color='blue', alpha=0.7)
    plt.scatter(xs_entropy[~no_groundtruth & ~correct], ys[~no_groundtruth & ~correct], marker='x', color='magenta', alpha=0.7)
    plt.title('Entropy')
    plt.xlabel('Entropy')
    plt.ylabel('$1.3 SNR S / 10 S_p$')
    plt.yscale('log')
    plt.axhline(1, min(xs_entropy), max(xs_entropy), zorder=-100, linestyle='--', color='black')
    plt.show()
def get_examples():
    titlemap = {
        'RGZ & Norris & compact': 'Compact',
        'RGZ & Norris & resolved': 'Resolved',
        'RGZ & Norris': 'All',
        'RGZ & compact': 'Compact',
        'RGZ & resolved': 'Resolved',
        'RGZ': 'All',
    }

    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False)
    (_, atlas_test_sets), (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False)
    cids = list(pipeline.cross_identify_all(swire_names, swire_coords, swire_labels, swire_test_sets, swire_labels[:, 0], field='cdfs'))
    table = astropy.io.ascii.read(pipeline.TABLE_PATH)

    atlas_to_swire_norris = {}
    key_to_atlas = {}
    atlas_to_ras = {}
    atlas_to_decs = {}
    id_to_atlas = {}
    atlas_to_zid = {}
    atlas_to_id = {}
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue
        id_to_atlas[row['Component ID (Franzen)']] = name
        key_to_atlas[row['Key']] = name
        swire = row['Source SWIRE (Norris)']
        atlas_to_swire_norris[name] = swire
        atlas_to_id[name] = row['Component ID (Franzen)']
        atlas_to_ras[name] = row['Component RA (Franzen)']
        atlas_to_decs[name] = row['Component DEC (Franzen)']
        atlas_to_zid[name] = row['Component Zooniverse ID (RGZ)']

    atlas_to_rgz = {}
    atlas_to_radio_consensus = {}
    atlas_to_ir_consensus = {}
    for row in astropy.io.ascii.read(pipeline.RGZ_PATH):
        name = id_to_atlas[row['atlas_id']]
        atlas_to_radio_consensus[name] = row['consensus.radio_level']
        atlas_to_ir_consensus[name] = row['consensus.ir_level']
        atlas_to_rgz[name] = row['SWIRE.designation']

    cross_identifications = collections.defaultdict(dict)  # ATLAS -> labeller -> SWIRE

    for cid in cids:
        if cid.labeller == 'norris' and 'Norris' not in cid.dataset_name:
            continue

        atlas_to_swire_predictor = dict(zip(cid.radio_names, cid.ir_names))
        # For each ATLAS object in RGZ & Norris...
        atlas_keys = atlas_test_sets[:, pipeline.SET_NAMES['RGZ & Norris'], cid.quadrant].nonzero()[0]
        if 'Norris' in cid.dataset_name and cid.labeller == 'rgz':
            labeller = 'RGZ N'
        elif cid.labeller == 'rgz':
            labeller = 'RGZ'
        else:
            labeller = 'Norris'
        for i in atlas_keys:
            name = key_to_atlas[i]
            if name not in atlas_to_swire_norris:
                continue
            if name not in atlas_to_swire_predictor:
                continue
            cross_identifications[name]['Norris'] = atlas_to_swire_norris[name]
            cross_identifications[name]['RGZ'] = atlas_to_rgz.get(name, None)
            cross_identifications[name][labeller, cid.classifier, titlemap[cid.dataset_name]] = atlas_to_swire_predictor[name]

    """
    For each classifier, pull out examples where:
    - RGZ and Norris agree, but the classifier disagrees.

    Only include RGZ & Norris dataset ("All").
    """
    classifier_to_example = collections.defaultdict(set)
    for atlas, cids_ in cross_identifications.items():
        if cids_['Norris'] != cids_['RGZ']:
            continue

        for classifier, swire in cids_.items():
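            # The plain string keys ('Norris', 'RGZ') also fall through the
            # filter below, since indexing a string yields a single character,
            # never 'All'.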
            if classifier[2] != 'All' or classifier[1] in {'Random', 'Groundtruth'}:
                continue

            if swire != cids_['Norris']:
                classifier_to_example[classifier].add((atlas, atlas_to_zid[atlas], atlas_to_id[atlas]))

    return classifier_to_example
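# Example usage (illustrative): list the ATLAS components on which the CNN
# trained on Norris labels disagrees with the agreed RGZ/Norris host.
#     for atlas, zooniverse_id, component_id in sorted(
#             get_examples()['Norris', 'CNN', 'All']):
#         print(atlas, zooniverse_id, component_id)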
def plot_predictions(cut=0.95, labeller='norris', dataset_name=None, classifier=None):
    """Plot colour-colour diagram for predicted host galaxies.

    labeller in {'norris', 'rgz'}
    dataset_name in {'RGZ & Norris', ...}
    """

    with h5py.File(CROWDASTRO_PATH, 'r') as f:
        swire_numeric_cdfs = f['/swire/cdfs/numeric'][:, 2:2 + 4]

    f_36 = swire_numeric_cdfs[:, 0]
    f_45 = swire_numeric_cdfs[:, 1]
    f_58 = swire_numeric_cdfs[:, 2]
    f_80 = swire_numeric_cdfs[:, 3]
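    # f_36, f_45, f_58 and f_80 are the IRAC 3.6, 4.5, 5.8 and 8.0 um fluxes;
    # -99 flags a non-detection in that band.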
    detection_58 = (f_58 != -99)
    detection_80 = (f_80 != -99)

    p = pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + '{}_{}_cdfs_predictions'.format(classifier, labeller))
    predictions = {}
    for i in p:
        predictions[i.dataset_name, i.quadrant] = i


    swire_names, swire_coords, _ = pipeline.generate_swire_features(overwrite=False, field='cdfs')
    swire_labels = pipeline.generate_swire_labels(swire_names, swire_coords, overwrite=False, field='cdfs')
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords, swire_labels, overwrite=False, field='cdfs')

    xs = []
    ys = []
    colours = []
    for q in range(4):
        swire_set = swire_test_sets[:, pipeline.SET_NAMES['RGZ'], q]
        if labeller == 'norris' and not dataset_name:
            # predictions_set = predictions['RGZ & Norris', q].probabilities > cut
            f_36_ = f_36[swire_set & swire_labels[:, 0]]#[predictions_set]
            f_45_ = f_45[swire_set & swire_labels[:, 0]]#[predictions_set]
            f_58_ = f_58[swire_set & swire_labels[:, 0]]#[predictions_set]
            f_80_ = f_80[swire_set & swire_labels[:, 0]]#[predictions_set]
        elif labeller == 'rgz' and not dataset_name:
            f_36_ = f_36[swire_set & swire_labels[:, 1]]
            f_45_ = f_45[swire_set & swire_labels[:, 1]]
            f_58_ = f_58[swire_set & swire_labels[:, 1]]
            f_80_ = f_80[swire_set & swire_labels[:, 1]]
        if labeller == 'norris' and dataset_name:
            predictions_set = predictions[dataset_name, q].probabilities > cut
            f_36_ = f_36[swire_set][predictions_set]
            f_45_ = f_45[swire_set][predictions_set]
            f_58_ = f_58[swire_set][predictions_set]
            f_80_ = f_80[swire_set][predictions_set]
            probabilities = predictions[dataset_name, q].probabilities[predictions_set]
        detection_58_ = (f_58_ != -99)
        detection_80_ = (f_80_ != -99)
        detection_all_ = detection_58_ & detection_80_

        ratio_58_36 = numpy.log10(f_58_[detection_all_] / f_36_[detection_all_])
        ratio_80_45 = numpy.log10(f_80_[detection_all_] / f_45_[detection_all_])
        xs.extend(ratio_58_36)
        ys.extend(ratio_80_45)
        if dataset_name:
            # Probabilities (and hence point colours) are only available when
            # plotting a specific prediction set.
            colours.extend(probabilities[detection_all_])

    assert len(xs) == len(ys)
    if dataset_name:
        assert len(xs) == len(colours)

    plot_basic()
    if dataset_name:
        plt.scatter(xs, ys, s=20, marker='^', linewidth=0, alpha=0.5, c=numpy.array(colours), cmap='winter')
    else:
        plt.scatter(xs, ys, s=25, c='r', marker='^', linewidth=0)
    plt.xlim((-0.75, 1.0))
    plt.ylim((-0.75, 1.0))
    plt.xlabel('$\\log_{10}(S_{5.8}/S_{3.6})$')
    plt.ylabel('$\\log_{10}(S_{8.0}/S_{4.5})$')
    plt.subplots_adjust(left=0.2, bottom=0.15, right=0.95, top=0.95)
    if dataset_name:
        plt.colorbar()
    plt.show()
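# plot_grid() below relies on a balanced_accuracy helper, presumably the mean
# of the per-class recalls; a minimal sketch under that assumption (not
# necessarily the pipeline's implementation):
def balanced_accuracy_sketch(trues, predictions):
    """Mean of the recalls on the positive and negative classes."""
    import numpy
    trues = numpy.asarray(trues, dtype=bool)
    predictions = numpy.asarray(predictions, dtype=bool)
    recall_pos = (predictions & trues).sum() / max(trues.sum(), 1)
    recall_neg = (~predictions & ~trues).sum() / max((~trues).sum(), 1)
    return (recall_pos + recall_neg) / 2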
def plot_grid(field='cdfs'):
    # Load predictions.
    lr_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR +
            'LogisticRegression_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR +
            'LogisticRegression_rgz_{}_predictions'.format(field)))
    rf_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR +
            'RandomForestClassifier_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR +
            'RandomForestClassifier_rgz_{}_predictions'.format(field)))
    cnn_predictions = itertools.chain(
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'CNN_norris_{}_predictions'.format(field)),
        pipeline.unserialise_predictions(
            pipeline.WORKING_DIR + 'CNN_rgz_{}_predictions'.format(field)))

    # Convert to the format we need. e.g. {'RGZ' -> [acc, acc, acc, acc]}
    lr_norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    lr_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    rf_norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    rf_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    cnn_norris_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    cnn_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
    for predictions in lr_predictions:
        dataset_name = predictions.dataset_name
        if predictions.labeller == 'norris':
            lr_norris_accuracies[dataset_name][
                predictions.quadrant] = predictions.balanced_accuracy
        else:
            lr_rgz_accuracies[dataset_name][
                predictions.quadrant] = predictions.balanced_accuracy
    for predictions in rf_predictions:
        dataset_name = predictions.dataset_name
        if predictions.labeller == 'norris':
            rf_norris_accuracies[dataset_name][
                predictions.quadrant] = predictions.balanced_accuracy
        else:
            rf_rgz_accuracies[dataset_name][
                predictions.quadrant] = predictions.balanced_accuracy
    for predictions in cnn_predictions:
        dataset_name = predictions.dataset_name
        if predictions.labeller == 'norris':
            cnn_norris_accuracies[dataset_name][
                predictions.quadrant] = predictions.balanced_accuracy
        else:
            cnn_rgz_accuracies[dataset_name][
                predictions.quadrant] = predictions.balanced_accuracy

    if field == 'cdfs':
        # Load RGZ cross-identifications and compute a balanced accuracy with them.
        swire_names, swire_coords, _ = pipeline.generate_swire_features(
            overwrite=False, field=field)
        swire_labels = pipeline.generate_swire_labels(swire_names,
                                                      swire_coords,
                                                      overwrite=False,
                                                      field=field)
        (_, atlas_test_sets), (_,
                               swire_test_sets) = pipeline.generate_data_sets(
                                   swire_coords,
                                   swire_labels,
                                   overwrite=False,
                                   field=field)
        label_rgz_accuracies = {sstr: [0] * 4 for sstr in pipeline.SET_NAMES}
        label_norris_accuracies = {
            sstr: [1] * 4
            for sstr in pipeline.SET_NAMES
        }  # By definition.
        for dataset_name in pipeline.SET_NAMES:
            for quadrant in range(4):
                test_set = swire_test_sets[:, pipeline.SET_NAMES[dataset_name],
                                           quadrant]
                predictions = swire_labels[test_set, 1]
                trues = swire_labels[test_set, 0]
                ba = balanced_accuracy(trues, predictions)
                label_rgz_accuracies[dataset_name][quadrant] = ba

    colours = ['grey', 'magenta', 'blue', 'orange']
    markers = ['o', '^', 'x', 's']
    handles = {}
    plt.figure(figsize=(5, 5))

    accuracy_map = defaultdict(lambda: defaultdict(dict))  # For table output.
    output_sets = [
        ('LR', [lr_norris_accuracies, lr_rgz_accuracies]),
        ('CNN', [cnn_norris_accuracies, cnn_rgz_accuracies]),
        ('RF', [rf_norris_accuracies, rf_rgz_accuracies]),
    ]
    if field == 'cdfs':
        output_sets.append(
            ('Labels', [label_norris_accuracies, label_rgz_accuracies]))
    for j, (classifier_name, classifier_set) in enumerate(output_sets):
        for i, set_name in enumerate(norris_labelled_sets):
            if 'compact' not in set_name:  # Skip compact.
                ax = plt.subplot(2, 1, {
                    'RGZ & Norris & resolved': 1,
                    'RGZ & Norris': 2
                }[set_name])
                ax.set_ylim((80, 100))
                ax.set_xlim((-0.5, 1.5))
                ax.set_xticks([0, 1])  #, 2])
                ax.set_xticklabels(
                    [
                        'Norris',
                        # 'RGZ N',
                        'RGZ',
                    ],
                    rotation='horizontal')
                if i == 2:
                    plt.xlabel('Labels')
                plt.ylabel('{}\nBalanced accuracy\n(per cent)'.format(
                    titlemap[set_name]))

                ax.title.set_fontsize(16)
                ax.xaxis.label.set_fontsize(12)
                ax.yaxis.label.set_fontsize(9)
                for tick in ax.get_xticklabels() + ax.get_yticklabels():
                    tick.set_fontsize(10)

                ax.grid(which='major', axis='y', color='#EEEEEE')
            for k in range(4):
                if 'compact' in set_name:
                    continue
                if j != 3:  # !Labels
                    ax.scatter([0 + (j - 1) / 5],
                               classifier_set[0][set_name][k] * 100,
                               color=colours[j],
                               marker=markers[j],
                               linewidth=1,
                               edgecolor='k')
                rgz_offset = ((j - 1.5) /
                              6) if field == 'cdfs' else (j - 1) / 5
                handles[j] = ax.scatter(
                    [1 + rgz_offset],
                    classifier_set[1][fullmap[set_name]][k] * 100,
                    color=colours[j],
                    marker=markers[j],
                    linewidth=1,
                    edgecolor='k')
                # ax.scatter([1 + (j - 1) / 5], classifier_set[1][set_name][k] * 100,
                #            color=colours[j], marker=markers[j], linewidth=1, edgecolor='k')
            # Compute for table.
            for labeller in ['Norris', 'RGZ N', 'RGZ']:
                if labeller == 'Norris':
                    mean = numpy.mean(classifier_set[0][set_name]) * 100
                    stdev = numpy.std(classifier_set[0][set_name]) * 100
                elif labeller == 'RGZ N':
                    continue
                    # mean = numpy.mean(classifier_set[1][set_name]) * 100
                    # stdev = numpy.std(classifier_set[1][set_name]) * 100
                elif labeller == 'RGZ':
                    mean = numpy.mean(
                        classifier_set[1][fullmap[set_name]]) * 100
                    stdev = numpy.std(
                        classifier_set[1][fullmap[set_name]]) * 100
                accuracy_map[labeller][classifier_name][
                    titlemap[set_name]] = '${:.02f} \\pm {:.02f}$'.format(
                        mean, stdev)

    # Assemble table.
    col_labeller = []
    col_classifier = []
    col_compact = []
    col_resolved = []
    col_all = []
    for labeller in ['Norris', 'RGZ N', 'RGZ']:
        if labeller == 'RGZ N':
            continue

        for classifier in ['CNN', 'LR', 'RF'] + (['Labels'] if field == 'cdfs' else []):
            col_labeller.append(labeller)
            col_classifier.append(classifier)
            col_compact.append(accuracy_map[labeller][classifier]['Compact'])
            col_resolved.append(accuracy_map[labeller][classifier]['Resolved'])
            col_all.append(accuracy_map[labeller][classifier]['All'])
    out_table = astropy.table.Table(
        [col_labeller, col_classifier, col_compact, col_resolved, col_all],
        names=[
            'Labeller', 'Classifier', "Mean `Compact' accuracy\\\\(per cent)",
            "Mean `Resolved' accuracy\\\\(per cent)",
            "Mean `All' accuracy\\\\(per cent)"
        ])
    out_table.write('../{}_accuracy_table.tex'.format(field), format='latex')

    plt.figlegend([handles[j] for j in sorted(handles)], ['LR', 'CNN', 'RF'] +
                  (['Labels'] if field == 'cdfs' else []),
                  'lower center',
                  ncol=4,
                  fontsize=10)
    plt.subplots_adjust(bottom=0.2, hspace=0.25)
    plt.savefig('../images/{}_ba_grid.pdf'.format(field),
                bbox_inches='tight',
                pad_inches=0)
    plt.savefig('../images/{}_ba_grid.png'.format(field),
                bbox_inches='tight',
                pad_inches=0)
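# Usage (illustrative): plot_grid('cdfs') and plot_grid('elais') each write a
# balanced-accuracy grid figure and the corresponding LaTeX accuracy table.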
def main(examples=None, classifier='CNN', labeller='Norris'):
    # Load SWIRE stuff.
    swire_names, swire_coords, swire_features = pipeline.generate_swire_features(
        overwrite=False)
    swire_labels = pipeline.generate_swire_labels(swire_names,
                                                  swire_coords,
                                                  overwrite=False)
    _, (_, swire_test_sets) = pipeline.generate_data_sets(swire_coords,
                                                          swire_labels,
                                                          overwrite=False)
    swire_tree = KDTree(swire_coords)
    swire_name_to_index = {n: i for i, n in enumerate(swire_names)}
    # Load ATLAS coords.
    table = astropy.io.ascii.read(pipeline.TABLE_PATH)
    atlas_to_coords = {}
    atlas_to_swire_coords = {}
    for row in table:
        name = row['Component Name (Franzen)']
        if not name:
            continue

        atlas_to_coords[name] = row['Component RA (Franzen)'], row[
            'Component DEC (Franzen)']
        index = swire_name_to_index.get(row['Source SWIRE (Norris)'] or '')
        if index is not None:  # Index 0 is a valid SWIRE object.
            atlas_to_swire_coords[name] = swire_coords[index]

    ir_stretch = astropy.visualization.LogStretch(0.001)
    if examples is None:
        examples = examples_incorrect.get_examples()
        examples = examples[labeller, classifier, 'All']
    for example in examples:
        print('Plotting {}'.format(example))
        predictor_name = '{}_{}'.format(classifier, labeller)
        cid = example[2]
        # Load FITS stuff.
        try:
            radio_fits = astropy.io.fits.open(CDFS_PATH + cid + '_radio.fits')
        except FileNotFoundError:
            if example[1]:  # Has Zooniverse ID
                print('{} not in RGZ'.format(cid))
            continue
        ir_fits = astropy.io.fits.open(CDFS_PATH + cid + '_ir.fits')
        wcs = astropy.wcs.WCS(radio_fits[0].header)
        # Compute info for contour levels. (also from Enno Middelberg)
        median = numpy.median(radio_fits[0].data)
        mad = numpy.median(numpy.abs(radio_fits[0].data - median))
        sigma = mad / mad2sigma
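        # mad2sigma is presumably the Gaussian MAD-to-sigma factor (~0.6745),
        # making sigma a robust noise estimate; the contour levels below start
        # at nsig * sigma and grow by factors of sigmult.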
        # Set up the plot.
        fig = plt.figure()
        ax = astropy.visualization.wcsaxes.WCSAxes(fig, [0.1, 0.1, 0.8, 0.8],
                                                   wcs=wcs)
        fig.add_axes(ax)
        ax.set_title('{}'.format(example[0]))
        # Show the infrared.
        ax.imshow(ir_stretch(ir_fits[0].data),
                  cmap='cubehelix_r',
                  origin='lower')
        # Show the radio.
        ax.contour(radio_fits[0].data,
                   colors='black',
                   levels=[nsig * sigma * sigmult**i for i in range(15)],
                   linewidths=1,
                   origin='lower',
                   zorder=1)
        # Plot predictions.
        predictions = get_predictions(swire_tree, swire_coords,
                                      swire_test_sets,
                                      atlas_to_coords[example[0]],
                                      predictor_name)
        if not predictions:
            print('No predictions for {}'.format(example[0]))
            continue
        coords = [p[0] for p in predictions]
        probabilities = [p[1] for p in predictions]
        coords = wcs.all_world2pix(coords, 1)
        ax.scatter(coords[:, 0],
                   coords[:, 1],
                   s=numpy.sqrt(numpy.array(probabilities)) * 200,
                   color='white',
                   edgecolor='black',
                   linewidth=1,
                   alpha=0.9,
                   marker='o',
                   zorder=2)
        choice = numpy.argmax(probabilities)
        ax.scatter(coords[choice, 0],
                   coords[choice, 1],
                   s=200 / numpy.sqrt(2),
                   color='blue',
                   marker='x',
                   zorder=2.5)
        try:
            norris_coords, = wcs.all_world2pix(
                [atlas_to_swire_coords[example[0]]], 1)
        except KeyError:
            print('No Norris cross-identification for {}'.format(example[0]))
            continue
        ax.scatter(norris_coords[0],
                   norris_coords[1],
                   marker='+',
                   s=200,
                   zorder=3,
                   color='green')
        lon, lat = ax.coords
        lon.set_major_formatter('hh:mm:ss')
        lon.set_axislabel('Right Ascension')
        lat.set_axislabel('Declination')
        fn = '{}_{}_{}'.format(classifier, labeller, example[0])
        plt.savefig(
            '/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/images/examples/'
            + fn + '.png',
            bbox_inches='tight',
            pad_inches=0)
        plt.savefig(
            '/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/images/examples/'
            + fn + '.pdf',
            bbox_inches='tight',
            pad_inches=0)
        plt.clf()