def gaia_motion_analysis(data, norm=False, class_col='CLASS_PHOTO'):
    """
    Print per-class statistics of the Gaia motion columns and plot their distributions.

    Rows with a missing value in any of ``parallax``, ``pmra`` or ``pmdec`` are dropped first.
    For every class in ``BASE_CLASSES`` and every motion column a normal distribution is fitted;
    the fitted mean and sigma, the standard error of the mean and the median are stored in a
    table that is printed per class. One distribution figure is created per (class, motion).

    :param data: data frame holding the motion columns and ``class_col``
    :param norm: if truthy, the ``<motion>_norm`` columns are used instead for the ``QSO`` class
    :param class_col: name of the column with the class labels
    :return: None
    """
    base_motions = ['parallax', 'pmra', 'pmdec']

    # Keep only rows in which all three motion measurements are present
    movement_mask = ~data[['parallax', 'pmdec', 'pmra']].isnull().any(axis=1)
    data_movement = data.loc[movement_mask]

    for class_name in BASE_CLASSES:
        # Logical `and` rather than bitwise `&`: the latter gives wrong answers for truthy
        # non-bool flags (e.g. 2 & True == 0)
        if norm and class_name == 'QSO':
            motions = [m + '_norm' for m in base_motions]
        else:
            motions = list(base_motions)

        result_df = pd.DataFrame(index=['mean', 'sigma', 'mean_error', 'median'], columns=motions)
        class_rows = data_movement.loc[data_movement[class_col] == class_name]

        for motion in motions:
            data_of_interest = class_rows[motion]

            mu, sigma = stats.norm.fit(data_of_interest)
            result_df.loc['mean', motion] = mu
            result_df.loc['sigma', motion] = sigma
            # Standard error of the mean
            result_df.loc['mean_error', motion] = sigma / math.sqrt(data_of_interest.shape[0])
            result_df.loc['median', motion] = np.median(data_of_interest)

            plt.figure()
            sns.distplot(data_of_interest, color=get_cubehelix_palette(1)[0], kde_kws=dict(bw=0.5))
            # Only the plain parallax plot gets a fixed x range
            if motion == 'parallax':
                plt.xlim((-6, 6))
            plt.ylabel(class_name)

        print('{}:'.format(class_name))
        print(result_df)
def plot_z_hists(preds, z_max=None):
    """
    Plot peak-normalized step histograms of redshift for the QSO and GALAXY classes.

    Two figures are shown: column ``Z`` split by ``CLASS`` and column ``Z_PHOTO`` split by
    ``CLASS_PHOTO``. Each histogram uses 40 bins and is divided by its highest bin count.

    :param preds: data frame with the columns ``Z``, ``Z_PHOTO``, ``CLASS`` and ``CLASS_PHOTO``
    :param z_max: keep only rows with ``Z <= z_max``; ``None`` (default) keeps all rows
    :return: None
    """
    # A column cannot be meaningfully compared against None, so only filter when a limit is given
    preds_zlim = preds if z_max is None else preds.loc[preds['Z'] <= z_max]

    to_plot = [
        ('Z', 'CLASS'),
        ('Z_PHOTO', 'CLASS_PHOTO'),
    ]
    color_palette = get_cubehelix_palette(len(BASE_CLASSES))

    for x_col, cls_col in to_plot:
        is_cls_photo = (cls_col == 'CLASS_PHOTO')

        plt.figure()
        for i, cls in enumerate(['QSO', 'GALAXY']):
            hist, bins = np.histogram(preds_zlim.loc[preds_zlim[cls_col] == cls][x_col], bins=40)

            # Normalize to the peak; a class without any object stays a flat zero line instead of
            # dividing by zero
            peak = hist.max()
            hist_norm = hist / peak if peak > 0 else hist.astype(float)

            # x and y are passed by keyword: newer seaborn releases no longer accept them positionally
            ax = sns.lineplot(x=bins[:-1], y=hist_norm, drawstyle='steps-post',
                              label=get_plot_text(cls, is_cls_photo), color=color_palette[i])
            ax.lines[i].set_linestyle(get_line_style(i))

        plt.xlabel(get_plot_text(x_col))
        plt.ylabel('normalized counts per bin')
        plt.legend(framealpha=1.0)
        plt.show()
def precision_z_report(predictions, col_true='CLASS', z_max=None):
    """
    Plot, for every predicted class, the ``Z_PHOTO`` histograms of its objects split by true class.

    One figure is shown per class in ``BASE_CLASSES`` (used as the predicted class). Inside a
    figure all true classes share the same 40 bin edges and the y axis is logarithmic.

    :param predictions: data frame with the columns ``Z``, ``Z_PHOTO``, ``CLASS_PHOTO`` and ``col_true``
    :param col_true: name of the column with the true class labels
    :param z_max: keep only rows with ``Z <= z_max``; ``None`` (default) keeps all rows
    :return: None
    """
    # A column cannot be meaningfully compared against None, so only filter when a limit is given
    if z_max is None:
        predictions_zlim = predictions
    else:
        predictions_zlim = predictions.loc[predictions['Z'] <= z_max]

    color_palette = get_cubehelix_palette(len(BASE_CLASSES))

    for cls_pred in BASE_CLASSES:
        predicted_rows = predictions_zlim.loc[predictions_zlim['CLASS_PHOTO'] == cls_pred]
        photo_class_as_dict = {
            cls_true: predicted_rows.loc[predicted_rows[col_true] == cls_true, 'Z_PHOTO']
            for cls_true in BASE_CLASSES
        }

        plt.figure()

        # Common bin edges over all true classes; built from BASE_CLASSES rather than hard-coded
        # class names so it always matches the keys of the dictionary above
        _, bin_edges = np.histogram(
            np.hstack([photo_class_as_dict[cls_true] for cls_true in BASE_CLASSES]), bins=40)

        for i, cls_true in enumerate(BASE_CLASSES):
            hist_kws = {
                'alpha': 1.0,
                'histtype': 'step',
                'linewidth': 1.5,
                'linestyle': get_line_style(i),
            }
            label = '{}'.format(get_plot_text(cls_true))
            ax = sns.distplot(photo_class_as_dict[cls_true], label=label, bins=bin_edges, kde=False,
                              rug=False, color=color_palette[i], hist_kws=hist_kws)
            ax.set(yscale='log')

        plt.title(get_plot_text(cls_pred, is_photo=True))
        plt.xlabel(get_plot_text('Z_PHOTO'))
        plt.ylabel('counts per bin')
        plt.legend(loc='upper right', framealpha=1.0)
        plt.show()
def plot_linear_data(data, annotations=True):
    """
    Draw dashed reference curves ``y = 10^(scale * m - bias)`` on the current figure.

    :param data: sequence of ``(scale, bias, x_lim)`` tuples, where ``x_lim`` holds the first and
        the last x value of the curve; x is sampled every 0.25
    :param annotations: if true, the legend label is prefixed with ``'euclidean '`` for a scale of
        0.6 and with ``'eBOSS '`` for any other scale
    :return: None
    """
    color_palette = get_cubehelix_palette(len(data))

    for i, (scale, bias, x_lim) in enumerate(data):
        label_base = '$10^{' + '{}'.format(scale) + ' * m'
        if annotations:
            prefix = 'euclidean ' if scale == 0.6 else 'eBOSS '
            label_base = prefix + label_base

        x_linear = np.arange(x_lim[0], x_lim[1] + 0.25, 0.25)
        y_linear = [10**(scale * m - bias) for m in x_linear]

        # The sign is written explicitly and followed by the magnitude; abs() avoids the '-0.0'
        # that negating a zero float bias would put into the label
        sign = ' - ' if bias > 0 else ' + '
        label_bias = sign + str(abs(bias)) + '}$'

        plt.plot(x_linear, y_linear, '--', c=color_palette[i], label=(label_base + label_bias))
def spatial_number_density(data_dict, nside=128, z_bin_step=0.5, z_bin_size=0.5, cosmo_model=cosmo_wmap9,
                           z_max=None, legend_size=None):
    """
    Plot the mean per-pixel spatial number density as a function of redshift for several data sets.

    For every data set the redshift range is cut into bins. In each bin a weighted map is built
    with ``get_map`` (weights from column ``v_weight``), the values of the pixels selected by the
    data set's footprint map are divided by the comoving volume of the bin and by the fraction of
    the sky covered by one pixel, and a normal distribution is fitted to them. The fitted mean is
    plotted with a band of +/- half the standard error of the mean.

    :param data_dict: mapping ``name -> (data frame, footprint map)``; the frame needs ``v_weight``,
        a redshift column (``Z`` or else ``Z_PHOTO``) and coordinates (``RAJ2000``/``DECJ2000`` or
        else ``RA``/``DEC``); non-zero entries of the footprint map select the pixels to use
    :param nside: map resolution handed to ``hp.nside2pixarea`` and ``get_map``
    :param z_bin_step: distance between the centres of consecutive redshift bins
    :param z_bin_size: full width of a redshift bin
    :param cosmo_model: object providing ``comoving_volume(z)`` whose result has a ``.value``
    :param z_max: right limit of the x axis; ``None`` leaves it automatic
    :param legend_size: legend font size; if not given, the legend texts are set to size 9
    :return: None
    """
    pixel_area = hp.nside2pixarea(nside, degrees=True)
    # 41253.0 is presumably the area of the full sky in square degrees - TODO confirm
    volume_proportion = pixel_area / 41253.0
    z_half_bin_size = z_bin_size / 2

    plt.subplots()

    x_col = 'z'
    y_col = r'spatial density [N / comoving Mpc$^3$]'

    # Collect one frame per data set and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and copied the accumulated frame on every call
    frames = []
    for data_name, (data, footprint_map) in data_dict.items():
        z_column = 'Z' if 'Z' in data else 'Z_PHOTO'

        # Bin centres and the comoving volume enclosed between the edges of every bin
        steps = np.arange(data[z_column].min() + z_half_bin_size, data[z_column].max() + z_half_bin_size,
                          z_bin_step)
        comoving_volumes = np.array([
            (cosmo_model.comoving_volume(step + z_half_bin_size)
             - cosmo_model.comoving_volume(step - z_half_bin_size)).value
            for step in steps
        ])

        mask_non_zero = np.nonzero(footprint_map)
        print('{} area: {:.2f} deg^2'.format(data_name, len(mask_non_zero[0]) * pixel_area))

        (ra_col, dec_col) = ('RAJ2000', 'DECJ2000') if ('RAJ2000' in data) else ('RA', 'DEC')

        density_v_max_mean, density_v_max_error = [], []
        for step, bin_volume in zip(steps, comoving_volumes):
            in_bin = (data[z_column] > step - z_half_bin_size) & (data[z_column] < step + z_half_bin_size)
            data_step = data.loc[in_bin]

            step_map, _, _ = get_map(data_step[ra_col], data_step[dec_col], v=data_step['v_weight'].values,
                                     nside=nside)
            v_max_values = step_map[mask_non_zero] / bin_volume / volume_proportion

            mu, sigma = stats.norm.fit(v_max_values)
            density_v_max_mean.append(mu)
            # Standard error of the mean over the used pixels
            density_v_max_error.append(sigma / math.sqrt(v_max_values.shape[0]))

        frames.append(pd.DataFrame({
            x_col: steps,
            y_col: np.array(density_v_max_mean),
            'error': np.array(density_v_max_error),
            'data name': [data_name] * len(steps),
        }))

    to_plot_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    color_palette = get_cubehelix_palette(len(data_dict), reverse=False) if len(data_dict) > 1 else [(0, 0, 0)]
    sns.lineplot(x=x_col, y=y_col, data=to_plot_df, hue='data name', palette=color_palette, style='data name',
                 markers=True, dashes=False)

    ax = plt.gca()
    for i, data_name in enumerate(to_plot_df['data name'].unique()):
        to_plot_single_data = to_plot_df.loc[to_plot_df['data name'] == data_name]
        half_error = to_plot_single_data['error'].values / 2
        lower = to_plot_single_data[y_col].values - half_error
        upper = to_plot_single_data[y_col].values + half_error
        ax.fill_between(to_plot_single_data[x_col], lower, upper, color=color_palette[i], alpha=0.2)

    plt.xlim(right=z_max)
    plt.yscale('log')

    prop = {'size': legend_size} if legend_size else {}
    ax.legend(loc='upper right', framealpha=1.0, prop=prop)
    # Fall back to the fixed size only when the caller did not ask for one; applying it
    # unconditionally silently discarded `legend_size`
    if not legend_size:
        plt.setp(ax.get_legend().get_texts(), fontsize='9')

    plt.show()
def number_counts(data_dict, linear_data, nside=128, step=.1, band_column='MAG_GAAP_r', legend_loc='upper left',
                  legend_size=None):
    """
    Plot the cumulative surface density of objects as a function of limiting magnitude.

    For every data set and every magnitude limit, a map of the objects with a magnitude below the
    limit is built with ``get_map``; the values of the pixels selected by the data set's footprint
    map are divided by the pixel area and a normal distribution is fitted to them. The fitted mean
    is plotted with a band of +/- half the standard error of the mean, together with the reference
    curves drawn by ``plot_linear_data``.

    :param data_dict: mapping ``name -> (data frame, footprint map)``; the frame needs ``band_column``
        and coordinates (``RAJ2000``/``DECJ2000`` or else ``RA``/``DEC``); non-zero entries of the
        footprint map select the pixels to use
    :param linear_data: passed unchanged to ``plot_linear_data``
    :param nside: map resolution handed to ``hp.nside2pixarea`` and ``get_map``
    :param step: spacing of the magnitude limits
    :param band_column: name of the magnitude column
    :param legend_loc: legend location
    :param legend_size: legend font size; if not given, the legend texts are set to size 9
    :return: None
    """
    pixel_area = hp.nside2pixarea(nside, degrees=True)

    plt.subplots()

    x_col = pretty_print_magnitude(band_column)
    y_col = r'surface density (≤ m) [N / deg$^2$]'

    # Collect one frame per data set and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and copied the accumulated frame on every call
    frames = []
    for data_name, (data, footprint_map) in data_dict.items():
        (ra_col, dec_col) = ('RAJ2000', 'DECJ2000') if 'RAJ2000' in data else ('RA', 'DEC')

        mask_non_zero = np.nonzero(footprint_map)
        print('{} area: {:.2f} deg^2'.format(data_name, len(mask_non_zero[0]) * pixel_area))

        # Magnitude limits run between the rounded-up extremes of the band
        m_min = int(math.ceil(data[band_column].min()))
        m_max = int(math.ceil(data[band_column].max()))
        magnitude_arr = np.arange(m_min, m_max + step, step)

        density_mean_arr, density_error_arr = [], []
        for m_limit in magnitude_arr:
            data_m_max = data.loc[data[band_column] < m_limit]
            map_m_max, _, _ = get_map(data_m_max[ra_col], data_m_max[dec_col], nside=nside)
            densities = map_m_max[mask_non_zero] / pixel_area

            mu, sigma = stats.norm.fit(densities)
            density_mean_arr.append(mu)
            # Standard error of the mean over the used pixels
            density_error_arr.append(sigma / math.sqrt(densities.shape[0]))

        frames.append(pd.DataFrame({
            x_col: magnitude_arr,
            y_col: density_mean_arr,
            'error': density_error_arr,
            'data name': [data_name] * len(magnitude_arr),
        }))

    to_plot_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    color_palette = get_cubehelix_palette(len(data_dict), reverse=True) if len(data_dict) > 1 else [(0, 0, 0)]
    sns.lineplot(x=x_col, y=y_col, data=to_plot_df, hue='data name', palette=color_palette, style='data name',
                 markers=True)

    plot_linear_data(linear_data)

    ax = plt.gca()
    for i, data_name in enumerate(to_plot_df['data name'].unique()):
        to_plot_single_data = to_plot_df.loc[to_plot_df['data name'] == data_name]
        half_error = to_plot_single_data['error'].values / 2
        lower = to_plot_single_data[y_col].values - half_error
        upper = to_plot_single_data[y_col].values + half_error
        ax.fill_between(to_plot_single_data[x_col], lower, upper, color=color_palette[i], alpha=0.2)

    plt.yscale('log')

    prop = {'size': legend_size} if legend_size else {}
    ax.legend(loc=legend_loc, framealpha=1.0, prop=prop)
    # Fall back to the fixed size only when the caller did not ask for one; applying it
    # unconditionally silently discarded `legend_size`
    if not legend_size:
        plt.setp(ax.get_legend().get_texts(), fontsize='9')

    plt.show()
def plot_cleaning_metrics(preds_class, cls, metrics_to_plot, thresholds, step, cleaning, y_lim=None):
    """
    Plot metrics computed on ``Z`` vs ``Z_PHOTO`` as a function of a cleaning threshold.

    The i-th metric is drawn on the i-th of two y axes sharing one x axis (left, then right).
    For every threshold the rows kept are those with ``<cls>_PHOTO >= threshold`` when
    ``cleaning == 'clf_proba'`` and those with ``Z_PHOTO_STDDEV <= threshold`` otherwise.

    :param preds_class: data frame with ``Z``, ``Z_PHOTO`` and the column used for cleaning
    :param cls: class name, used for the ``<cls>_PHOTO`` column and for the x axis label
    :param metrics_to_plot: sequence of ``(name, func)`` or ``(name, (func, std_func))``; both
        callables take ``(Z, Z_PHOTO)``; when ``std_func`` is given, its result divided by the
        square root of the number of kept rows is drawn as an error band
    :param thresholds: threshold values to evaluate
    :param step: added to the last threshold to get one extra point for the metric named
        ``'fraction of objects'``
    :param cleaning: ``'clf_proba'`` or ``'z_std_dev'`` (the latter also inverts the x axis)
    :param y_lim: optional y limits for the left axis
    :return: y limits of the left axis
    """
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    label = '{} probability threshold' if cleaning == 'clf_proba' else '{} redshift uncertainity threshold'
    ax1.set_xlabel(label.format(get_plot_text(cls, is_photo=True)))
    if cleaning == 'z_std_dev':
        ax1.invert_xaxis()

    ax_arr = [ax1, ax2]
    plotted_arr = []
    color_palette = get_cubehelix_palette(len(metrics_to_plot))

    for i, (metric_name, metric_func) in enumerate(metrics_to_plot):
        # A metric may come as a (value function, standard deviation function) pair
        metric_std_func = None
        if isinstance(metric_func, tuple):
            metric_func, metric_std_func = metric_func[0], metric_func[1]

        thresholds_to_use = thresholds if metric_name != 'fraction of objects' else (
            np.append(thresholds, [thresholds[-1] + step]))

        # Get metrics in thresholds
        metric_values = []
        metric_errors = []
        for thr in thresholds_to_use:
            if cleaning == 'clf_proba':
                preds_lim = preds_class.loc[preds_class['{}_PHOTO'.format(cls)] >= thr]
            else:
                preds_lim = preds_class.loc[preds_class['Z_PHOTO_STDDEV'] <= thr]

            # Get mean and standard error
            metric_mean = metric_func(preds_lim['Z'], preds_lim['Z_PHOTO'])
            metric_error = None
            if metric_std_func is not None:
                metric_std = metric_std_func(preds_lim['Z'], preds_lim['Z_PHOTO'])
                metric_error = metric_std / math.sqrt(preds_lim.shape[0])

            metric_values.append(np.around(metric_mean, 4))
            metric_errors.append(metric_error)

        # Make plots
        plotted, = ax_arr[i].plot(thresholds_to_use, metric_values, label=metric_name, color=color_palette[i],
                                  linestyle=get_line_style(i))

        # Decide on the band by whether errors were computed at all, not by the truthiness of the
        # first one: a first standard error of exactly 0 must not hide the whole band
        if metric_std_func is not None:
            half_errors = np.array(metric_errors) / 2
            lower = np.array(metric_values) - half_errors
            upper = np.array(metric_values) + half_errors
            ax_arr[i].fill_between(thresholds_to_use, lower, upper, color=color_palette[i], alpha=0.2)

        ax_arr[i].tick_params(axis='y', labelcolor=color_palette[i])
        ax_arr[i].set_ylabel(metric_name)
        plotted_arr.append(plotted)

    ax_arr[1].yaxis.grid(False)
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.legend(handles=plotted_arr, loc='lower left', framealpha=1.0)

    if y_lim:
        ax_arr[0].set_ylim(y_lim)

    plt.show()

    return ax_arr[0].get_ylim()
def proba_motion_analysis(data_x_gaia, motions=None, x_lim=(0.3, 1), step=0.004, mean_y_lines=None):
    """
    Plot motion statistics of photometric QSOs against the minimum classification probability.

    Rows with ``CLASS_PHOTO == 'QSO'`` are cut at increasing ``QSO_PHOTO`` thresholds. For every
    cut and every motion column a normal distribution is fitted; the mean (with a band of +/- half
    its standard error), the sigma and the median are each drawn in a separate figure.

    :param data_x_gaia: data frame with ``CLASS_PHOTO``, ``QSO_PHOTO`` and the motion columns
    :param motions: motion column names, ``['parallax']`` if not given
    :param x_lim: first threshold and the exclusive end of the threshold range
    :param step: spacing of the thresholds
    :param mean_y_lines: optional iterable of ``(name, y, y_err)``; each one is drawn in the figure
        of the mean as a dashed horizontal line with a band of +/- ``y_err / 2`` and a text
    :return: None
    """
    if motions is None:
        motions = ['parallax']
    mu_dict, sigma_dict, median_dict, error_dict = (defaultdict(list) for _ in range(4))

    # Get QSOs
    is_qso = data_x_gaia['CLASS_PHOTO'] == 'QSO'
    qso_x_gaia = data_x_gaia.loc[is_qso]

    # Limit QSOs to proba thresholds
    thresholds = np.arange(x_lim[0], x_lim[1], step)
    for thr in thresholds:
        qso_x_gaia_limited = qso_x_gaia.loc[qso_x_gaia['QSO_PHOTO'] >= thr]
        for motion in motions:
            motion_values = qso_x_gaia_limited[motion]

            # Get stats and store them
            mu, sigma = stats.norm.fit(motion_values)
            mu_dict[motion].append(mu)
            sigma_dict[motion].append(sigma)
            median_dict[motion].append(np.median(motion_values))
            error_dict[motion].append(sigma / math.sqrt(len(qso_x_gaia_limited)))

    # Plot statistics: (name, values per motion, errors per motion or None)
    to_plot = [
        ('mean', mu_dict, error_dict),
        ('sigma', sigma_dict, None),
        ('median', median_dict, None),
    ]
    color_palette = get_cubehelix_palette(len(motions))
    several_motions = len(motions) != 1

    for stat_name, values_by_motion, errors_by_motion in to_plot:
        plt.figure()

        label = None
        for i, motion in enumerate(motions):
            # Curves are only labelled when there is more than one of them
            if several_motions:
                label = motion

            vals = values_by_motion[motion]
            errors = None if errors_by_motion is None else errors_by_motion[motion]

            plt.plot(thresholds, vals, label=label, color=color_palette[i], linestyle=get_line_style(i))
            ax = plt.gca()
            if errors:
                half_errors = np.array(errors) / 2
                ax.fill_between(thresholds, np.array(vals) - half_errors, np.array(vals) + half_errors,
                                color=color_palette[i], alpha=0.2)

        if stat_name == 'mean' and mean_y_lines is not None:
            # Remember the x range so that the reference bands do not stretch it
            axis_x_lim = ax.get_xlim()
            thr_x_lim = np.arange(axis_x_lim[0], axis_x_lim[1] + 0.01, 0.01)
            for line_name, y, y_err in mean_y_lines:
                plt.axhline(y, linestyle='--', color='b')
                ax.fill_between(thr_x_lim, y - y_err / 2, y + y_err / 2, color='b', alpha=0.2)
                # Text offsets are relative to the threshold span and to the span of the values
                # of the last plotted motion
                text_x = thresholds[0] + 0.01 * abs(max(thresholds) - min(thresholds))
                text_y = y + 0.06 * abs(max(vals) - min(vals))
                plt.text(text_x, text_y, line_name)
            ax.set_xlim(axis_x_lim)

        plt.xlabel('minimum classification probability')
        plt.ylabel('{} parallax {}'.format(stat_name, '[mas]'))
        if label:
            plt.legend(framealpha=1.0)