from os import path
from glob import glob

import numpy as np
from astropy.table import Table
from joblib import Memory
from sklearn import metrics

from apolo.clustering import cplots, ctools
from apolo.data import dirconfig

# The following helper imports are assumed from the package layout;
# adjust the module paths if they differ in your checkout.
from apolo.catalog_proc.utils import read_fits_table, add_pseudocolor
from apolo.test_tools.utils import setup_region, summarize_score


def clustering_routine(region_of_interest, tile, space_param='Phot+PM',
                       data_dir=dirconfig.cross_vvv_2mass_combis_gaia,
                       out_dir=dirconfig.test_knowncl,
                       radial_size=2.0, cluster_selection_method='leaf'):
    """
    This routine takes a cluster object and a tile and performs the clustering
    using the best values from the Silhouette score (assuming mcs=ms) and the
    data in the given data_dir directory.

    :param region_of_interest: StellarCluster or EmptyRegion object
    :param tile: Tile object
    :param space_param: String indicating the parameter space
    :param data_dir: String. Path to the input catalog dir
    :param out_dir: String. Path to the output dir
    :param radial_size: Float. Region size in units of the nominal cluster radius
    :param cluster_selection_method: String. HDBSCAN cluster selection method ('leaf' or 'eom')
    :return: None
    """
    print(region_of_interest, tile)
    catalog_file = tile.get_file(data_dir)
    tile_region = setup_region(catalog_file, region_of_interest, times=radial_size)

    scores = perform_grid_score(tile_region,
                                mcs_range=(5, 50),
                                ms_range=(5, 50),
                                space_param=space_param,
                                cols=None,
                                cluster_selection_method=cluster_selection_method,
                                noise_cluster=False,
                                make_plots=False,
                                out_dir=out_dir)

    score_filepath = path.join(out_dir, 'scores_' + region_of_interest.name + '.ecsv')
    scores.write(score_filepath, format='ascii.ecsv')

    summarized_scores = summarize_score(scores)
    score_filepath = path.join(out_dir, 'score-summary_' + region_of_interest.name + '.ecsv')
    summarized_scores.write(score_filepath, format='ascii.ecsv')

    best_mcs = summarized_scores['mcs_start'][0]
    best_ms = summarized_scores['ms'][0]

    ctools.do_hdbscan(tile_region, space_param=space_param,
                      cols=None,
                      min_cluster_size=int(best_mcs),
                      min_samples=int(best_ms),
                      cluster_selection_method=cluster_selection_method)

    cplots.plot_clustered_data(tile_region, out_dir, summarized_scores)
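
# Usage sketch: a minimal driver for clustering_routine. `objects.cl74` and
# `which_tile` are taken from the example scripts further below; the
# `apolo.test_tools.utils` module path is assumed, and the parameter values
# simply mirror the function defaults.
from apolo.data import objects
from apolo.test_tools.utils import which_tile

# Pick a known stellar cluster and locate the tile that contains it
sc = objects.cl74
sc_tile = which_tile(sc, objects.all_tiles)[0]

# Run the full grid-score + HDBSCAN pipeline with the defaults
clustering_routine(sc, sc_tile, space_param='Phot+PM', radial_size=2.0)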
def fix_hyperparms_routine(region_of_interest, tile, min_cluster_size, min_samples,
                           space_param='Phot+PM',
                           data_dir=dirconfig.cross_vvv_2mass_combis_gaia,
                           out_dir=dirconfig.test_knowncl,
                           radial_size=2.0, cluster_selection_method='leaf'):
    """
    This routine takes a cluster object and a tile and performs the clustering
    using the provided hyper-parameters (mcs, ms, csm) and the data in the
    given data_dir directory.

    Parameters
    ----------
    region_of_interest : StellarCluster or EmptyRegion object
    tile : Tile object
    min_cluster_size : int. HDBSCAN min_cluster_size (mcs)
    min_samples : int. HDBSCAN min_samples (ms)
    space_param : str. Parameter space to cluster in
    data_dir : str. Path to the input catalog dir
    out_dir : str. Path to the output dir
    radial_size : float. Region size in units of the nominal cluster radius
    cluster_selection_method : str. HDBSCAN cluster selection method (csm)

    Returns
    -------
    None
    """
    print(region_of_interest, tile)
    catalog_file = tile.get_file(data_dir)
    tile_region = setup_region(catalog_file, region_of_interest, times=radial_size)

    ctools.do_hdbscan(tile_region, space_param=space_param,
                      cols=None,
                      min_cluster_size=int(min_cluster_size),
                      min_samples=int(min_samples),
                      cluster_selection_method=cluster_selection_method)

    cplots.plot_clustered_data(tile_region, out_dir)
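
# Usage sketch, reusing `sc` and `sc_tile` from the sketch above: re-run a
# region with hand-picked hyper-parameters. The mcs/ms values here are
# illustrative, not recommendations.
fix_hyperparms_routine(sc, sc_tile,
                       min_cluster_size=10, min_samples=20,
                       space_param='Phot+PM',
                       cluster_selection_method='leaf')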
def perform_kounkel_grid_score(input_table, range_params=(5, 16), step=1,
                               space_param='Phot+PM', cols=None,
                               cluster_selection_method='leaf'):
    """
    This function performs the clustering algorithm over a 'small' region of
    the data where we know beforehand that a stellar cluster exists. It runs a
    simpler version of perform_grid_score that scans using
    min_cluster_size = min_samples (à la Kounkel et al. 2019), which makes it
    run faster. The returned values correspond to the Silhouette score, which
    indicates how good the clustering is, together with the number of clusters
    detected for each combination of parameters. Results are returned as an
    astropy table, ordered from best to worst score.
    """
    pmin, pmax = range_params

    # Make a grid of parameters
    grid_of_params = np.arange(pmin, pmax, step)

    results = Table(names=('mcs', 'ms', 'n_clusters', 'score'))

    for param_value in grid_of_params:
        copy_table = input_table.copy()
        data, clusterer = ctools.do_hdbscan(copy_table,
                                            space_param=space_param,
                                            cols=cols,
                                            min_cluster_size=int(param_value),
                                            min_samples=int(param_value),
                                            cluster_selection_method=cluster_selection_method)

        # The noise label (-1) counts as one group here, so the Silhouette
        # score is only computed when at least two distinct labels exist
        n_cluster = len(np.unique(clusterer.labels_))
        if n_cluster > 1:
            score = metrics.silhouette_score(data, clusterer.labels_, metric='euclidean')
            results.add_row([param_value, param_value, n_cluster, score])
        else:
            score = np.nan
        print(param_value, score)

    results.sort('score', reverse=True)
    return results
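
# Usage sketch: run the Kounkel-style scan on a region prepared the same way
# the routines above do it (names reuse the earlier sketches; the catalog dir
# is this module's default).
catalog_file = sc_tile.get_file(dirconfig.cross_vvv_2mass_combis_gaia)
region = setup_region(catalog_file, sc, times=2.0)

kounkel_scores = perform_kounkel_grid_score(region, range_params=(5, 16), step=1)
best = kounkel_scores[0]  # table is sorted from best to worst
print('best mcs=ms:', best['mcs'], 'n_clusters:', best['n_clusters'], 'score:', best['score'])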
def perform_grid_score(input_table, mcs_range=(5, 16), ms_range=(5, 11), step=1,
                       space_param='Phot+PM', cols=None,
                       cluster_selection_method='leaf', noise_cluster=True,
                       make_plots=False, out_dir='', memory=Memory(location=None)):
    """
    This function performs the clustering algorithm over a 'small' region of
    the data where we know beforehand that a cluster exists. It returns the
    Silhouette score, which indicates how good the clustering is, together
    with the number of clusters detected for each combination of parameters.
    Results are returned as an astropy table, ordered from best to worst
    score. cluster_selection_method is passed through to hdbscan. If
    noise_cluster is True, noise sources are excluded from the clustering
    quality scores.
    """
    mcs_min, mcs_max = mcs_range
    ms_min, ms_max = ms_range

    # Make a grid of parameters
    r_min_cluster_size = np.arange(mcs_min, mcs_max, step)
    r_min_samples = np.arange(ms_min, ms_max, step)
    grid_of_params = ((mcs, ms) for ms in r_min_samples for mcs in r_min_cluster_size)

    results = Table(names=('mcs', 'ms', 'n_clusters', 'score',
                           'min_score', 'max_score', 'ch_score'))

    for mcs, ms in grid_of_params:
        copy_table = input_table.copy()
        data, clusterer = ctools.do_hdbscan(copy_table,
                                            space_param=space_param,
                                            cols=cols,
                                            min_cluster_size=int(mcs),
                                            min_samples=int(ms),
                                            cluster_selection_method=cluster_selection_method,
                                            memory=memory)

        # Number of groups, counting the noise label (-1) as one group
        n_cluster = clusterer.labels_.max() + 2

        if noise_cluster:
            # Remove noise sources (label -1) before computing the scores
            not_noise = np.where(clusterer.labels_ != -1)
            data = data[not_noise]
            clusterer.labels_ = clusterer.labels_[not_noise]

        # silhouette_score requires at least two distinct labels
        if len(np.unique(clusterer.labels_)) > 1:
            # The Silhouette score is the mean of the per-source values,
            # so the samples are computed only once
            silhouette_values = metrics.silhouette_samples(data, clusterer.labels_,
                                                           metric='euclidean')
            score = np.mean(silhouette_values)
            min_silhouette_value = np.min(silhouette_values)
            max_silhouette_value = np.max(silhouette_values)
            ch_score = metrics.calinski_harabasz_score(data, clusterer.labels_)

            results.add_row([mcs, ms, n_cluster, score,
                             min_silhouette_value, max_silhouette_value, ch_score])

            copy_table.meta.update({'SCORE': score})
            copy_table.meta.update({'MAXSCORE': max_silhouette_value})
            copy_table.meta.update({'CH_SCORE': ch_score})

            if make_plots:
                cplots.plot_clustered_data(copy_table, out_dir)

    results.sort(['score'], reverse=True)
    return results
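
# Usage sketch: the memory parameter accepts a joblib Memory object, so
# repeated scans over the same region can reuse cached HDBSCAN computations.
# The cache location is an arbitrary example; `region` comes from the sketch
# above.
from joblib import Memory

cache = Memory(location='./hdbscan_cache', verbose=0)

grid_scores = perform_grid_score(region,
                                 mcs_range=(5, 20), ms_range=(5, 10),
                                 space_param='Phot+PM',
                                 noise_cluster=False,
                                 memory=cache)
print(grid_scores[:5])  # five best (mcs, ms) combinations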
def tile_routine(tile_file, output_dir, space_param='Mini-alternative'):
    """
    This function implements the routine to be used on individual tile files:
    read the file, add the pseudo-color column, scan the hyper-parameter grid
    and run HDBSCAN with the best-scoring values.

    Parameters
    ----------
    tile_file : str. Path to the tile file
    output_dir : str. Path to the output dir
    space_param : str. Parameter space to cluster in

    Returns
    -------
    bool. True if the tile was processed; False if it was skipped or no
    clusters were found.
    """
    table = read_fits_table(tile_file)
    tile_name = path.splitext(path.basename(tile_file))[0]

    # Check if output files already exist; if so, skip this tile
    expected_filename = path.join(output_dir, tile_name)
    if glob(expected_filename + '*'):
        print(f'Tile {tile_name} already processed. Skipping...')
        return False

    table.meta.update({'FILE': path.basename(tile_file)})
    table.meta.update({'TILENAME': tile_name})
    print('Processing', tile_name)
    add_pseudocolor(table, color_excess=1.8)
    scores = perform_grid_score(table,
                                mcs_range=(5, 50),
                                ms_range=(5, 50),
                                space_param=space_param,
                                cols=None,
                                cluster_selection_method='leaf',
                                noise_cluster=False,
                                make_plots=False,
                                out_dir=output_dir)

    # If the clustering was not successful, return False
    if len(scores) == 0:
        print('-' * 20)
        print('No clusters found in tile: ', tile_name)
        print('-' * 20)
        return False

    score_filepath = path.join(output_dir, 'scores_' + tile_name + '.ecsv')
    scores.write(score_filepath, format='ascii.ecsv')

    summarized_scores = summarize_score(scores)
    score_filepath = path.join(output_dir, 'summary_' + tile_name + '.ecsv')
    summarized_scores.write(score_filepath, format='ascii.ecsv')

    best_mcs = summarized_scores['mcs_start'][0]
    best_ms = summarized_scores['ms'][0]

    ctools.do_hdbscan(table, space_param=space_param,
                      cols=None,
                      min_cluster_size=int(best_mcs),
                      min_samples=int(best_ms),
                      cluster_selection_method='leaf')

    cplots.plot_clustered_data(table, output_dir, summarized_scores)
    return True
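
# Usage sketch: because tile_routine skips tiles whose outputs already exist,
# it lends itself to a resumable batch loop. The '*.fits' pattern and the
# directories (this module's defaults) are illustrative.
from glob import glob
from os import path

tile_files = sorted(glob(path.join(dirconfig.cross_vvv_2mass_combis_gaia, '*.fits')))
for tf in tile_files:
    tile_routine(tf, dirconfig.test_knowncl, space_param='Mini-alternative')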
from apolo.clustering import cplots, ctools
from apolo.data import dirconfig, objects

# utils provides which_tile and setup_region; the module path is assumed here
from apolo.test_tools import utils

# Define the output dir
output_dir = dirconfig.test_knowncl

# Select a stellar cluster and its respective tile
stellar_cluster = objects.cl74
tile = utils.which_tile(stellar_cluster, objects.all_tiles)[0]

# Define the catalog and the region (in terms of l and b) to be explored
catalog_dir = dirconfig.cross_vvv_2mass_combis_gaia
catalog_file = tile.get_file(catalog_dir)  # Automatically finds the respective tile file inside catalog_dir
region = utils.setup_region(catalog_file, stellar_cluster, times=2.0)  # Only a region of 2 times the nominal SC radius

# Perform the HDBSCAN clustering. This function updates the region table,
# adding two columns (label and probabilities), and adds metadata about the
# clustering itself.
data, clusterer = ctools.do_hdbscan(region,
                                    space_param='Phot+PM',  # or 'Colors+PM'
                                    min_cluster_size=5,
                                    min_samples=13,
                                    cluster_selection_method='leaf')

# This function produces multiple plots to help visualize the data and saves
# the clustering results in a fits file.
cplots.plot_clustered_data(region, output_dir)
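
# Usage sketch: since do_hdbscan stores its output in the region table itself,
# cluster members can be selected directly from the added columns (the names
# follow the comment above; the probability cut is illustrative).
import numpy as np

members = region[(region['label'] == 0) & (region['probabilities'] > 0.5)]
print(len(members), 'likely members out of', len(region), 'sources')
print('labels found:', np.unique(region['label']))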
"""
Simple script to perform a clustering over an entire tile.
"""

from datetime import datetime

from apolo.catalog_proc.utils import make_dir
from apolo.clustering import cplots, ctools
from apolo.data import dirconfig, objects

# utils provides read_catalog and add_pseudocolor; the module path is assumed
from apolo.catalog_proc import utils

# Select tile
tile = objects.t070
print('Tile:', tile)

# Select the set of catalogs to be used
catalog_dir = dirconfig.cross_vvv_2mass_combis_gaia
catalog_file = tile.get_file(catalog_dir)

# Read the catalog and add the pseudo-color column
table = utils.read_catalog(catalog_file)
utils.add_pseudocolor(table, color_excess=1.8)

# Perform hdbscan with the selected parameters, timing the run
start_time = datetime.now()
ctools.do_hdbscan(table, space_param='Phot+PM',
                  min_cluster_size=12,
                  min_samples=5,
                  cluster_selection_method='leaf')
print(datetime.now() - start_time)

# Save the results in the selected directory
make_dir(dirconfig.test_tiles)
cplots.plot_clustered_data(table, dirconfig.test_tiles)
from os import path

import numpy as np
from sklearn import metrics

from apolo.catalog_proc.utils import make_dir
from apolo.clustering import cplots, ctools
from apolo.data import dirconfig, objects
from apolo.test_tools.utils import setup_region, which_tile  # assumed module path

# Input and output dirs, taken from this package's defaults
data_dir = dirconfig.cross_vvv_2mass_combis_gaia
out_dir = dirconfig.test_knowncl

make_dir(out_dir)

far_end_cluster = objects.cl86
tile = which_tile(far_end_cluster, objects.all_tiles)[0]

# Alternatives: 'Colors+PM', 'Mini-alternative', 'Mini'
space_param = 'Mini-alternative'
mcs = 5
ms = 20

print(far_end_cluster, tile)
catalog_file = tile.get_file(data_dir)
tile_region = setup_region(catalog_file, far_end_cluster, times=2.0)
data, clusterer = ctools.do_hdbscan(tile_region, space_param=space_param,
                                    cols=None,
                                    min_cluster_size=mcs,
                                    min_samples=ms,
                                    cluster_selection_method='leaf')

print('unique labels:', np.unique(clusterer.labels_))
cplots.plot_clustered_data(tile_region, out_dir)

# Compute the silhouette width (SW) for every source
sw_i = metrics.silhouette_samples(data, clusterer.labels_, metric='euclidean')

# Compute the mean SW for one group (the per-source values above were
# computed with the noise sources included)
grupo = 0
match_label_grupo = np.where(clusterer.labels_ == grupo)
print('group:', grupo, ' sw:', np.mean(sw_i[match_label_grupo]))
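
# Usage sketch extending the script above: the same per-source silhouette
# values can be averaged per label to compare all detected groups at once,
# with the noise group (-1) shown for reference.
for label in np.unique(clusterer.labels_):
    mask = clusterer.labels_ == label
    print(f'group {label}: n={mask.sum()}, mean sw={np.mean(sw_i[mask]):.3f}')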