from os import path
from glob import glob

import numpy as np
from astropy.table import Table
from joblib import Memory
from sklearn import metrics

from apolo.clustering import cplots, ctools
from apolo.data import dirconfig

# The following helper imports are assumed from the package layout;
# adjust the module paths if they differ in your checkout.
from apolo.catalog_proc.utils import read_fits_table, add_pseudocolor
from apolo.test_tools.utils import setup_region, summarize_score


def clustering_routine(region_of_interest, tile, space_param='Phot+PM',
                       data_dir=dirconfig.cross_vvv_2mass_combis_gaia,
                       out_dir=dirconfig.test_knowncl,
                       radial_size=2.0, cluster_selection_method='leaf'):
    """
    This routine takes a cluster object and a tile and performs the clustering
    using the best values from the Silhouette score (assuming mcs=ms) and the
    data in the given data_dir directory.

    :param region_of_interest: StellarCluster or EmptyRegion object
    :param tile: Tile object
    :param space_param: String indicating the parameter space
    :param data_dir: String. Path to the input catalog dir
    :param out_dir: String. Path to the output dir
    :param radial_size: Float. Region size in units of the nominal cluster radius
    :param cluster_selection_method: String. HDBSCAN cluster selection method ('leaf' or 'eom')
    :return: None
    """
    print(region_of_interest, tile)
    catalog_file = tile.get_file(data_dir)
    tile_region = setup_region(catalog_file, region_of_interest, times=radial_size)

    scores = perform_grid_score(tile_region,
                                mcs_range=(5, 50),
                                ms_range=(5, 50),
                                space_param=space_param,
                                cols=None,
                                cluster_selection_method=cluster_selection_method,
                                noise_cluster=False,
                                make_plots=False,
                                out_dir=out_dir)

    score_filepath = path.join(out_dir, 'scores_' + region_of_interest.name + '.ecsv')
    scores.write(score_filepath, format='ascii.ecsv')

    summarized_scores = summarize_score(scores)
    score_filepath = path.join(out_dir, 'score-summary_' + region_of_interest.name + '.ecsv')
    summarized_scores.write(score_filepath, format='ascii.ecsv')

    best_mcs = summarized_scores['mcs_start'][0]
    best_ms = summarized_scores['ms'][0]

    ctools.do_hdbscan(tile_region, space_param=space_param,
                      cols=None,
                      min_cluster_size=int(best_mcs),
                      min_samples=int(best_ms),
                      cluster_selection_method=cluster_selection_method)

    cplots.plot_clustered_data(tile_region, out_dir, summarized_scores)
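
# Usage sketch: a minimal driver for clustering_routine. `objects.cl74` and
# `which_tile` are taken from the example scripts further below; the
# `apolo.test_tools.utils` module path is assumed, and the parameter values
# simply mirror the function defaults.
from apolo.data import objects
from apolo.test_tools.utils import which_tile

# Pick a known stellar cluster and locate the tile that contains it
sc = objects.cl74
sc_tile = which_tile(sc, objects.all_tiles)[0]

# Run the full grid-score + HDBSCAN pipeline with the defaults
clustering_routine(sc, sc_tile, space_param='Phot+PM', radial_size=2.0)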
def fix_hyperparms_routine(region_of_interest, tile, min_cluster_size, min_samples,
                           space_param='Phot+PM',
                           data_dir=dirconfig.cross_vvv_2mass_combis_gaia,
                           out_dir=dirconfig.test_knowncl,
                           radial_size=2.0, cluster_selection_method='leaf'):
    """
    This routine takes a cluster object and a tile and performs the clustering
    using the provided hyper-parameters (mcs, ms, csm) and the data in the
    given data_dir directory.

    Parameters
    ----------
    region_of_interest : StellarCluster or EmptyRegion object
    tile : Tile object
    min_cluster_size : int. HDBSCAN min_cluster_size (mcs)
    min_samples : int. HDBSCAN min_samples (ms)
    space_param : str. Parameter space to cluster in
    data_dir : str. Path to the input catalog dir
    out_dir : str. Path to the output dir
    radial_size : float. Region size in units of the nominal cluster radius
    cluster_selection_method : str. HDBSCAN cluster selection method (csm)

    Returns
    -------
    None
    """
    print(region_of_interest, tile)
    catalog_file = tile.get_file(data_dir)
    tile_region = setup_region(catalog_file, region_of_interest, times=radial_size)

    ctools.do_hdbscan(tile_region, space_param=space_param,
                      cols=None,
                      min_cluster_size=int(min_cluster_size),
                      min_samples=int(min_samples),
                      cluster_selection_method=cluster_selection_method)

    cplots.plot_clustered_data(tile_region, out_dir)
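
# Usage sketch, reusing `sc` and `sc_tile` from the sketch above: re-run a
# region with hand-picked hyper-parameters. The mcs/ms values here are
# illustrative, not recommendations.
fix_hyperparms_routine(sc, sc_tile,
                       min_cluster_size=10, min_samples=20,
                       space_param='Phot+PM',
                       cluster_selection_method='leaf')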
def perform_kounkel_grid_score(input_table, range_params=(5, 16), step=1,
                               space_param='Phot+PM', cols=None,
                               cluster_selection_method='leaf'):
    """
    This function performs the clustering algorithm over a 'small' region of
    the data where we know beforehand that a stellar cluster exists. It runs a
    simpler version of perform_grid_score that scans using
    min_cluster_size = min_samples (à la Kounkel et al. 2019), which makes it
    run faster. The returned values correspond to the Silhouette score, which
    indicates how good the clustering is, together with the number of clusters
    detected for each combination of parameters. Results are returned as an
    astropy table, ordered from best to worst score.
    """
    pmin, pmax = range_params

    # Make a grid of parameters
    grid_of_params = np.arange(pmin, pmax, step)

    results = Table(names=('mcs', 'ms', 'n_clusters', 'score'))

    for param_value in grid_of_params:
        copy_table = input_table.copy()
        data, clusterer = ctools.do_hdbscan(copy_table,
                                            space_param=space_param,
                                            cols=cols,
                                            min_cluster_size=int(param_value),
                                            min_samples=int(param_value),
                                            cluster_selection_method=cluster_selection_method)

        # The noise label (-1) counts as one group here, so the Silhouette
        # score is only computed when at least two distinct labels exist
        n_cluster = len(np.unique(clusterer.labels_))
        if n_cluster > 1:
            score = metrics.silhouette_score(data, clusterer.labels_, metric='euclidean')
            results.add_row([param_value, param_value, n_cluster, score])
        else:
            score = np.nan
        print(param_value, score)

    results.sort('score', reverse=True)
    return results
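
# Usage sketch: run the Kounkel-style scan on a region prepared the same way
# the routines above do it (names reuse the earlier sketches; the catalog dir
# is this module's default).
catalog_file = sc_tile.get_file(dirconfig.cross_vvv_2mass_combis_gaia)
region = setup_region(catalog_file, sc, times=2.0)

kounkel_scores = perform_kounkel_grid_score(region, range_params=(5, 16), step=1)
best = kounkel_scores[0]  # table is sorted from best to worst
print('best mcs=ms:', best['mcs'], 'n_clusters:', best['n_clusters'], 'score:', best['score'])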
def perform_grid_score(input_table, mcs_range=(5, 16), ms_range=(5, 11), step=1,
                       space_param='Phot+PM', cols=None,
                       cluster_selection_method='leaf', noise_cluster=True,
                       make_plots=False, out_dir='', memory=Memory(location=None)):
    """
    This function performs the clustering algorithm over a 'small' region of
    the data where we know beforehand that a cluster exists. It returns the
    Silhouette score, which indicates how good the clustering is, together
    with the number of clusters detected for each combination of parameters.
    Results are returned as an astropy table, ordered from best to worst
    score. cluster_selection_method is passed through to hdbscan. If
    noise_cluster is True, noise sources are excluded from the clustering
    quality scores.
    """
    mcs_min, mcs_max = mcs_range
    ms_min, ms_max = ms_range

    # Make a grid of parameters
    r_min_cluster_size = np.arange(mcs_min, mcs_max, step)
    r_min_samples = np.arange(ms_min, ms_max, step)
    grid_of_params = ((mcs, ms) for ms in r_min_samples for mcs in r_min_cluster_size)

    results = Table(names=('mcs', 'ms', 'n_clusters', 'score',
                           'min_score', 'max_score', 'ch_score'))

    for mcs, ms in grid_of_params:
        copy_table = input_table.copy()
        data, clusterer = ctools.do_hdbscan(copy_table,
                                            space_param=space_param,
                                            cols=cols,
                                            min_cluster_size=int(mcs),
                                            min_samples=int(ms),
                                            cluster_selection_method=cluster_selection_method,
                                            memory=memory)

        # Number of groups, counting the noise label (-1) as one group
        n_cluster = clusterer.labels_.max() + 2

        if noise_cluster:
            # Remove noise sources (label -1) before computing the scores
            not_noise = np.where(clusterer.labels_ != -1)
            data = data[not_noise]
            clusterer.labels_ = clusterer.labels_[not_noise]

        # silhouette_score requires at least two distinct labels
        if len(np.unique(clusterer.labels_)) > 1:
            # The Silhouette score is the mean of the per-source values,
            # so the samples are computed only once
            silhouette_values = metrics.silhouette_samples(data, clusterer.labels_,
                                                           metric='euclidean')
            score = np.mean(silhouette_values)
            min_silhouette_value = np.min(silhouette_values)
            max_silhouette_value = np.max(silhouette_values)
            ch_score = metrics.calinski_harabasz_score(data, clusterer.labels_)

            results.add_row([mcs, ms, n_cluster, score,
                             min_silhouette_value, max_silhouette_value, ch_score])

            copy_table.meta.update({'SCORE': score})
            copy_table.meta.update({'MAXSCORE': max_silhouette_value})
            copy_table.meta.update({'CH_SCORE': ch_score})

            if make_plots:
                cplots.plot_clustered_data(copy_table, out_dir)

    results.sort(['score'], reverse=True)
    return results
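
# Usage sketch: the memory parameter accepts a joblib Memory object, so
# repeated scans over the same region can reuse cached HDBSCAN computations.
# The cache location is an arbitrary example; `region` comes from the sketch
# above.
from joblib import Memory

cache = Memory(location='./hdbscan_cache', verbose=0)

grid_scores = perform_grid_score(region,
                                 mcs_range=(5, 20), ms_range=(5, 10),
                                 space_param='Phot+PM',
                                 noise_cluster=False,
                                 memory=cache)
print(grid_scores[:5])  # five best (mcs, ms) combinations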
def tile_routine(tile_file, output_dir, space_param='Mini-alternative'):
    """
    This function implements the routine to be used on individual tile files:
    read the file, add the pseudo-color column, scan the hyper-parameter grid
    and run HDBSCAN with the best-scoring values.

    Parameters
    ----------
    tile_file : str. Path to the tile file
    output_dir : str. Path to the output dir
    space_param : str. Parameter space to cluster in

    Returns
    -------
    bool. True if the tile was processed; False if it was skipped or no
    clusters were found.
    """
    table = read_fits_table(tile_file)
    tile_name = path.splitext(path.basename(tile_file))[0]

    # Check if output files already exist; if so, skip this tile
    expected_filename = path.join(output_dir, tile_name)
    if glob(expected_filename + '*'):
        print(f'Tile {tile_name} already processed. Skipping...')
        return False

    table.meta.update({'FILE': path.basename(tile_file)})
    table.meta.update({'TILENAME': tile_name})
    print('Processing', tile_name)
    add_pseudocolor(table, color_excess=1.8)
    scores = perform_grid_score(table,
                                mcs_range=(5, 50),
                                ms_range=(5, 50),
                                space_param=space_param,
                                cols=None,
                                cluster_selection_method='leaf',
                                noise_cluster=False,
                                make_plots=False,
                                out_dir=output_dir)

    # If the clustering was not successful, return False
    if len(scores) == 0:
        print('-' * 20)
        print('No clusters found in tile: ', tile_name)
        print('-' * 20)
        return False

    score_filepath = path.join(output_dir, 'scores_' + tile_name + '.ecsv')
    scores.write(score_filepath, format='ascii.ecsv')

    summarized_scores = summarize_score(scores)
    score_filepath = path.join(output_dir, 'summary_' + tile_name + '.ecsv')
    summarized_scores.write(score_filepath, format='ascii.ecsv')

    best_mcs = summarized_scores['mcs_start'][0]
    best_ms = summarized_scores['ms'][0]

    ctools.do_hdbscan(table, space_param=space_param,
                      cols=None,
                      min_cluster_size=int(best_mcs),
                      min_samples=int(best_ms),
                      cluster_selection_method='leaf')

    cplots.plot_clustered_data(table, output_dir, summarized_scores)
    return True
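
# Usage sketch: because tile_routine skips tiles whose outputs already exist,
# it lends itself to a resumable batch loop. The '*.fits' pattern and the
# directories (this module's defaults) are illustrative.
from glob import glob
from os import path

tile_files = sorted(glob(path.join(dirconfig.cross_vvv_2mass_combis_gaia, '*.fits')))
for tf in tile_files:
    tile_routine(tf, dirconfig.test_knowncl, space_param='Mini-alternative')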
from apolo.clustering import cplots, ctools
from apolo.data import dirconfig, objects

# utils provides which_tile and setup_region; the module path is assumed here
from apolo.test_tools import utils

# Define the output dir
output_dir = dirconfig.test_knowncl

# Select a stellar cluster and its respective tile
stellar_cluster = objects.cl74
tile = utils.which_tile(stellar_cluster, objects.all_tiles)[0]

# Define the catalog and the region (in terms of l and b) to be explored
catalog_dir = dirconfig.cross_vvv_2mass_combis_gaia
catalog_file = tile.get_file(catalog_dir)  # Automatically finds the respective tile file inside catalog_dir
region = utils.setup_region(catalog_file, stellar_cluster, times=2.0)  # Only a region of 2 times the nominal SC radius

# Perform the HDBSCAN clustering. This function updates the region table,
# adding two columns (label and probabilities), and adds metadata about the
# clustering itself.
data, clusterer = ctools.do_hdbscan(region,
                                    space_param='Phot+PM',  # or 'Colors+PM'
                                    min_cluster_size=5,
                                    min_samples=13,
                                    cluster_selection_method='leaf')

# This function produces multiple plots to help visualize the data and saves
# the clustering results in a fits file.
cplots.plot_clustered_data(region, output_dir)
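
# Usage sketch: since do_hdbscan stores its output in the region table itself,
# cluster members can be selected directly from the added columns (the names
# follow the comment above; the probability cut is illustrative).
import numpy as np

members = region[(region['label'] == 0) & (region['probabilities'] > 0.5)]
print(len(members), 'likely members out of', len(region), 'sources')
print('labels found:', np.unique(region['label']))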
"""
Simple script to perform a clustering over an entire tile.
"""

from datetime import datetime

from apolo.catalog_proc.utils import make_dir
from apolo.clustering import cplots, ctools
from apolo.data import dirconfig, objects

# utils provides read_catalog and add_pseudocolor; the module path is assumed
from apolo.catalog_proc import utils

# Select tile
tile = objects.t070
print('Tile:', tile)

# Select the set of catalogs to be used
catalog_dir = dirconfig.cross_vvv_2mass_combis_gaia
catalog_file = tile.get_file(catalog_dir)

# Read the catalog and add the pseudo-color column
table = utils.read_catalog(catalog_file)
utils.add_pseudocolor(table, color_excess=1.8)

# Perform hdbscan with the selected parameters, timing the run
start_time = datetime.now()
ctools.do_hdbscan(table, space_param='Phot+PM',
                  min_cluster_size=12,
                  min_samples=5,
                  cluster_selection_method='leaf')
print(datetime.now() - start_time)

# Save the results in the selected directory
make_dir(dirconfig.test_tiles)
cplots.plot_clustered_data(table, dirconfig.test_tiles)
from os import path

import numpy as np
from sklearn import metrics

from apolo.catalog_proc.utils import make_dir
from apolo.clustering import cplots, ctools
from apolo.data import dirconfig, objects
from apolo.test_tools.utils import setup_region, which_tile  # assumed module path

# Input and output dirs, taken from this package's defaults
data_dir = dirconfig.cross_vvv_2mass_combis_gaia
out_dir = dirconfig.test_knowncl

make_dir(out_dir)

far_end_cluster = objects.cl86
tile = which_tile(far_end_cluster, objects.all_tiles)[0]

# Alternatives: 'Colors+PM', 'Mini-alternative', 'Mini'
space_param = 'Mini-alternative'
mcs = 5
ms = 20

print(far_end_cluster, tile)
catalog_file = tile.get_file(data_dir)
tile_region = setup_region(catalog_file, far_end_cluster, times=2.0)
data, clusterer = ctools.do_hdbscan(tile_region, space_param=space_param,
                                    cols=None,
                                    min_cluster_size=mcs,
                                    min_samples=ms,
                                    cluster_selection_method='leaf')

print('unique labels:', np.unique(clusterer.labels_))
cplots.plot_clustered_data(tile_region, out_dir)

# Compute the silhouette width (SW) for every source
sw_i = metrics.silhouette_samples(data, clusterer.labels_, metric='euclidean')

# Compute the mean SW for one group (the per-source values above were
# computed with the noise sources included)
grupo = 0
match_label_grupo = np.where(clusterer.labels_ == grupo)
print('group:', grupo, ' sw:', np.mean(sw_i[match_label_grupo]))
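
# Usage sketch extending the script above: the same per-source silhouette
# values can be averaged per label to compare all detected groups at once,
# with the noise group (-1) shown for reference.
for label in np.unique(clusterer.labels_):
    mask = clusterer.labels_ == label
    print(f'group {label}: n={mask.sum()}, mean sw={np.mean(sw_i[mask]):.3f}')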