Beispiel #1
0
def _save_metrics_and_queried_sample(
        database_class: DataBase,
        current_loop: int, output_metric_file_name: str,
        output_queried_file_name: str, batch: int, epoch: int,
        is_save_full_query: bool):
    """
    Save metrics and queried sample data for one learning loop.

    Parameters
    ----------
    database_class
        An instance of DataBase class
    current_loop
        Number of learning loops finished at this stage.
    output_metric_file_name
        Full path to file to store metrics results.
    output_queried_file_name
        Full path to file to store the queried sample. When
        `is_save_full_query` is true, the file extension is dropped and
        '_<current_loop>.dat' is appended, so every loop writes to its
        own file.
    batch
        Number of queries in each loop.
    epoch
        Days since the beginning of the survey.
    is_save_full_query
        If true, write down a complete queried sample stored in
        property 'queried_sample'. Otherwise append 1 line per loop to
        'queried_sample_file'.
    """
    # Local import keeps this block self-contained regardless of the
    # module-level import list.
    import os

    database_class.save_metrics(
        loop=current_loop, output_metrics_file=output_metric_file_name,
        batch=batch, epoch=epoch)
    if is_save_full_query:
        # splitext removes whatever extension is present (or none at all),
        # instead of blindly cutting the last four characters.
        root, _ = os.path.splitext(output_queried_file_name)
        output_queried_file_name = root + '_' + str(current_loop) + '.dat'
    database_class.save_queried_sample(
        output_queried_file_name, loop=current_loop,
        full_sample=is_save_full_query, epoch=epoch)
Beispiel #2
0
def _save_metrics_and_queried_samples(database_class: DataBase,
                                      metrics_file_name: str,
                                      queried_file_name: str,
                                      iteration_step: int,
                                      batch: int,
                                      full_sample: bool,
                                      file_name_suffix: str = None):
    """
    Save metrics and queried samples details

    Parameters
    ----------
    database_class
        An instance of DataBase class
    metrics_file_name
        Full path to file to store metrics results.
    queried_file_name
        Complete path to output file.
    iteration_step
        active learning iteration number. It is passed as both `loop`
        and `epoch` to the DataBase save methods.
    batch
        Number of queries in each loop.
    full_sample
        If true, write down a complete queried sample stored in
        property 'queried_sample'. Otherwise append 1 line per loop to
        'queried_sample_file'.
    file_name_suffix
        suffix string for save file name with file extension. When given,
        it replaces a trailing '.dat' in both output file names; names
        that do not end in '.dat' are left unchanged. Default is None
        (file names are used as given).
    """
    dat_extension = '.dat'

    def _swap_extension(file_name: str) -> str:
        # Only the trailing extension is swapped: a plain str.replace would
        # also rewrite '.dat' inside directory names or inside '.data'.
        if file_name.endswith(dat_extension):
            return file_name[:-len(dat_extension)] + file_name_suffix
        return file_name

    if file_name_suffix is not None:
        metrics_file_name = _swap_extension(metrics_file_name)
        queried_file_name = _swap_extension(queried_file_name)
    database_class.save_metrics(loop=iteration_step,
                                output_metrics_file=metrics_file_name,
                                batch=batch,
                                epoch=iteration_step)
    database_class.save_queried_sample(queried_file_name,
                                       loop=iteration_step,
                                       full_sample=full_sample,
                                       epoch=iteration_step,
                                       batch=batch)
Beispiel #3
0
def learn_loop(nloops: int, strategy: str, path_to_features: str,
               output_diag_file: str, output_queried_file: str,
               features_method='Bazin', classifier='RandomForest',
               training='original', batch=1, screen=True, survey='DES',
               perc=0.1, nclass=2):
    """Run the active learning loop and write every result to file.

    Parameters
    ----------
    nloops: int
        Number of active learning loops to run.
    strategy: str
        Query strategy. Options are 'UncSampling' and 'RandomSampling'.
    path_to_features: str or dict
        If str, complete path to a single input features file.
        If dict, it must hold the keys 'train' and 'test', whose values
        are the paths to the separate train and test sample files.
    output_diag_file: str
        Full path to output file to store diagnostics of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently only 'RandomForest' is implemented.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia
        Default is 'original'.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    screen: bool (optional)
        If True, print on screen number of light curves processed.
    survey: str (optional)
        'DES' or 'LSST'. Default is 'DES'.
        Name of the survey which characterizes filter set.
    perc: float in [0,1] (optional)
        Percentile chosen to identify the new query.
        Only used for PercentileSampling.
        Default is 0.1.
    nclass: int (optional)
        Number of classes to consider in the classification
        Currently only nclass == 2 is implemented.
    """

    ## This module will need to be expanded for RESSPECT

    database = DataBase()

    if not isinstance(path_to_features, str):
        # train and test features live in separate files
        for sample_name in ('train', 'test'):
            database.load_features(path_to_features[sample_name],
                                   method=features_method, screen=screen,
                                   survey=survey, sample=sample_name)

        database.build_samples(initial_training=training, nclass=nclass,
                               screen=screen, sep_files=True)
    else:
        # a single file holds everything; split it into train and test
        database.load_features(path_to_features, method=features_method,
                               screen=screen, survey=survey)
        database.build_samples(initial_training=training, nclass=nclass)

    for iteration in range(nloops):

        if screen:
            print('Processing... ', iteration)

        # train the classifier and score it on the current samples
        database.classify(method=classifier)
        database.evaluate_classification()

        # pick the next objects to query and move them between samples
        queried_indices = database.make_query(strategy=strategy, batch=batch,
                                              perc=perc)
        database.update_samples(queried_indices, loop=iteration)

        # persist diagnostics, then the queried objects, for this iteration
        database.save_metrics(loop=iteration,
                              output_metrics_file=output_diag_file,
                              batch=batch, epoch=iteration)
        database.save_queried_sample(output_queried_file, loop=iteration,
                                     full_sample=False)
Beispiel #4
0
def time_domain_loop(days: list,  output_diag_file: str,
                     output_queried_file: str,
                     path_to_features_dir: str, strategy: str,
                     batch=1, canonical = False,  classifier='RandomForest',
                     features_method='Bazin', path_to_canonical="",
                     path_to_full_lc_features="",
                     screen=True, training='original', queryable=False):
    """Perform the time domain active learning loop.

    All results are saved to file.

    Parameters
    ----------
    days: list
        List of 2 elements. First and last day of observations since the
        beginning of the survey.
    output_diag_file: str
        Full path to output file to store diagnostics of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    path_to_features_dir: str
        Complete path to directory holding features files for all days.
        The file names are built by plain concatenation
        (`path_to_features_dir + 'day_<n>.dat'`), so the path must end
        with a path separator.
    strategy: str
        Query strategy. Options are 'UncSampling' and 'RandomSampling'.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    canonical: bool (optional)
        If True, restrict the search to the canonical sample.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently only 'RandomForest' is implemented.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    path_to_canonical: str (optional)
        Path to canonical sample features files.
        It is only used if `canonical` is True.
    path_to_full_lc_features: str (optional)
        Path to full light curve features file.
        Only used if training == 'original'.
    screen: bool (optional)
        If True, print on screen number of light curves processed.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia
        Default is 'original'.
    queryable: bool (optional)
        If True, after each night only objects outside the training
        sample whose 'queryable' metadata flag is set can be queried.
        Otherwise every object outside the training sample can be
        queried. Default is False.

    """

    ## This will need to change for RESSPECT

    # initiate object
    data = DataBase()

    first_day = int(days[0])
    last_day = int(days[-1])

    # load features for the first day
    path_to_features = path_to_features_dir + 'day_' + str(first_day) + '.dat'
    data.load_features(path_to_features, method=features_method,
                       screen=screen)

    # change training
    if training == 'original':
        data.build_samples(initial_training='original')
        # the training sample is replaced by the one coming from the
        # full light curve features file
        full_lc_features = get_original_training(
            path_to_features=path_to_full_lc_features)
        data.train_metadata = full_lc_features.train_metadata
        data.train_labels = full_lc_features.train_labels
        data.train_features = full_lc_features.train_features

    else:
        data.build_samples(initial_training=int(training))

    # get list of canonical ids; kept in its own variable so the boolean
    # `canonical` argument is not overwritten by a DataBase instance
    canonical_data = None
    if canonical:
        canonical_data = DataBase()
        canonical_data.load_features(path_to_file=path_to_canonical)
        data.queryable_ids = canonical_data.queryable_ids

    # NOTE(review): the last night processed is days[-1] - 2, so the
    # features file for days[-1] itself is never loaded -- confirm intended.
    for night in range(first_day, last_day - 1):

        if screen:
            print('Processing night: ', night)

        # cont loop
        loop = night - first_day

        # classify
        data.classify(method=classifier)

        # calculate metrics
        data.evaluate_classification()

        # choose object to query
        indx = data.make_query(strategy=strategy, batch=batch)

        # update training and test samples
        data.update_samples(indx, loop=loop)

        # save diagnostics for current state
        data.save_metrics(loop=loop, output_metrics_file=output_diag_file,
                          batch=batch, epoch=night)

        # save query sample to file
        data.save_queried_sample(output_queried_file, loop=loop,
                                 full_sample=False)

        # load features for next day
        path_to_features2 = (path_to_features_dir + 'day_' +
                             str(night + 1) + '.dat')

        data_tomorrow = DataBase()
        data_tomorrow.load_features(path_to_features2, method=features_method,
                                    screen=False)

        # identify objects in the new day which must be in training
        # (vectorized membership test instead of one array scan per object)
        train_flag = np.isin(data_tomorrow.metadata['id'].values,
                             data.train_metadata['id'].values)

        # use new data
        data.train_metadata = data_tomorrow.metadata[train_flag]
        data.train_features = data_tomorrow.features.values[train_flag]
        data.test_metadata = data_tomorrow.metadata[~train_flag]
        data.test_features = data_tomorrow.features.values[~train_flag]

        # new labels: 1 for type 'Ia', 0 for anything else
        data.train_labels = np.array([int(item == 'Ia') for item in
                                      data.train_metadata['type'].values])
        data.test_labels = np.array([int(item == 'Ia') for item in
                                     data.test_metadata['type'].values])

        # NOTE(review): this assignment is overwritten by the `queryable`
        # branch right below -- confirm which one should take precedence.
        if strategy == 'canonical' and canonical_data is not None:
            data.queryable_ids = canonical_data.queryable_ids

        if queryable:
            queryable_flag = data_tomorrow.metadata['queryable'].values
            queryable_test_flag = np.logical_and(~train_flag, queryable_flag)
            data.queryable_ids = \
                data_tomorrow.metadata['id'].values[queryable_test_flag]
        else:
            data.queryable_ids = data_tomorrow.metadata['id'].values[~train_flag]

        if screen:
            print('Training set size: ', data.train_metadata.shape[0])
            print('Test set size: ', data.test_metadata.shape[0])