Example #1
def _update_training_data_with_new_features(
        light_curve_data: DataBase, next_day_data: DataBase, metadata_value: int,
        id_key_name: str) -> DataBase:
    """
    Updates the training sample with features of the object matching
    the given metadata value

    Parameters
    ----------
    light_curve_data
        light curve data
    next_day_data
        next day light curve data
    metadata_value
        metadata object value
    id_key_name
        object identification key name
    """
    next_day_pool_data_flag = (
            next_day_data.pool_metadata[id_key_name].values == metadata_value)
    light_curve_data.train_metadata = pd.concat(
        [light_curve_data.train_metadata,
         next_day_data.pool_metadata[next_day_pool_data_flag]],
        axis=0, ignore_index=True)
    light_curve_data.train_features = np.append(
        light_curve_data.train_features,
        next_day_data.pool_features[next_day_pool_data_flag], axis=0)
    light_curve_data.train_labels = np.append(
        light_curve_data.train_labels,
        next_day_data.pool_labels[next_day_pool_data_flag], axis=0)
    return light_curve_data
Example #2
def _save_metrics_and_queried_sample(
        database_class: DataBase,
        current_loop: int, output_metric_file_name: str,
        output_queried_file_name: str, batch: int, epoch: int,
        is_save_full_query: bool):
    """
    Saves metrics and queried sample data

    Parameters
    ----------
    database_class
        An instance of DataBase class
    current_loop
        Number of learning loops finished at this stage.
    output_metric_file_name
        Full path to file to store metrics results.
    output_queried_file_name
        Complete path to output file to store the queried sample.
    batch
        Number of queries in each loop.
    epoch
        Days since the beginning of the survey.
    is_save_full_query
        If true, write down a complete queried sample stored in
        property 'queried_sample'. Otherwise append 1 line per loop to
        'queried_sample_file'. Default is False.
    """
    database_class.save_metrics(
        loop=current_loop, output_metrics_file=output_metric_file_name,
        batch=batch, epoch=epoch)
    if is_save_full_query:
        output_queried_file_name = (output_queried_file_name[:-4] +
                                    '_' + str(current_loop) + '.dat')
    database_class.save_queried_sample(
        output_queried_file_name, loop=current_loop,
        full_sample=is_save_full_query, epoch=epoch)
Example #3
def test_load_bazin_features():
    """Test loading Bazin features."""
    
    # test full light curve case
    fname1 = testing.download_data("tests/Bazin_SNPCC1.dat")
    
    data1 = DataBase()
    data1.load_bazin_features(path_to_bazin_file=fname1,
                              screen=True, survey='DES', sample=None)
    
    # read data independently
    data_temp1 = pd.read_csv(fname1, sep=" ")        
    
    sizes1 = len(data_temp1.keys()) == \
            len(data1.features_names) - 1 + len(data1.metadata_names)
    
    queryable1 = 'queryable' in data1.metadata_names
    
    # test time domain case
    fname2 = testing.download_data('tests/day_20.dat')
    
    data2 = DataBase()
    data2.load_bazin_features(path_to_bazin_file=fname2, screen=True,
                              survey='DES', sample=None)
                              
    data_temp2 = pd.read_csv(fname2)
    
    sizes2 = len(data_temp2.keys()) == len(data2.features_names) + \
                                len(data2.metadata_names)
                                
    queryable2 = 'queryable' in data2.metadata_names
    
    assert (sizes1 and queryable1)
    assert (sizes2 and queryable2)
Example #4
def run_evaluation(database_class: DataBase, metric_label: str):
    """
    Evaluates the active learning model

    Parameters
    ----------
    database_class
        An instance of DataBase class
    metric_label
        Choice of metric.
        Currently only "snpcc", "cosmo" or "snpcc_cosmo" are accepted.
        Default is "snpcc".

    """
    database_class.evaluate_classification(metric_label=metric_label)
Example #5
def run_make_query(database_class: DataBase, strategy: str, batch_size: int,
                   is_queryable: bool):
    """
    Run active learning query process

    Parameters
    ----------
    database_class
        An instance of DataBase class
    strategy
        Strategy used to choose the most informative object.
        Current implementation accepts 'UncSampling', 'RandomSampling',
        'UncSamplingEntropy', 'UncSamplingLeastConfident',
        'UncSamplingMargin', 'QBDMI' and 'QBDEntropy'.
        Default is 'UncSampling'.
    batch_size
        Number of objects to be chosen in each batch query.
        Default is 1
    is_queryable
        If True, consider only queryable objects.
        Default is False.
    """
    return database_class.make_query(strategy=strategy,
                                     batch=batch_size,
                                     queryable=is_queryable)
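A minimal usage sketch, assuming `data` is a DataBase instance whose pool sample has already been classified, so the strategy has probabilities to rank:

# hypothetical: query the single most informative object
indices_to_query = run_make_query(data, strategy='UncSampling',
                                  batch_size=1, is_queryable=False)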
Example #6
def run_classification(database_class: DataBase, classifier: str,
                       is_classifier_bootstrap: bool, prediction_dir: str,
                       is_save_prediction: bool, iteration_step: int,
                       **kwargs: dict) -> DataBase:
    """
    Run active learning classification model

    Parameters
    ----------
    database_class
        An instance of DataBase class
    classifier
        Machine Learning algorithm.
        Currently implemented options are 'RandomForest', 'GradientBoostedTrees',
        'KNNclassifier', 'MLPclassifier', 'SVMclassifier' and 'NBclassifier'.
        Default is 'RandomForest'.
    is_classifier_bootstrap
        If True, apply the machine learning classifier with bootstrapping.
    prediction_dir
       Output directory to store prediction file for each loop.
       Only used if `save_predictions == True`.
    is_save_prediction
        If True, save the classification predictions of each loop to file.
    iteration_step
        active learning iteration number
    kwargs
       All keywords required by the classifier function.
    Returns
    -------
    DataBase
        The updated DataBase instance after classification.
    """
    if is_classifier_bootstrap:
        database_class.classify_bootstrap(method=classifier,
                                          loop=iteration_step,
                                          pred_dir=prediction_dir,
                                          save_predictions=is_save_prediction,
                                          **kwargs)
    else:
        database_class.classify(method=classifier,
                                pred_dir=prediction_dir,
                                loop=iteration_step,
                                save_predictions=is_save_prediction,
                                **kwargs)
    return database_class
Example #7
def _update_next_day_pool_data(next_day_data: DataBase,
                               next_day_pool_metadata_indices) -> DataBase:
    """
    Removes the objects at the given indices from the next day pool sample

    Parameters
    ----------
    next_day_data
        next day light curve data
    next_day_pool_metadata_indices
        indices of metadata value in next day light curve data
    """
    # remove obj from pool sample
    next_day_data.pool_metadata = next_day_data.pool_metadata.drop(
        next_day_data.pool_metadata.index[next_day_pool_metadata_indices])
    next_day_data.pool_labels = np.delete(
        next_day_data.pool_labels, next_day_pool_metadata_indices, axis=0)
    next_day_data.pool_features = np.delete(
        next_day_data.pool_features, next_day_pool_metadata_indices, axis=0)
    return next_day_data
Example #8
def _update_samples_with_object_indices(
        database_class: DataBase, object_indices: list,
        is_queryable: bool, epoch: int) -> DataBase:
    """
    Runs database class update_samples methods with object indices

    Parameters
    ----------
    database_class
        An instance of DataBase class
    object_indices
        List of indexes identifying objects to be moved.
    is_queryable
        If True, consider queryable flag. Default is False.
    epoch
        Day since beginning of survey. Default is 20.
    """
    database_class.update_samples(
        object_indices, queryable=is_queryable, epoch=epoch)
    return database_class
Example #9
def _save_metrics_and_queried_samples(database_class: DataBase,
                                      metrics_file_name: str,
                                      queried_file_name: str,
                                      iteration_step: int,
                                      batch: int,
                                      full_sample: bool,
                                      file_name_suffix: str = None):
    """
    Save metrics and queried samples details

    Parameters
    ----------
    database_class
        An instance of DataBase class
    metrics_file_name
        Full path to file to store metrics results.
    queried_file_name
        Complete path to output file.
    iteration_step
        active learning iteration number
    batch
        Number of queries in each loop.
    full_sample
        If true, write down a complete queried sample stored in
        property 'queried_sample'. Otherwise append 1 line per loop to
        'queried_sample_file'. Default is False.
    file_name_suffix
        Suffix for the output file names; it replaces the '.dat'
        extension, so it must include the desired file extension.
    """
    if file_name_suffix is not None:
        metrics_file_name = metrics_file_name.replace('.dat', file_name_suffix)
        queried_file_name = queried_file_name.replace('.dat', file_name_suffix)
    database_class.save_metrics(loop=iteration_step,
                                output_metrics_file=metrics_file_name,
                                batch=batch,
                                epoch=iteration_step)
    database_class.save_queried_sample(queried_file_name,
                                       loop=iteration_step,
                                       full_sample=full_sample,
                                       epoch=iteration_step,
                                       batch=batch)
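Note that file_name_suffix replaces the literal '.dat' extension instead of being appended to it, so it must carry its own extension. For instance, with the '_alt_label.dat' suffix used for alternative-label runs:

# 'metrics.dat'  -> 'metrics_alt_label.dat'
# 'queried.dat'  -> 'queried_alt_label.dat'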
Example #10
def save_photo_ids(database_class: DataBase,
                   is_save_photoids_to_file: bool,
                   is_save_snana_types: bool,
                   metadata_fname: str,
                   photo_class_threshold: float,
                   iteration_step: int,
                   file_name_prefix: str = None,
                   file_name_suffix: str = None):
    """
    Function to save photo IDs to a file

    Parameters
    ----------
    database_class
        An instance of DataBase class
    is_save_photoids_to_file
        If True, write the photo Ia sample to file. Otherwise,
        populate the photo_Ia_list attribute. Default is False.
    is_save_snana_types
        if True, translate type to SNANA codes and
        add column with original values. Default is False.
    metadata_fname
        Full path to PLAsTiCC zenodo test metadata file.
    photo_class_threshold
         Probability threshold above which an object is considered Ia.
    iteration_step
        active learning iteration number
    file_name_prefix
        Prefix for the output file name.
    file_name_suffix
        Suffix for the output file name; must include the file extension.
    """
    if is_save_photoids_to_file or is_save_snana_types:
        file_name = file_name_prefix + '_' + str(
            iteration_step) + file_name_suffix
        database_class.output_photo_Ia(photo_class_threshold,
                                       to_file=is_save_photoids_to_file,
                                       filename=file_name,
                                       SNANA_types=is_save_snana_types,
                                       metadata_fname=metadata_fname)
Example #11
def _update_data_by_remove_repeated_ids(first_loop_data: DataBase,
                                        light_curve_data: DataBase,
                                        id_key_name: str,
                                        pool_labels_class: str = 'Ia') -> Tuple[
        DataBase, DataBase]:
    """
    Updates first loop and initial data by removing objects whose ids
    already appear in the training sample

    Parameters
    ----------
    first_loop_data
        first loop light curve data
    light_curve_data
        initial light curve training data
    id_key_name
        object identification key name
    pool_labels_class
        pool labels class name
    """
    repeated_id_flags = np.in1d(
        first_loop_data.pool_metadata[id_key_name].values,
        light_curve_data.train_metadata[id_key_name].values)
    first_loop_data.pool_metadata = first_loop_data.pool_metadata[
        ~repeated_id_flags]
    first_loop_data.pool_features = first_loop_data.pool_features[
        ~repeated_id_flags]
    pool_labels = (
            first_loop_data.pool_metadata['type'].values == pool_labels_class)
    first_loop_data.pool_labels = pool_labels.astype(int)
    light_curve_data.pool_features = first_loop_data.pool_features
    light_curve_data.pool_metadata = first_loop_data.pool_metadata
    light_curve_data.pool_labels = first_loop_data.pool_labels
    return first_loop_data, light_curve_data
Example #12
def _run_classification_and_evaluation(
        database_class: DataBase, classifier: str,
        is_classifier_bootstrap: bool, **kwargs: dict) -> DataBase:
    """
    Runs active learning classification and evaluation methods

    Parameters
    ----------
    database_class
        An instance of DataBase class
    classifier
        Machine Learning algorithm.
        Currently 'RandomForest', 'GradientBoostedTrees',
        'KNN', 'MLP', 'SVM' and 'NB' are implemented.
        Default is 'RandomForest'.
    is_classifier_bootstrap
        If True, apply the machine learning classifier with bootstrapping.
    kwargs
       All keywords required by the classifier function.
    """
    if is_classifier_bootstrap:
        database_class.classify_bootstrap(method=classifier, **kwargs)
    else:
        database_class.classify(method=classifier, **kwargs)
    database_class.evaluate_classification()
    return database_class
Example #13
def _update_canonical_ids(light_curve_data: DataBase,
                          canonical_file_name: str,
                          is_restrict_canonical: bool) -> Tuple[
        DataBase, DataBase]:
    """
    Updates canonical ids

    Parameters
    ----------
    light_curve_data
        initial light curve training data
    canonical_file_name
        Path to canonical sample features files.
        It is only used if "strategy==canonical".
    is_restrict_canonical
        If True, restrict the search to the canonical sample.
    """
    database_class = None
    if is_restrict_canonical:
        database_class = DataBase()
        database_class.load_features(path_to_file=canonical_file_name)
        light_curve_data.queryable_ids = database_class.queryable_ids
    return light_curve_data, database_class
Example #14
def _update_queryable_ids(light_curve_data: DataBase, id_key_name: str,
                          is_queryable: bool) -> DataBase:
    """
    Updates queryable ids

    Parameters
    ----------
    light_curve_data
        initial light curve training data
    id_key_name
        object identification key name
    is_queryable
        If True, allow queries only on objects flagged as queryable.
        Default is True.
    """
    if is_queryable:
        queryable_flags = light_curve_data.pool_metadata['queryable'].values
        light_curve_data.queryable_ids = light_curve_data.pool_metadata[
            id_key_name].values[queryable_flags]
    else:
        light_curve_data.queryable_ids = light_curve_data.pool_metadata[
            id_key_name].values
    return light_curve_data
Example #15
def _get_indices_of_objects_to_be_queried(
        database_class: DataBase, strategy: str, budgets: tuple,
        is_queryable: bool, query_threshold: float, batch: int) -> list:
    """
    Finds indices of objects to be queried

    Parameters
    ----------
    database_class
        An instance of DataBase class
    strategy
        Query strategy. Options are (all can be run with budget):
        "UncSampling",
        "UncSamplingEntropy",
        "UncSamplingLeastConfident",
        "UncSamplingMargin",
        "QBDMI",
        "QBDEntropy",
        "RandomSampling",
    budgets
        Budgets for each of the telescopes
    is_queryable
        If True, allow queries only on objects flagged as queryable.
        Default is True.
    query_threshold
        Percentile threshold for query. Default is 1.
    batch
        Size of batch to be queried in each loop. Default is 1.
    """
    if budgets:
        object_indices = database_class.make_query_budget(
            budgets=budgets, strategy=strategy)
    else:
        object_indices = database_class.make_query(
            strategy=strategy, batch=batch, queryable=is_queryable,
            query_thre=query_threshold)
    return list(object_indices)
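A minimal usage sketch (hypothetical: `data` is a DataBase instance with classification probabilities already computed); an empty budgets argument falls through to the regular batch query:

# batch query of 5 objects, no telescope budgets
object_ids = _get_indices_of_objects_to_be_queried(
    data, strategy='UncSamplingEntropy', budgets=None,
    is_queryable=True, query_threshold=1.0, batch=5)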
Example #16
def _remove_old_training_features(
        light_curve_data: DataBase, light_curve_metadata: np.ndarray,
        metadata_value: int) -> DataBase:
    """
    Removes old training features

    Parameters
    ----------
    light_curve_data
        light curve training data
    light_curve_metadata
        light curve meta data
    metadata_value
        metadata object value
    """
    current_day_object_index = list(light_curve_metadata).index(
        metadata_value)
    light_curve_data.train_metadata = light_curve_data.train_metadata.drop(
        light_curve_data.train_metadata.index[current_day_object_index])
    light_curve_data.train_labels = np.delete(
        light_curve_data.train_labels, current_day_object_index, axis=0)
    light_curve_data.train_features = np.delete(
        light_curve_data.train_features, current_day_object_index, axis=0)
    return light_curve_data
Example #17
def _update_light_curve_data_val_and_test_data(
        light_curve_data: DataBase, first_loop_data: DataBase,
        is_separate_files: bool = False,
        initial_training: Union[str, int] = 'original',
        is_queryable: bool = False, number_of_classes: int = 2) -> DataBase:
    """
    Updates initial light curve validation and test data

    Parameters
    ----------
    light_curve_data
        initial light curve training data
    first_loop_data
        first loop light curve data
    is_separate_files
        If True, consider samples separately read
        from independent files. Default is False.
    initial_training
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file.
        If 'previous': read training and queried samples from a previous run.
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia.
        Default is 'original'.
    is_queryable
        If True, allow queries only on objects flagged as queryable.
        Default is False.
    number_of_classes
        Number of classes to consider in the classification
        Currently only number_of_classes == 2 is implemented.
    """
    if is_separate_files:
        light_curve_data.build_samples(
            nclass=number_of_classes, queryable=is_queryable,
            sep_files=is_separate_files, initial_training=initial_training)
    else:
        light_curve_data.test_features = first_loop_data.pool_features
        light_curve_data.test_metadata = first_loop_data.pool_metadata
        light_curve_data.test_labels = first_loop_data.pool_labels

        light_curve_data.validation_features = first_loop_data.pool_features
        light_curve_data.validation_metadata = first_loop_data.pool_metadata
        light_curve_data.validation_labels = first_loop_data.pool_labels
    return light_curve_data
Example #18
def _update_queried_sample(light_curve_data: DataBase, next_day_data: DataBase,
                           id_key_name: str, metadata_value: int) -> DataBase:
    """
    Updates queried sample in light curve data

    Parameters
    ----------
    light_curve_data
        light curve data
    next_day_data
        next day light curve data
    id_key_name
        object identification key name
    metadata_value
        metadata object value
    """
    # build query data frame
    full_header_name = (['epoch'] + light_curve_data.metadata_names
                        + light_curve_data.features_names)
    queried_sample = pd.DataFrame(light_curve_data.queried_sample,
                                  columns=full_header_name)
    # get object index in the queried sample
    queried_index = list(
        queried_sample[id_key_name].values).index(metadata_value)
    # get flag to isolate object in question
    queried_values_flag = queried_sample[id_key_name].values == metadata_value
    # get object epoch in the queried sample
    metadata_value_epoch = queried_sample['epoch'].values[queried_values_flag]
    # remove old features from queried
    queried_sample = queried_sample.drop(queried_sample.index[queried_index])
    next_day_pool_data_flag = (
            next_day_data.pool_metadata[id_key_name].values == metadata_value)
    new_query_pool_metadata = list(next_day_data.pool_metadata[
                                       next_day_pool_data_flag].values[0])
    new_query_pool_features = list(next_day_data.pool_features[
                                       next_day_pool_data_flag][0])
    new_query = ([metadata_value_epoch[0]] + new_query_pool_metadata +
                 new_query_pool_features)
    new_query = pd.DataFrame([new_query], columns=full_header_name)
    queried_sample = pd.concat([queried_sample, new_query], axis=0,
                               ignore_index=True)
    # update queried sample
    light_curve_data.queried_sample = list(queried_sample.values)
    return light_curve_data
Example #19
def _update_initial_train_meta_data_header(
        first_loop_data: DataBase, light_curve_data: DataBase) -> DataBase:
    """
    Adds metadata columns present in the first loop data but missing
    from the training data

    Parameters
    ----------
    first_loop_data
        first loop light curve data
    light_curve_data
        light curve learning data

    """
    for each_name in first_loop_data.metadata_names:
        if each_name not in light_curve_data.metadata_names:
            light_curve_data.metadata_names.append(each_name)
            light_curve_data.metadata[each_name] = None
            light_curve_data.train_metadata.insert(
                len(light_curve_data.metadata_names) - 1, each_name, None, True)
    return light_curve_data
Example #20
def _update_next_day_val_and_test_data(
        next_day_data: DataBase, metadata_value: int,
        id_key_name: str) -> DataBase:
    """
    Removes the object matching the given metadata value from the next
    day validation and test samples

    Parameters
    ----------
    next_day_data
        next day light curve data
    metadata_value
        metadata object value
    id_key_name
        object identification key name
    """
    if (len(next_day_data.validation_metadata) > 0 and metadata_value
            in next_day_data.validation_metadata[id_key_name].values):
        val_indices = list(next_day_data.validation_metadata[
                               id_key_name].values).index(metadata_value)
        next_day_data.validation_metadata = (
            next_day_data.validation_metadata.drop(
                next_day_data.validation_metadata.index[val_indices]))
        next_day_data.validation_labels = np.delete(
            next_day_data.validation_labels, val_indices, axis=0)
        next_day_data.validation_features = np.delete(
            next_day_data.validation_features, val_indices, axis=0)

    if (len(next_day_data.test_metadata) > 0 and metadata_value
            in next_day_data.test_metadata[id_key_name].values):
        test_indices = list(next_day_data.test_metadata[
                                id_key_name].values).index(metadata_value)

        next_day_data.test_metadata = (
            next_day_data.test_metadata.drop(
                next_day_data.test_metadata.index[test_indices]))
        next_day_data.test_labels = np.delete(
            next_day_data.test_labels, test_indices, axis=0)
        next_day_data.test_features = np.delete(
            next_day_data.test_features, test_indices, axis=0)
    return next_day_data
Example #21
def learn_loop(nloops: int, strategy: str, path_to_features: str,
               output_diag_file: str, output_queried_file: str,
               features_method='Bazin', classifier='RandomForest',
               training='original', batch=1, screen=True, survey='DES',
               perc=0.1, nclass=2):
    """Perform the active learning loop. All results are saved to file.

    Parameters
    ----------
    nloops: int
        Number of active learning loops to run.
    strategy: str
        Query strategy. Options are 'UncSampling' and 'RandomSampling'.
    path_to_features: str or dict
        Complete path to input features file.
        if dict, keywords should be 'train' and 'test', 
        and values must contain the path for separate train 
        and test sample files.
    output_diag_file: str
        Full path to output file to store diagnostics of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently only 'RandomForest' is implemented.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia
        Default is 'original'.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    screen: bool (optional)
        If True, print on screen number of light curves processed.
    survey: str (optional)
        'DES' or 'LSST'. Default is 'DES'.
        Name of the survey which characterizes filter set.
    perc: float in [0,1] (optional)
        Percentile chosen to identify the new query. 
        Only used for PercentileSampling. 
        Default is 0.1.
    nclass: int (optional)
        Number of classes to consider in the classification
        Currently only nclass == 2 is implemented.    
    """

    ## This module will need to be expanded for RESSPECT

    # initiate object
    data = DataBase()

    # load features
    if isinstance(path_to_features, str):
        data.load_features(path_to_features, method=features_method,
                           screen=screen, survey=survey)

        # separate training and test samples
        data.build_samples(initial_training=training, nclass=nclass)

    else:
        data.load_features(path_to_features['train'], method=features_method,
                           screen=screen, survey=survey, sample='train')
        data.load_features(path_to_features['test'], method=features_method,
                           screen=screen, survey=survey, sample='test')

        data.build_samples(initial_training=training, nclass=nclass,
                           screen=screen, sep_files=True)
        
    for loop in range(nloops):

        if screen:
            print('Processing... ', loop)

        # classify
        data.classify(method=classifier)

        # calculate metrics
        data.evaluate_classification()

        # choose object to query
        indx = data.make_query(strategy=strategy, batch=batch, perc=perc)

        # update training and test samples
        data.update_samples(indx, loop=loop)

        # save diagnostics for current state
        data.save_metrics(loop=loop, output_metrics_file=output_diag_file,
                          batch=batch, epoch=loop)

        # save query sample to file
        data.save_queried_sample(output_queried_file, loop=loop,
                                 full_sample=False)
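A minimal call might look like the sketch below; all file paths are hypothetical:

learn_loop(nloops=10, strategy='UncSampling',
           path_to_features='results/Bazin.dat',
           output_diag_file='results/diagnostics.dat',
           output_queried_file='results/queried.dat',
           training='original', batch=1)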
Example #22
def learn_loop(nloops: int,
               strategy: str,
               path_to_features: str,
               output_metrics_file: str,
               output_queried_file: str,
               features_method: str = 'Bazin',
               classifier: str = 'RandomForest',
               training: str = 'original',
               batch: int = 1,
               survey: str = 'DES',
               nclass: int = 2,
               photo_class_thr: float = 0.5,
               photo_ids_to_file: bool = False,
               photo_ids_froot: str = ' ',
               classifier_bootstrap: bool = False,
               save_predictions: bool = False,
               sep_files=False,
               pred_dir: str = None,
               queryable: bool = False,
               metric_label: str = 'snpcc',
               save_alt_class: bool = False,
               SNANA_types: bool = False,
               metadata_fname: str = None,
               bar: bool = True,
               **kwargs):
    """
    Perform the active learning loop. All results are saved to file.

    Parameters
    ----------
    nloops: int
        Number of active learning loops to run.
    strategy: str
        Query strategy. Options are 'UncSampling', 'RandomSampling',
        'UncSamplingEntropy', 'UncSamplingLeastConfident', 'UncSamplingMargin',
        'QBDMI' and 'QBDEntropy'.
    path_to_features: str or dict
        Complete path to input features file.
        if dict, keywords should be 'train' and 'test',
        and values must contain the path for separate train
        and test sample files.
    output_metrics_file: str
        Full path to output file to store metric values of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently implemented options are 'RandomForest', 'GradientBoostedTrees',
        'K-NNclassifier', 'MLPclassifier', 'SVMclassifier' and 'NBclassifier'.
        Default is 'RandomForest'.
    sep_files: bool (optional)
        If True, consider train and test samples separately read
        from independent files. Default is False.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    classifier_bootstrap: bool (optional)
        Flag for bootstrapping on the classifier
        Must be true if using disagreement based strategy.
    metadata_fname: str (optional)
        Complete path to PLAsTiCC zenodo test metadata. Only used if
        SNANA_types == True. Default is None.
    metric_label: str (optional)
        Choice of metric.
        Currently only "snpcc", "cosmo" or "snpcc_cosmo" are accepted.
        Default is "snpcc".
    nclass: int (optional)
        Number of classes to consider in the classification
        Currently only nclass == 2 is implemented.
    photo_class_thr: float (optional)
        Threshold for photometric classification. Default is 0.5.
        Only used if photo_ids_to_file is True.
    photo_ids_to_file: bool (optional)
        If True, save photometric ids to file. Default is False.
    photo_ids_froot: str (optional)
        Output root of file name to store photo ids.
        Only used if photo_ids_to_file is True.
    pred_dir: str (optional)
        Output directory to store prediction file for each loop.
        Only used if `save_predictions==True`.
    queryable: bool (optional)
        If True, check if randomly chosen object is queryable.
        Default is False.
    save_alt_class: bool (optional)
        If True, train the model and save classifications for alternative
        query label (this is necessary to calculate impact on cosmology).
        Default is False.
    save_predictions: bool (optional)
        If True, save classification predictions to file in each loop.
        Default is False.
    SNANA_types: bool (optional)
        If True, translate zenodo types to SNANA codes.
        Default is False.
    survey: str (optional)
        'DES' or 'LSST'. Default is 'DES'.
        Name of the survey which characterizes filter set.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia
        Default is 'original'.
    bar: bool (optional)
        If True, display progress bar.     
    kwargs: extra parameters
        All keywords required by the classifier function.
    """
    if 'QBD' in strategy and not classifier_bootstrap:
        raise ValueError(
            'Bootstrap must be true when using disagreement strategy')

    # initiate object
    database_class = DataBase()
    logging.info('Loading features')
    database_class = load_features(database_class, path_to_features, survey,
                                   features_method, nclass, training,
                                   queryable, sep_files)

    logging.info('Running active learning loop')

    if bar:
        ensemble = progressbar.progressbar(range(nloops))
    else:
        ensemble = range(nloops)

    for iteration_step in ensemble:
        if not bar:
            print(iteration_step)

        database_class = run_classification(database_class, classifier,
                                            classifier_bootstrap, pred_dir,
                                            save_predictions, iteration_step,
                                            **kwargs)
        run_evaluation(database_class, metric_label)
        save_photo_ids(database_class, photo_ids_to_file, SNANA_types,
                       metadata_fname, photo_class_thr, iteration_step,
                       photo_ids_froot, '.dat')
        indices_to_query = run_make_query(database_class, strategy, batch,
                                          queryable)
        if save_alt_class and batch == 1:
            database_class_alternative = copy.deepcopy(database_class)
            database_class_alternative = update_alternative_label(
                database_class_alternative, indices_to_query, iteration_step,
                classifier, pred_dir, save_predictions, metric_label,
                SNANA_types, photo_ids_to_file, metadata_fname,
                photo_class_thr, photo_ids_froot, **kwargs)
            _save_metrics_and_queried_samples(database_class_alternative,
                                              output_metrics_file,
                                              output_queried_file,
                                              iteration_step, batch, False,
                                              '_alt_label.dat')

        elif save_alt_class and batch > 1:
            raise ValueError('Alternative label only works with batch=1!')

        database_class.update_samples(indices_to_query,
                                      epoch=iteration_step,
                                      queryable=queryable,
                                      alternative_label=False)
        _save_metrics_and_queried_samples(database_class, output_metrics_file,
                                          output_queried_file, iteration_step,
                                          batch, False)
    return database_class
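A usage sketch for the separate-files case; the paths are hypothetical and n_estimators stands in for whatever keywords the chosen classifier requires:

database = learn_loop(nloops=25, strategy='RandomSampling',
                      path_to_features={'train': 'train_Bazin.dat',
                                        'test': 'test_Bazin.dat'},
                      output_metrics_file='metrics.dat',
                      output_queried_file='queried.dat',
                      classifier='RandomForest', sep_files=True,
                      n_estimators=100)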
Example #23
def load_features(database_class: DataBase,
                  path_to_features: Union[str, dict],
                  survey: str,
                  features_method: str,
                  number_of_classes: int,
                  training_method: str,
                  is_queryable: bool,
                  separate_files: bool = False,
                  save_samples: bool = False) -> DataBase:
    """
    Load features according to feature extraction method

    Parameters
    ----------
    database_class
        An instance of DataBase class
    path_to_features
        Complete path to input features file.
        if dict, keywords should be 'train' and 'test',
        and values must contain the path for separate train
        and test sample files.
    survey
       'DES' or 'LSST'. Default is 'DES'.
        Name of the survey which characterizes filter set.
    features_method
        Feature extraction method. Currently only 'Bazin' is implemented.
    number_of_classes
        Number of classes to consider in the classification
        Currently only nclass == 2 is implemented.
    training_method
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia
        Default is 'original'.
    is_queryable
       If True, check if randomly chosen object is queryable.
       Default is False.
    separate_files: bool (optional)
        If True, consider train and test samples separately read
        from independent files. Default is False.
    save_samples: bool (optional)
        If True, save training and test samples to file.
        Default is False.
    """
    if isinstance(path_to_features, str):
        database_class.load_features(path_to_file=path_to_features,
                                     method=features_method,
                                     survey=survey)
    else:
        features_set_names = ['train', 'test', 'validation', 'pool']
        for sample_name in features_set_names:
            if sample_name in path_to_features.keys():
                database_class.load_features(path_to_features[sample_name],
                                             method=features_method,
                                             survey=survey,
                                             sample=sample_name)
            else:
                logging.warning(f'Path to {sample_name} not given.'
                                f' Proceeding without this sample')

    database_class.build_samples(initial_training=training_method,
                                 nclass=number_of_classes,
                                 queryable=is_queryable,
                                 sep_files=separate_files,
                                 save_samples=save_samples)

    return database_class
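A sketch of the dict-based call (file names hypothetical); any of 'train', 'test', 'validation' or 'pool' missing from the dictionary is skipped with a warning, as the loop above shows:

database = load_features(DataBase(), {'train': 'train_Bazin.dat',
                                      'test': 'test_Bazin.dat'},
                         survey='DES', features_method='Bazin',
                         number_of_classes=2, training_method='original',
                         is_queryable=False, separate_files=True)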
Example #24
def update_alternative_label(
        database_class_alternative: DataBase, indices_to_query: list,
        iteration_step: int, classifier: str, pred_dir: str,
        is_save_prediction: bool, metric_label: str, is_save_snana_types: bool,
        is_save_photoids_to_file: bool, meta_data_fname: str,
        photo_class_threshold: float, photo_ids_froot: str, **kwargs: dict):
    """
    Function to update active learning training with alternative label

    Parameters
    ----------
    database_class_alternative
        An instance of DataBase class for alternative label
    indices_to_query
        List of indexes identifying objects to be moved.
    iteration_step
        active learning iteration number
    classifier
        Machine Learning algorithm.
        Currently implemented options are 'RandomForest', 'GradientBoostedTrees',
        'K-NNclassifier', 'MLPclassifier', 'SVMclassifier' and 'NBclassifier'.
        Default is 'RandomForest'.
    pred_dir
        Output directory to store prediction file for each loop.
        Only used if `save_predictions==True`.
    is_save_prediction
        if predictions should be saved
    metric_label
        Choice of metric.
        Currently only "snpcc", "cosmo" or "snpcc_cosmo" are accepted.
        Default is "snpcc".
    is_save_snana_types
        if True, translate type to SNANA codes and
        add column with original values. Default is False.
    is_save_photoids_to_file
        If True, write the photo Ia sample to file. Otherwise,
        populate the photo_Ia_list attribute. Default is False.
    meta_data_fname
        Full path to PLAsTiCC zenodo test metadata file.
    photo_class_threshold
         Probability threshold above which an object is considered Ia.
    photo_ids_froot
        Output root of file name to store photo ids.
        Only used if is_save_photoids_to_file is True.
    kwargs
        additional arguments
    """
    database_class_alternative.update_samples(indices_to_query,
                                              epoch=iteration_step,
                                              alternative_label=True)

    database_class_alternative = run_classification(database_class_alternative,
                                                    classifier, False,
                                                    pred_dir,
                                                    is_save_prediction,
                                                    iteration_step, **kwargs)

    run_evaluation(database_class_alternative, metric_label)

    save_photo_ids(database_class_alternative, is_save_photoids_to_file,
                   is_save_snana_types, meta_data_fname, photo_class_threshold,
                   iteration_step, photo_ids_froot, '_alt_label.dat')

    return database_class_alternative
Example #25
def load_dataset(file_names_dict: dict, survey_name: str = 'DES',
                 initial_training: Union[str, int] = 'original',
                 ia_training_fraction: float = 0.5, is_queryable: bool = False,
                 is_separate_files: bool = False, samples_list: list = [None],
                 is_load_build_samples: bool = True,
                 number_of_classes: int = 2,
                 feature_extraction_method: str = 'Bazin',
                 is_save_samples: bool = False) -> DataBase:
    """
    Reads a data sample from file.

    Parameters
    ----------
    file_names_dict:  dict
        Path to light curve features file.
        #if "sep_files == True", dictionary keywords must contain identify
        #different samples: ['train', 'test','validation', 'pool',  None]
    ia_training_fraction: float in [0,1] (optional)
        Fraction of Ia required in initial training sample.
        Only used if "initial_training" is a number. Default is 0.5.
    initial_training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        elif int: choose the required number of samples at random,
        ensuring that at least "ia_frac" are SN Ia.
        Default is 'original'.
    is_queryable: bool (optional)
        If True, allow queries only on objects flagged as queryable.
        Default is False.
    is_separate_files: bool (optional)
        If True, consider samples separately read
        from independent files. Default is False.
    survey_name: str (optional)
        Name of survey to be analyzed. Accepts 'DES' or 'LSST'.
        Default is DES.
    samples_list: list (optional)
        If None, sample is given by a column within the given file.
        else, read independent files for 'train' and 'test'.
        Default is [None].
    number_of_classes
        Number of classes to consider in the classification
        Currently only nclass == 2 is implemented.
    feature_extraction_method: str (optional)
        Feature extraction method. The current implementation only
        accepts method=='Bazin' or 'photometry'.
        Default is 'Bazin'.
    is_save_samples: bool (optional)
        If True, save training and test samples to file.
        Default is False.
    is_load_build_samples: bool (optional)
        If True, call the build_samples method after loading the features.
        Default is True.
    """

    # initiate object
    database_class = DataBase()
    for sample in samples_list:
        database_class.load_features(
            file_names_dict[sample], survey=survey_name, sample=sample,
            method=feature_extraction_method)
    if is_load_build_samples:
        database_class.build_samples(
            initial_training=initial_training, nclass=number_of_classes,
            Ia_frac=ia_training_fraction, queryable=is_queryable,
            save_samples=is_save_samples, sep_files=is_separate_files,
            survey=survey_name)
    return database_class
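Because samples_list defaults to [None], the single-file case requires the dictionary to be keyed by None; both calls below are hypothetical sketches:

# single file: the sample of each object is given by a column in the file
database = load_dataset({None: 'Bazin.dat'})

# separate files: one file per sample
database = load_dataset({'train': 'train.dat', 'test': 'test.dat'},
                        samples_list=['train', 'test'],
                        is_separate_files=True)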
Example #26
def _update_light_curve_data_for_next_epoch(
        light_curve_data: DataBase, next_day_data: DataBase,
        canonical_data: DataBase, is_queryable: bool, strategy: str,
        is_separate_files: bool) -> DataBase:
    """
    Updates samples for next epoch

    Parameters
    ----------
    light_curve_data
        light curve learning data
    next_day_data
        next day light curve data
    canonical_data
        canonical strategy light curve data
    is_queryable
        If True, allow queries only on objects flagged as queryable.
        Default is True.
    strategy
        Query strategy. Options are (all can be run with budget):
        "UncSampling", "UncSamplingEntropy", "UncSamplingLeastConfident",
        "UncSamplingMargin", "QBDMI", "QBDEntropy", "RandomSampling",
    is_separate_files
        If True, consider samples separately read
        from independent files. Default is False.
    """
    light_curve_data.pool_metadata = next_day_data.pool_metadata
    light_curve_data.pool_features = next_day_data.pool_features
    light_curve_data.pool_labels = next_day_data.pool_labels

    if not is_separate_files:
        light_curve_data.test_metadata = next_day_data.test_metadata
        light_curve_data.test_features = next_day_data.test_features
        light_curve_data.test_labels = next_day_data.test_labels

        light_curve_data.validation_metadata = next_day_data.validation_metadata
        light_curve_data.validation_features = next_day_data.validation_features
        light_curve_data.validation_labels = next_day_data.validation_labels

    if strategy == 'canonical':
        light_curve_data.queryable_ids = canonical_data.queryable_ids

    if is_queryable:
        queryable_flag = light_curve_data.pool_metadata['queryable'].values
        light_curve_data.queryable_ids = light_curve_data.pool_metadata[
            'id'].values[queryable_flag]
    else:
        light_curve_data.queryable_ids = light_curve_data.pool_metadata[
            'id'].values
    return light_curve_data
Example #27
def time_domain_loop(days: list, output_diag_file: str,
                     output_queried_file: str,
                     path_to_features_dir: str, strategy: str,
                     batch=1, canonical=False, classifier='RandomForest',
                     features_method='Bazin', path_to_canonical="",
                     path_to_full_lc_features="", queryable=False,
                     screen=True, training='original'):
    """Perform the active learning loop. All results are saved to file.

    Parameters
    ----------
    days: list
        List of 2 elements. First and last day of observations since the
        beginning of the survey.
    output_diag_file: str
        Full path to output file to store diagnostics of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    path_to_features_dir: str
        Complete path to directory holding features files for all days.
    strategy: str
        Query strategy. Options are 'UncSampling' and 'RandomSampling'.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    canonical: bool (optional)
        If True, restrict the search to the canonical sample.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently only 'RandomForest' is implemented.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    path_to_canonical: str (optional)
        Path to canonical sample features files.
        It is only used if "strategy==canonical".
    path_to_full_lc_features: str (optional)
        Path to full light curve features file.
        Only used if training is 'original'.
    queryable: bool (optional)
        If True, allow queries only on objects flagged as queryable.
        Default is False.
    screen: bool (optional)
        If True, print on screen number of light curves processed.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia
        Default is 'original'.

    """

    ## This will need to change for RESSPECT

    # initiate object
    data = DataBase()

    # load features for the first day
    path_to_features = path_to_features_dir + 'day_' + str(int(days[0])) + '.dat'
    data.load_features(path_to_features, method=features_method,
                       screen=screen)

    # change training
    if training == 'original':
        data.build_samples(initial_training='original')
        full_lc_features = get_original_training(path_to_features=path_to_full_lc_features)
        data.train_metadata = full_lc_features.train_metadata
        data.train_labels = full_lc_features.train_labels
        data.train_features = full_lc_features.train_features

    else:
        data.build_samples(initial_training=int(training))

    # get list of canonical ids
    if canonical:
        canonical = DataBase()
        canonical.load_features(path_to_file=path_to_canonical)
        data.queryable_ids = canonical.queryable_ids


    for night in range(int(days[0]), int(days[-1]) - 1):

        if screen:
            print('Processing night: ', night)

        # loop counter
        loop = night - int(days[0])

        # classify
        data.classify(method=classifier)

        # calculate metrics
        data.evaluate_classification()

        # choose object to query
        indx = data.make_query(strategy=strategy, batch=batch)

        # update training and test samples
        data.update_samples(indx, loop=loop)

        # save diagnostics for current state
        data.save_metrics(loop=loop, output_metrics_file=output_diag_file,
                          batch=batch, epoch=night)

        # save query sample to file
        data.save_queried_sample(output_queried_file, loop=loop,
                                 full_sample=False)

        # load features for next day
        path_to_features2 = path_to_features_dir + 'day_' + str(night + 1) + '.dat'

        data_tomorrow = DataBase()
        data_tomorrow.load_features(path_to_features2, method=features_method,
                                    screen=False)

        # identify objects in the new day which must be in training
        train_flag = np.array([item in data.train_metadata['id'].values 
                              for item in data_tomorrow.metadata['id'].values])
   
        # use new data        
        data.train_metadata = data_tomorrow.metadata[train_flag]
        data.train_features = data_tomorrow.features.values[train_flag]
        data.test_metadata = data_tomorrow.metadata[~train_flag]
        data.test_features = data_tomorrow.features.values[~train_flag]

        # new labels
        data.train_labels = np.array([int(item  == 'Ia') for item in 
                                     data.train_metadata['type'].values])
        data.test_labels = np.array([int(item == 'Ia') for item in 
                                    data.test_metadata['type'].values])

        if strategy == 'canonical':
            data.queryable_ids = canonical.queryable_ids

        if queryable:
            queryable_flag = data_tomorrow.metadata['queryable'].values
            queryable_test_flag = np.logical_and(~train_flag, queryable_flag)
            data.queryable_ids = data_tomorrow.metadata['id'].values[queryable_test_flag]
        else:
            data.queryable_ids = data_tomorrow.metadata['id'].values[~train_flag]

        if screen:
            print('Training set size: ', data.train_metadata.shape[0])
            print('Test set size: ', data.test_metadata.shape[0])
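A minimal usage sketch (paths hypothetical). Since the code concatenates 'day_N.dat' directly onto path_to_features_dir, the directory string must end with a path separator:

time_domain_loop(days=[20, 180], output_diag_file='diag.dat',
                 output_queried_file='queried.dat',
                 path_to_features_dir='features/',
                 strategy='UncSampling', training='original',
                 path_to_full_lc_features='full_Bazin.dat')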