def _update_canonical_ids(light_curve_data: DataBase, canonical_file_name: str, is_restrict_canonical: bool) -> Tuple[ DataBase, DataBase]: """ Updates canonical ids Parameters ---------- light_curve_data initial light curve training data canonical_file_name Path to canonical sample features files. It is only used if "strategy==canonical". is_restrict_canonical If True, restrict the search to the canonical sample. """ database_class = None if is_restrict_canonical: database_class = DataBase() database_class.load_features(path_to_file=canonical_file_name) light_curve_data.queryable_ids = database_class.queryable_ids return light_curve_data, database_class
def load_features(database_class: DataBase,
                  path_to_features: Union[str, dict],
                  survey: str, features_method: str,
                  number_of_classes: int,
                  training_method: str, is_queryable: bool,
                  separate_files: bool = False,
                  save_samples: bool = False) -> DataBase:
    """
    Load features into a DataBase and build its samples.

    Parameters
    ----------
    database_class
        An instance of DataBase class. It is filled in place and returned.
    path_to_features
        Complete path to input features file.
        If dict, keywords name the samples ('train', 'test', 'validation',
        'pool') and values must contain the path to the corresponding
        file. Samples missing from the dict are skipped with a warning.
    survey
        'DES' or 'LSST'.
        Name of the survey which characterizes filter set.
    features_method
        Feature extraction method. Currently only 'Bazin' is implemented.
    number_of_classes
        Number of classes to consider in the classification.
        Currently only nclass == 2 is implemented.
    training_method
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file.
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia.
    is_queryable
        If True, check if randomly chosen object is queryable.
    separate_files: bool (optional)
        If True, consider train and test samples separately read
        from independent files. Default is False.
    save_samples: bool (optional)
        If True, save training and test samples to file.
        Default is False.

    Returns
    -------
    DataBase
        The same `database_class` object received as input.
    """
    if not isinstance(path_to_features, str):
        # One file per sample: load whichever of the known samples were given.
        for sample_name in ('train', 'test', 'validation', 'pool'):
            if sample_name not in path_to_features.keys():
                logging.warning(f'Path to {sample_name} not given.'
                                f' Proceeding without this sample')
                continue
            database_class.load_features(
                path_to_features[sample_name], method=features_method,
                survey=survey, sample=sample_name)
    else:
        # A single file holds every sample.
        database_class.load_features(
            path_to_file=path_to_features, method=features_method,
            survey=survey)

    build_options = dict(initial_training=training_method,
                         nclass=number_of_classes,
                         queryable=is_queryable,
                         sep_files=separate_files,
                         save_samples=save_samples)
    database_class.build_samples(**build_options)

    return database_class
def learn_loop(nloops: int, strategy: str, path_to_features: str,
               output_diag_file: str, output_queried_file: str,
               features_method='Bazin', classifier='RandomForest',
               training='original', batch=1, screen=True, survey='DES',
               perc=0.1, nclass=2):
    """Run the active learning loop and save all results to file.

    Parameters
    ----------
    nloops: int
        Number of active learning loops to run.
    strategy: str
        Query strategy. Options are 'UncSampling' and 'RandomSampling'.
    path_to_features: str or dict
        Complete path to input features file.
        If dict, keywords should be 'train' and 'test',
        and values must contain the path for separate train
        and test sample files.
    output_diag_file: str
        Full path to output file to store diagnostics of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently only 'RandomForest' is implemented.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file.
        If int: choose the required number of samples at random,
        ensuring that at least half are SN Ia.
        Default is 'original'.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    screen: bool (optional)
        If True, print on screen number of light curves processed.
    survey: str (optional)
        'DES' or 'LSST'. Default is 'DES'.
        Name of the survey which characterizes filter set.
    perc: float in [0,1] (optional)
        Percentile chosen to identify the new query.
        Only used for PercentileSampling.
        Default is 0.1.
    nclass: int (optional)
        Number of classes to consider in the classification.
        Currently only nclass == 2 is implemented.
    """
    ## This module will need to be expanded for RESSPECT

    data = DataBase()

    # Keyword options shared by every features file that gets loaded.
    load_options = dict(method=features_method, screen=screen, survey=survey)

    if isinstance(path_to_features, str):
        # Single file: samples are separated afterwards by build_samples.
        data.load_features(path_to_features, **load_options)
        data.build_samples(initial_training=training, nclass=nclass)
    else:
        # Independent files for the train and test samples.
        for sample_name in ('train', 'test'):
            data.load_features(path_to_features[sample_name],
                               sample=sample_name, **load_options)
        data.build_samples(initial_training=training, nclass=nclass,
                           screen=screen, sep_files=True)

    for iteration in range(nloops):
        if screen:
            print('Processing... ', iteration)

        # Train the classifier and measure its current performance.
        data.classify(method=classifier)
        data.evaluate_classification()

        # Pick the next object(s) to query and move them between samples.
        queried = data.make_query(strategy=strategy, batch=batch, perc=perc)
        data.update_samples(queried, loop=iteration)

        # Persist diagnostics and the queried sample for this iteration.
        data.save_metrics(loop=iteration,
                          output_metrics_file=output_diag_file,
                          batch=batch, epoch=iteration)
        data.save_queried_sample(output_queried_file, loop=iteration,
                                 full_sample=False)
def time_domain_loop(days: list, output_diag_file: str,
                     output_queried_file: str,
                     path_to_features_dir: str, strategy: str,
                     batch=1, canonical=False, classifier='RandomForest',
                     features_method='Bazin', path_to_canonical="",
                     path_to_full_lc_features="",
                     screen=True, training='original', queryable=False):
    """Perform the time domain active learning loop.

    All results are saved to file.

    Parameters
    ----------
    days: list
        List of 2 elements. First and last day of observations since the
        beginning of the survey. The loop runs over
        ``range(int(days[0]), int(days[-1]) - 1)`` and, at each night,
        loads the features file of the following day.
    output_diag_file: str
        Full path to output file to store diagnostics of each loop.
    output_queried_file: str
        Full path to output file to store the queried sample.
    path_to_features_dir: str
        Complete path to directory holding features files for all days.
        The file name ``day_<N>.dat`` is appended by plain string
        concatenation, so the path must end with a path separator.
    strategy: str
        Query strategy. Options are 'UncSampling' and 'RandomSampling'.
    batch: int (optional)
        Size of batch to be queried in each loop. Default is 1.
    canonical: bool (optional)
        If True, load the canonical sample and restrict the search to it.
        Default is False.
    classifier: str (optional)
        Machine Learning algorithm.
        Currently only 'RandomForest' is implemented.
    features_method: str (optional)
        Feature extraction method. Currently only 'Bazin' is implemented.
    path_to_canonical: str (optional)
        Path to canonical sample features file.
        It is only read if `canonical` is True.
    path_to_full_lc_features: str (optional)
        Path to full light curve features file.
        Only used if training == 'original'.
    screen: bool (optional)
        If True, print on screen number of light curves processed.
    training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file.
        Otherwise the value is converted with int() and that number of
        samples is requested from build_samples.
        Default is 'original'.
    queryable: bool (optional)
        If True, only objects of the new day flagged in the 'queryable'
        metadata column (and not in training) can be queried.
        If False, every object of the new day outside the training
        sample can be queried. Default is False.

    Notes
    -----
    After each night the queryable ids are refreshed with this precedence:
    canonical sample ids (when strategy == 'canonical' and the canonical
    sample was loaded), then the `queryable` flag, then every test object.
    """
    ## This will need to change for RESSPECT

    first_day = int(days[0])
    last_day = int(days[-1])

    # initiate object and load features for the first day
    data = DataBase()
    path_to_features = path_to_features_dir + 'day_' + str(first_day) + '.dat'
    data.load_features(path_to_features, method=features_method,
                       screen=screen)

    # change training
    if training == 'original':
        data.build_samples(initial_training='original')
        # The training sample is replaced by the full light curve one.
        full_lc_features = get_original_training(
            path_to_features=path_to_full_lc_features)
        data.train_metadata = full_lc_features.train_metadata
        data.train_labels = full_lc_features.train_labels
        data.train_features = full_lc_features.train_features
    else:
        data.build_samples(initial_training=int(training))

    # Keep the canonical sample in its own name instead of rebinding the
    # `canonical` flag, so later checks can tell whether it was loaded.
    canonical_sample = None
    if canonical:
        canonical_sample = DataBase()
        canonical_sample.load_features(path_to_file=path_to_canonical)
        data.queryable_ids = canonical_sample.queryable_ids

    for night in range(first_day, last_day - 1):

        if screen:
            print('Processing night: ', night)

        # loop counter, starting from zero at the first night
        loop = night - first_day

        # classify and calculate metrics
        data.classify(method=classifier)
        data.evaluate_classification()

        # choose object to query, then update training and test samples
        indx = data.make_query(strategy=strategy, batch=batch)
        data.update_samples(indx, loop=loop)

        # save diagnostics for current state
        data.save_metrics(loop=loop, output_metrics_file=output_diag_file,
                          batch=batch, epoch=night)

        # save query sample to file
        data.save_queried_sample(output_queried_file, loop=loop,
                                 full_sample=False)

        # load features for next day
        path_to_features2 = (path_to_features_dir + 'day_'
                             + str(night + 1) + '.dat')
        data_tomorrow = DataBase()
        data_tomorrow.load_features(path_to_features2,
                                    method=features_method, screen=False)

        # identify objects in the new day which must be in training;
        # the set is built once so each membership test is O(1)
        tomorrow_ids = data_tomorrow.metadata['id'].values
        current_train_ids = set(data.train_metadata['id'].values)
        train_flag = np.array(
            [item in current_train_ids for item in tomorrow_ids],
            dtype=bool)

        # use new data
        data.train_metadata = data_tomorrow.metadata[train_flag]
        data.train_features = data_tomorrow.features.values[train_flag]
        data.test_metadata = data_tomorrow.metadata[~train_flag]
        data.test_features = data_tomorrow.features.values[~train_flag]

        # new labels: 1 for 'Ia', 0 for anything else
        data.train_labels = np.array(
            [int(item == 'Ia')
             for item in data.train_metadata['type'].values])
        data.test_labels = np.array(
            [int(item == 'Ia')
             for item in data.test_metadata['type'].values])

        # refresh the queryable ids; the branches are exclusive so the
        # canonical restriction is not overwritten by the generic cases
        if strategy == 'canonical' and canonical_sample is not None:
            data.queryable_ids = canonical_sample.queryable_ids
        elif queryable:
            queryable_flag = data_tomorrow.metadata['queryable'].values
            queryable_test_flag = np.logical_and(~train_flag,
                                                 queryable_flag)
            data.queryable_ids = tomorrow_ids[queryable_test_flag]
        else:
            data.queryable_ids = tomorrow_ids[~train_flag]

        if screen:
            print('Training set size: ', data.train_metadata.shape[0])
            print('Test set size: ', data.test_metadata.shape[0])
def load_dataset(file_names_dict: dict, survey_name: str = 'DES',
                 initial_training: Union[str, int] = 'original',
                 ia_training_fraction: float = 0.5,
                 is_queryable: bool = False,
                 is_separate_files: bool = False,
                 samples_list: Union[list, None] = None,
                 is_load_build_samples: bool = True,
                 number_of_classes: int = 2,
                 feature_extraction_method: str = 'Bazin',
                 is_save_samples: bool = False) -> DataBase:
    """
    Reads a data sample from file.

    Parameters
    ----------
    file_names_dict: dict
        Path to light curve features file(s), keyed by sample name.
        Every entry of `samples_list` must be a key of this dictionary
        (including None when the default `samples_list` is used).
    survey_name: str (optional)
        Name of survey to be analyzed. Accepts 'DES' or 'LSST'.
        Default is DES.
    initial_training: str or int (optional)
        Choice of initial training sample.
        If 'original': begin from the train sample flagged in the file
        elif int: choose the required number of samples at random,
        ensuring that at least "ia_training_fraction" are SN Ia.
        Default is 'original'.
    ia_training_fraction: float in [0,1] (optional)
        Fraction of Ia required in initial training sample.
        Only used if "initial_training" is a number. Default is 0.5.
    is_queryable: bool (optional)
        If True, allow queries only on objects flagged as queryable.
        Default is False.
    is_separate_files: bool (optional)
        If True, consider samples separately read
        from independent files. Default is False.
    samples_list: list (optional)
        Names of the samples to load, e.g. ['train', 'test'].
        If None (default), a single sample named None is loaded from
        ``file_names_dict[None]``; the sample is then given by a column
        within the given file.
    is_load_build_samples: bool (optional)
        If True, call the build_samples method of the database after
        loading the features. Default is True.
    number_of_classes: int (optional)
        Number of classes to consider in the classification.
        Currently only nclass == 2 is implemented.
    feature_extraction_method: str (optional)
        Feature extraction method. The current implementation only
        accepts method=='Bazin' or 'photometry'.
        Default is 'Bazin'.
    is_save_samples: bool (optional)
        If True, save training and test samples to file.
        Default is False.

    Returns
    -------
    DataBase
        Database holding the loaded features (and built samples, when
        `is_load_build_samples` is True).
    """
    # A None sentinel replaces the former mutable `[None]` default, which
    # would have been shared between calls.
    if samples_list is None:
        samples_list = [None]

    # initiate object
    database_class = DataBase()

    for sample in samples_list:
        database_class.load_features(
            file_names_dict[sample], survey=survey_name, sample=sample,
            method=feature_extraction_method)

    if is_load_build_samples:
        database_class.build_samples(
            initial_training=initial_training, nclass=number_of_classes,
            Ia_frac=ia_training_fraction, queryable=is_queryable,
            save_samples=is_save_samples, sep_files=is_separate_files,
            survey=survey_name)

    return database_class