def test_trainingset(tsetkey, linfit):
    """
    Check construction of a training set and its basic properties.

    Covers the default attributes for valid test-fractions, the ValueError
    raised on invalid constructor arguments, and that the training and test
    label lists add up to the number of objects in the set.
    """
    # Look up the training set class from its key:
    tsetclass = get_trainingset(tsetkey)

    # With linfit enabled the key of the set carries a suffix:
    expected_key = (tsetkey + '-linfit') if linfit else tsetkey

    for testfraction in (0, 0.2):
        tset = tsetclass(tf=testfraction, linfit=linfit)
        print(tset)
        assert tset.key == expected_key
        assert tset.level == 'L1'
        assert tset.datalevel == 'corr'
        assert tset.testfraction == testfraction
        assert len(tset) > 0

    # Each of these sets of arguments is invalid and should give a ValueError
    # (bad level, test-fractions outside the valid range, bad datalevel):
    invalid_arguments = (
        {'level': 'nonsense'},
        {'tf': 1.2},
        {'tf': 1.0},
        {'tf': -0.2},
        {'datalevel': 'nonsense'},
    )
    for bad_kwargs in invalid_arguments:
        with pytest.raises(ValueError):
            tsetclass(**bad_kwargs)

    # Without a test-fraction there should be no test labels at all:
    tset = tsetclass(tf=0, linfit=linfit)
    print(tset)
    train_labels = tset.labels()
    test_labels = tset.labels_test()
    print(tset.nobjects)
    print(len(train_labels), len(test_labels))
    assert len(train_labels) == tset.nobjects
    assert len(test_labels) == 0

    # With a test-fraction the two label lists together cover all objects:
    tset = tsetclass(tf=0.2, linfit=linfit)
    print(tset)
    train_labels = tset.labels()
    test_labels = tset.labels_test()
    print(tset.nobjects)
    print(len(train_labels), len(test_labels))
    assert len(train_labels) + len(test_labels) == tset.nobjects
def test_trainingset_generate_todolist(monkeypatch, tsetkey, linfit):
    """
    Check that a training set builds its todo-file when none exists.

    A skeleton copy of the training set (without todo-files and without real
    data) is placed in a temporary directory, STARCLASS_TSETS is pointed at
    it, and the training set is initialized again. The test then verifies
    that the todo-file appeared and that clear_cache() removes the
    features cache directory.
    """
    # Look up the training set class from its key:
    tsetclass = get_trainingset(tsetkey)
    tset = tsetclass(linfit=linfit)
    input_folder = tset.input_folder
    print("Training Set input folder: %s" % input_folder)

    todo_suffixes = ('.sqlite', '.sqlite-journal')

    with tempfile.TemporaryDirectory(prefix='pytest-private-tsets-') as tmpdir:
        # Build a copy of the root files of the training set (ignoring the
        # actual data) inside the temporary directory:
        tsetdir = os.path.join(tmpdir, os.path.basename(input_folder))
        print("New dummy input folder: %s" % tsetdir)
        os.makedirs(tsetdir)

        for name in os.listdir(input_folder):
            source_path = os.path.join(input_folder, name)
            if os.path.isfile(source_path):
                # Existing todo-files are left out, so they have to be regenerated:
                if not name.endswith(todo_suffixes):
                    shutil.copy(source_path, tsetdir)
            elif os.path.isdir(source_path) and not name.startswith('features_cache'):
                # NOTE: We are cheating here by creating empty files with the
                # correct names. The contents are not needed for building
                # the todo-list, the files only need to exist.
                os.makedirs(os.path.join(tsetdir, name))
                for subname in os.listdir(source_path):
                    with open(os.path.join(tsetdir, name, subname), 'w'):
                        pass

        # Fake features_cache directory holding a single dummy file:
        new_featdir = os.path.join(tsetdir, os.path.basename(tset.features_cache))
        os.makedirs(new_featdir)
        with open(os.path.join(new_featdir, 'dummy.txt'), 'w'):
            pass

        # Point the environment variable at the temporary directory:
        monkeypatch.setenv("STARCLASS_TSETS", tmpdir)
        print(os.environ['STARCLASS_TSETS'])

        # Initializing the training set now should generate the todo-file:
        tset = tsetclass(linfit=linfit)

        # The todo-file should have been created in the new location:
        assert tset.input_folder == tsetdir
        todo_path = os.path.join(tsetdir, tset._todo_name + '.sqlite')
        assert os.path.isfile(todo_path)

        # The dummy features_cache directory should have been picked up:
        assert os.path.isdir(tset.features_cache)
        assert os.listdir(tset.features_cache) == ['dummy.txt']

        # After clearing the cache the directory should be gone:
        tset.clear_cache()
        assert not os.path.exists(tset.features_cache), "features_cache still exists"
def test_trainingset_features(tsetkey, linfit):
    """
    Check that the feature iterators of a training set are generators and
    that the first two yielded items are dicts holding the expected entries.
    """
    # Look up the training set class from its key:
    tsetclass = get_trainingset(tsetkey)
    tset = tsetclass(tf=0.2, linfit=linfit)

    # Both the training and the test features should be lazy generators:
    features = tset.features()
    assert isinstance(features, types.GeneratorType)

    features_test = tset.features_test()
    assert isinstance(features_test, types.GeneratorType)

    # Pull the first two sets of features from the training part:
    required_keys = ('lightcurve', 'powerspectrum', 'frequencies')
    for _ in range(2):
        feat = next(features)
        print(feat)
        assert isinstance(feat, dict)
        for key in required_keys:
            assert key in feat
def test_trainingset_folds(tsetkey, linfit):
    """
    Check the cross-validation folds generated from a training set.

    Each fold should be an instance of the same training set class, share
    key, level and random seed with the parent set, and hold non-empty
    training and test index lists. Exactly five folds are expected.
    """
    # Look up the training set class from its key:
    tsetclass = get_trainingset(tsetkey)
    tset = tsetclass(linfit=linfit)

    # Count the folds explicitly, so an empty iterator results in a clear
    # assertion failure below instead of a NameError on the loop variable:
    num_folds = 0
    for num_folds, fold in enumerate(tset.folds(n_splits=5, tf=0.2), start=1):
        assert isinstance(fold, tsetclass)
        assert fold.key == tset.key
        assert fold.crossval_folds == 5
        assert fold.fold == num_folds  # Folds are numbered from one
        assert fold.testfraction == 0.2
        assert fold.level == tset.level
        assert fold.random_seed == tset.random_seed
        assert len(fold.train_idx) > 0
        assert len(fold.test_idx) > 0
        assert len(fold.train_idx) > len(fold.test_idx)
        assert len(fold.train_idx) < len(tset.train_idx)

    assert num_folds == 5, "Not the correct number of folds"
def main():
    """
    Command-line entry point: run stellar classifiers on the tasks of a todo-file.

    Parses the command line, configures logging and resolves the todo-file
    from the positional argument or, if that is not given, from the
    ``STARCLASS_INPUT`` environment variable. Tasks are then fetched from the
    TaskManager, classified and saved until no more tasks are returned.

    Invalid argument combinations and a missing or non-existing input
    location are reported through ``parser.error``, which exits the program.
    """
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Command-line interface for running stellar classifiers.')
    parser.add_argument('-d', '--debug', help='Print debug messages.', action='store_true')
    parser.add_argument('-q', '--quiet', help='Only report warnings and errors.', action='store_true')
    parser.add_argument('-o', '--overwrite', help='Overwrite existing results.', action='store_true')
    parser.add_argument(
        '--clear-cache',
        help='Clear existing features cache tables before running. Can only be used together with --overwrite.',
        action='store_true')
    # Option to select which classifier to run:
    parser.add_argument(
        '-c', '--classifier',
        default=None,
        choices=starclass.classifier_list,
        metavar='{CLASSIFIER}',
        help='Classifier to run. Default is to run all classifiers. Choices are '
        + ", ".join(starclass.classifier_list) + '.')
    # Option to select training set:
    parser.add_argument(
        '-t', '--trainingset',
        default='keplerq9v3',
        choices=starclass.trainingset_list,
        metavar='{TSET}',
        help='Train classifier using this training-set. Choices are '
        + ", ".join(starclass.trainingset_list) + '.')
    parser.add_argument('-l', '--level', help='Classification level.', default='L1', choices=('L1', 'L2'))
    parser.add_argument('--linfit', help='Enable linfit in training set.', action='store_true')
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    #parser.add_argument('--starid', type=int, help='TIC identifier of target.', nargs='?', default=None)
    # Lightcurve truncate override switch.
    # The default of None leaves the decision to the classifier:
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--truncate', dest='truncate', action='store_true', help='Force light curve truncation.')
    group.add_argument('--no-truncate', dest='truncate', action='store_false', help='Force no light curve truncation.')
    parser.set_defaults(truncate=None)
    # Data directory:
    parser.add_argument(
        '--datadir',
        type=str,
        default=None,
        help='Directory where trained models and diagnostics will be loaded. Default is to load from the programs data directory.')
    # Input todo-file/directory:
    parser.add_argument('input_folder', type=str, nargs='?', default=None,
        help='Input directory to run classification on.')
    args = parser.parse_args()

    # Cache tables (MOAT) should not be cleared unless results tables are also cleared.
    # Otherwise we could end up with non-complete MOAT tables.
    if args.clear_cache and not args.overwrite:
        parser.error("--clear-cache can not be used without --overwrite")

    # Set logging level:
    logging_level = logging.INFO
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG

    # Setup logging:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger = logging.getLogger(__name__)
    logger.addHandler(console)
    logger.setLevel(logging_level)
    logger_parent = logging.getLogger('starclass')
    logger_parent.addHandler(console)
    logger_parent.setLevel(logging_level)

    # Get the input location from the command line or the environment variable:
    input_folder = args.input_folder
    if input_folder is None:
        input_folder = os.environ.get('STARCLASS_INPUT')
    if input_folder is None:
        parser.error("No input folder specified")
    if not os.path.exists(input_folder):
        parser.error("INPUT_FOLDER does not exist")

    # The input can either be a directory containing "todo.sqlite"
    # or the path to a todo-file itself:
    if os.path.isdir(input_folder):
        todo_file = os.path.join(input_folder, 'todo.sqlite')
    else:
        todo_file = os.path.abspath(input_folder)

    # Choose which classifier to use:
    # If nothing was specified, run all classifiers, and automatically switch between them:
    if args.classifier is None:
        current_classifier = starclass.classifier_list[0]
        change_classifier = True
    else:
        current_classifier = args.classifier
        change_classifier = False

    # Initialize training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, linfit=args.linfit)

    # Running:
    # When simply running the classifier on new stars:
    stcl = None
    with starclass.TaskManager(todo_file, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
        # If we were asked to do so, start by clearing the existing MOAT tables:
        if args.overwrite and args.clear_cache:
            tm.moat_clear()

        try:
            while True:
                tasks = tm.get_task(classifier=current_classifier, change_classifier=change_classifier)
                if tasks is None:
                    break
                tm.start_task(tasks)

                # ----------------- This code would run on each worker ------------------------
                # Make sure we can loop through tasks,
                # even in the case we have only gotten one:
                if isinstance(tasks, dict):
                    tasks = [tasks]

                # (Re)create the classifier if the task asks for a different one:
                if tasks[0]['classifier'] != current_classifier or stcl is None:
                    current_classifier = tasks[0]['classifier']
                    if stcl is not None:
                        stcl.close()
                        stcl = None
                    # Keep the class under its own name, so "stcl" is only
                    # ever None or an instance that can be closed:
                    classifier_class = starclass.get_classifier(current_classifier)
                    stcl = classifier_class(
                        tset=tset,
                        features_cache=None,
                        truncate_lightcurves=args.truncate,
                        data_dir=args.datadir)

                results = [stcl.classify(task) for task in tasks]
                # ----------------- This code would run on each worker ------------------------

                # Return to TaskManager to be saved:
                tm.save_results(results)
        finally:
            # Make sure the last classifier in use is closed as well,
            # also in the case where something failed along the way:
            if stcl is not None:
                stcl.close()
def main():
    """
    Command-line entry point: classify tasks one at a time, with diagnostics.

    Parses the command line, configures logging and resolves the todo-file
    from the positional argument or the ``STARCLASS_INPUT`` environment
    variable. For each task returned by the TaskManager the features are
    loaded and printed, the light curve is plotted, the task is classified
    and the result, padded with timing metadata, is saved.

    A missing or non-existing input location is reported through
    ``parser.error``, which exits the program.
    """
    # Parse command line arguments:
    parser = argparse.ArgumentParser(description='Utility function for running stellar classifiers.')
    parser.add_argument('-d', '--debug', help='Print debug messages.', action='store_true')
    parser.add_argument('-q', '--quiet', help='Only report warnings and errors.', action='store_true')
    parser.add_argument('-o', '--overwrite', help='Overwrite existing results.', action='store_true')
    parser.add_argument('-c', '--classifier', help='Classifier to use.', default='rfgc', choices=starclass.classifier_list)
    parser.add_argument('-t', '--trainingset', help='Train classifier using this training-set.', default='keplerq9v3', choices=starclass.trainingset_list)
    parser.add_argument('--linfit', help='Enable linfit in training set.', action='store_true')
    parser.add_argument('-l', '--level', help='Classification level', default='L1', choices=('L1', 'L2'))
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    #parser.add_argument('--starid', type=int, help='TIC identifier of target.', nargs='?', default=None)
    # Lightcurve truncate override switch.
    # The default of None leaves the decision to the classifier:
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--truncate', dest='truncate', action='store_true', help='Force light curve truncation.')
    group.add_argument('--no-truncate', dest='truncate', action='store_false', help='Force no light curve truncation.')
    parser.set_defaults(truncate=None)
    parser.add_argument('input_folder', type=str, help='Input directory to run classification on.', nargs='?', default=None)
    args = parser.parse_args()

    # Set logging level:
    logging_level = logging.INFO
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG

    # Setup logging:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger = logging.getLogger(__name__)
    logger.addHandler(console)
    logger.setLevel(logging_level)
    logger_parent = logging.getLogger('starclass')
    logger_parent.addHandler(console)
    logger_parent.setLevel(logging_level)

    # Get the input location from the command line or the environment variable:
    input_folder = args.input_folder
    if input_folder is None:
        input_folder = os.environ.get('STARCLASS_INPUT')
    if input_folder is None:
        parser.error("No input folder specified")
    if not os.path.exists(input_folder):
        parser.error("INPUT_FOLDER does not exist")

    # The input can either be a directory containing "todo.sqlite" or the
    # path to a todo-file, in which case light curves are looked up relative
    # to the directory part of that path:
    if os.path.isdir(input_folder):
        todo_file = os.path.join(input_folder, 'todo.sqlite')
    else:
        todo_file = os.path.abspath(input_folder)
        input_folder = os.path.dirname(input_folder)

    # Choose which classifier to use
    # For now, there is only one...
    current_classifier = args.classifier

    # Initialize training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, linfit=args.linfit)

    # Running:
    # When simply running the classifier on new stars:
    stcl = None
    with starclass.TaskManager(todo_file, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
        try:
            while True:
                task = tm.get_task(classifier=current_classifier)
                if task is None:
                    break
                tm.start_task(task)

                # (Re)create the classifier if the task asks for a different one:
                if task['classifier'] != current_classifier or stcl is None:
                    current_classifier = task['classifier']
                    if stcl is not None:
                        stcl.close()
                        stcl = None
                    # Keep the class under its own name, so "stcl" is only
                    # ever None or an instance that can be closed:
                    classifier_class = starclass.get_classifier(current_classifier)
                    stcl = classifier_class(tset=tset, features_cache=None, truncate_lightcurves=args.truncate)

                # ----------------- This code would run on each worker ------------------------
                fname = os.path.join(input_folder, task['lightcurve'])
                features = stcl.load_star(task, fname)
                print(features)

                # Show diagnostics of the light curve in a fresh figure:
                lc = features['lightcurve']
                lc.show_properties()
                plt.close('all')
                lc.plot()

                # Time only the classification itself:
                res = task.copy()
                tic_predict = default_timer()
                res['starclass_results'] = stcl.classify(features)
                toc_predict = default_timer()
                # ----------------- This code would run on each worker ------------------------

                # Pad results with metadata and return to TaskManager to be saved:
                res.update({
                    'tset': tset.key,
                    'status': starclass.STATUS.OK,
                    'elaptime': toc_predict - tic_predict
                })
                tm.save_results(res)
        finally:
            # Make sure the last classifier in use is closed as well,
            # also in the case where something failed along the way:
            if stcl is not None:
                stcl.close()
def test_baseclassifier_load_star(PRIVATE_INPUT_DIR, linfit):
    """
    Check the features returned by BaseClassifier.load_star.

    The star is loaded twice using a features cache directory that starts
    out empty; before the second load the cache is expected to hold exactly
    one file. The returned features are checked for types, for values
    carried over from the task, for finite FliPer values, for consistency
    with the frequency table and for the properties of the light curve.
    """
    # Use the following training set as input:
    tsetclass = get_trainingset()
    tset = tsetclass(linfit=linfit)

    # Set a dummy features cache inside the private input dir:
    features_cache_name = 'features_cache'
    if linfit:
        features_cache_name += '_linfit'
    features_cache = os.path.join(PRIVATE_INPUT_DIR, features_cache_name)
    os.makedirs(features_cache, exist_ok=True)

    # The features cache should be empty to begin with:
    assert len(os.listdir(features_cache)) == 0

    transferred_keys = ('starid', 'tmag', 'variance', 'rms_hour', 'ptp')
    fliper_keys = ('Fp07', 'Fp7', 'Fp20', 'Fp50', 'FpWhite', 'Fphi', 'Fplo')

    with TaskManager(PRIVATE_INPUT_DIR) as tm:
        # Try loading twice - second time we should load from cache
        for attempt in range(2):
            with BaseClassifier(tset=tset, features_cache=features_cache) as cl:
                # Check that the second time there is something in the features cache:
                if attempt > 0:
                    assert os.listdir(features_cache) == ['features-17.pickle']

                task = tm.get_task(priority=17)
                print(task)

                feat = cl.load_star(task)
                print(feat)

                # Check the complex objects:
                assert isinstance(feat['lightcurve'], TessLightCurve)
                assert isinstance(feat['powerspectrum'], powerspectrum)
                assert isinstance(feat['frequencies'], Table)

                # Check "transferred" features:
                assert feat['priority'] == 17
                assert feat['priority'] == task['priority']
                for key in transferred_keys:
                    assert feat[key] == task[key]

                # Check FliPer:
                for key in fliper_keys:
                    assert np.isfinite(feat[key])

                # Check frequencies:
                # Values may be NaN, but should never be infinite, and they
                # should match the fundamental (harmonic 0) row in the table.
                freqtab = feat['frequencies']
                for num in np.unique(freqtab['num']):
                    freq = feat['freq%d' % num]
                    amp = feat['amp%d' % num]
                    phase = feat['phase%d' % num]
                    assert np.isfinite(freq) or np.isnan(freq), "Invalid frequency"
                    assert np.isfinite(amp) or np.isnan(amp), "Invalid amplitude"
                    assert np.isfinite(phase) or np.isnan(phase), "Invalid phase"

                    peak = freqtab[(freqtab['num'] == num) & (freqtab['harmonic'] == 0)]
                    np.testing.assert_allclose(freq, peak['frequency'])
                    np.testing.assert_allclose(amp, peak['amplitude'])
                    np.testing.assert_allclose(phase, peak['phase'])

                # Check details about lightkurve object:
                lc = feat['lightcurve']
                lc.show_properties()
                assert lc.targetid == feat['starid']
                assert lc.label == 'TIC %d' % feat['starid']
                assert lc.mission == 'TESS'
                assert lc.time_format == 'btjd'
                assert lc.camera == 1
                assert lc.ccd == 4
                assert lc.sector == 1

                # When running with linfit enabled, the features should contain
                # an extra set of coefficients from the detrending:
                if linfit:
                    assert 'detrend_coeff' in feat
                    assert len(feat['detrend_coeff']) == 2
                    assert np.all(np.isfinite(feat['detrend_coeff']))
                else:
                    assert 'detrend_coeff' not in feat
def test_linfit(PRIVATE_INPUT_DIR):
    """
    Check that linfit detrending recovers a known linear trend.

    The trend found in the original light curve is removed, a known linear
    trend is added, and the modified light curve is written to a text file.
    Loading that file again should give back the inserted coefficients, and
    the first two bins of the returned power spectrum should lie below those
    of the spectrum computed directly from the returned light curve.
    """
    fname = os.path.join(PRIVATE_INPUT_DIR, 'tess00029281992-s01-c1800-dr01-v04-tasoc-cbv_lc.fits.gz')

    # Use the following training set as input:
    tsetclass = get_trainingset()
    tset = tsetclass(linfit=True)

    with BaseClassifier(tset=tset) as cl:
        # The task is only a convenient way of loading the original lightcurve:
        task = {
            'priority': 1,
            'starid': 29281992,
            'tmag': None,
            'variance': None,
            'rms_hour': None,
            'ptp': None,
            'other_classifiers': None,
            'lightcurve': fname
        }
        feat = cl.load_star(task)
        lc = feat['lightcurve']
        p_rem = feat['detrend_coeff']

        # Remove any trend from the lightcurve, using the first
        # timestamp with finite time, flux and flux error as zero-point:
        finite = np.isfinite(lc.time) & np.isfinite(lc.flux) & np.isfinite(lc.flux_err)
        mintime = np.nanmin(lc.time[finite])
        lc -= np.polyval(p_rem, lc.time - mintime)

        # Insert a new known trend in the lightcurve:
        p_ins = [500, 1234]
        time_orig = lc.time
        lintrend_input = np.polyval(p_ins, lc.time - mintime)
        lc.flux += lintrend_input

        # Save the modified lightcurve to a file:
        fname_modified = fname.replace('.fits.gz', '.txt')
        with open(fname_modified, 'wt') as fid:
            fid.writelines(
                "{0:.12f} {1:.18e} {2:.18e}\n".format(t, flux, flux_err)
                for t, flux, flux_err in zip(lc.time, lc.flux, lc.flux_err)
            )

        # Now load the modified lightcurve:
        task['lightcurve'] = fname_modified
        feat = cl.load_star(task)
        lc = feat['lightcurve']
        psd2 = feat['powerspectrum']
        p = feat['detrend_coeff']
        print(p)

        lintrend_recovered = np.polyval(p, lc.time - mintime)
        psd = powerspectrum(lc)

        # Create debugging figure:
        fig, (ax1, ax2) = plt.subplots(2, figsize=(12,12))
        ax1.plot(lc.time, lc.flux, lw=0.5, label='Original')
        ax1.plot(time_orig, lintrend_input, lw=0.5, label='Input')
        ax1.plot(lc.time, lintrend_recovered, lw=0.5, label='Recovered')
        ax1.legend()
        freq_orig, power_orig = psd.standard[0], psd.standard[1]
        ax2.plot(freq_orig, power_orig, lw=0.5, label='Original')
        ax2.plot(psd2.standard[0], psd2.standard[1], lw=0.5, label='Detrended')
        ax2.set_yscale('log')
        ax2.legend()

        # Make sure we recover the trend that we put in:
        np.testing.assert_allclose(p, p_ins)

        # Compare the power spectra:
        # Same frequencies, but less power in the first two bins after detrending.
        np.testing.assert_allclose(psd.standard[0], psd2.standard[0])
        assert np.all(psd2.standard[1][0:2] < psd.standard[1][0:2])
def main():
    """
    Command-line entry point: run stellar classification in parallel using MPI.

    The process with rank 0 acts as master: it owns the TaskManager, hands
    out chunks of tasks to the workers and saves the results they return.
    All other ranks act as workers: they signal that they are ready, classify
    the tasks they receive and send the results back, until they are told to
    exit.

    Invalid argument combinations and a missing or non-existing input
    location are reported through ``parser.error``, which exits the program.
    A failure in the master aborts the whole MPI job, while a failure in a
    worker is logged and makes that worker exit.
    """
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Run stellar classification in parallel using MPI.')
    parser.add_argument('-d', '--debug', help='Print debug messages.', action='store_true')
    parser.add_argument('-q', '--quiet', help='Only report warnings and errors.', action='store_true')
    parser.add_argument('-o', '--overwrite', help='Overwrite existing results.', action='store_true')
    parser.add_argument('--chunks', type=int, default=10, help="Number of tasks sent to each worker at a time.")
    parser.add_argument(
        '--clear-cache',
        help='Clear existing features cache tables before running. Can only be used together with --overwrite.',
        action='store_true')
    # Option to select which classifier to run:
    parser.add_argument(
        '-c', '--classifier',
        default=None,
        choices=starclass.classifier_list,
        metavar='{CLASSIFIER}',
        help='Classifier to run. Default is to run all classifiers. Choices are '
        + ", ".join(starclass.classifier_list) + '.')
    # Option to select training set:
    parser.add_argument(
        '-t', '--trainingset',
        default='keplerq9v3',
        choices=starclass.trainingset_list,
        metavar='{TSET}',
        help='Train classifier using this training-set. Choices are '
        + ", ".join(starclass.trainingset_list) + '.')
    parser.add_argument('-l', '--level', help='Classification level', default='L1', choices=('L1', 'L2'))
    parser.add_argument('--linfit', help='Enable linfit in training set.', action='store_true')
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    # Lightcurve truncate override switch.
    # The default of None leaves the decision to the classifier:
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--truncate', dest='truncate', action='store_true', help='Force light curve truncation.')
    group.add_argument('--no-truncate', dest='truncate', action='store_false', help='Force no light curve truncation.')
    parser.set_defaults(truncate=None)
    # Data directory:
    parser.add_argument(
        '--datadir',
        type=str,
        default=None,
        help='Directory where trained models and diagnostics will be loaded. Default is to load from the programs data directory.')
    # Input folder:
    parser.add_argument(
        'input_folder',
        type=str,
        help='Input directory. This directory should contain a TODO-file and corresponding lightcurves.',
        nargs='?',
        default=None)
    args = parser.parse_args()

    # Cache tables (MOAT) should not be cleared unless results tables are also cleared.
    # Otherwise we could end up with non-complete MOAT tables.
    if args.clear_cache and not args.overwrite:
        parser.error("--clear-cache can not be used without --overwrite")

    # Make sure chunks are sensible:
    if args.chunks < 1:
        parser.error("--chunks should be an integer larger than 0.")

    # Get the input location from the command line or the environment variable:
    input_folder = args.input_folder
    if input_folder is None:
        input_folder = os.environ.get('STARCLASS_INPUT')
    if not input_folder:
        parser.error("Please specify an INPUT_FOLDER.")
    if not os.path.exists(input_folder):
        parser.error("INPUT_FOLDER does not exist")

    # The input can either be a directory containing "todo.sqlite"
    # or the path to a todo-file itself:
    if os.path.isdir(input_folder):
        todo_file = os.path.join(input_folder, 'todo.sqlite')
    else:
        todo_file = os.path.abspath(input_folder)

    # Initialize the training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, linfit=args.linfit)

    # Define MPI message tags
    tags = enum.IntEnum('tags', ('READY', 'DONE', 'EXIT', 'START'))

    # Initializations and preliminaries
    comm = MPI.COMM_WORLD  # get MPI communicator object
    size = comm.size  # total number of processes
    rank = comm.rank  # rank of this process
    status = MPI.Status()  # get MPI status object

    if rank == 0:
        try:
            with starclass.TaskManager(todo_file, cleanup=True, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
                # If we were asked to do so, start by clearing the existing MOAT tables:
                if args.overwrite and args.clear_cache:
                    tm.moat_clear()

                # Get list of tasks:
                #numtasks = tm.get_number_tasks()
                #tm.logger.info("%d tasks to be run", numtasks)

                # Number of available workers:
                num_workers = size - 1

                # Create a set of initial classifiers to initialize the workers as:
                # If nothing was specified run all classifiers, and automatically switch between them:
                if args.classifier is None:
                    change_classifier = True
                    # Hand out the available classifiers to the workers in turn:
                    initial_classifiers = list(itertools.islice(itertools.cycle(tm.all_classifiers), num_workers))
                else:
                    initial_classifiers = [args.classifier] * num_workers
                    change_classifier = False

                tm.logger.info("Initial classifiers: %s", initial_classifiers)

                # Start the master loop that will assign tasks
                # to the workers:
                closed_workers = 0
                tm.logger.info("Master starting with %d workers", num_workers)
                while closed_workers < num_workers:
                    # Ask workers for information:
                    data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                    source = status.Get_source()
                    tag = status.Get_tag()

                    if tag == tags.DONE:
                        # The worker is done with a task
                        tm.logger.debug("Got data from worker %d: %s", source, data)
                        tm.save_results(data)

                    if tag in (tags.DONE, tags.READY):
                        # Worker is ready, so send it a task
                        # If provided, try to find a task that is with the same classifier.
                        # A worker that has not returned anything yet gets its initial classifier:
                        if data is None:
                            cl = initial_classifiers[source - 1]
                        else:
                            cl = data[0].get('classifier')

                        tasks = tm.get_task(classifier=cl, change_classifier=change_classifier, chunk=args.chunks)
                        if tasks:
                            tm.start_task(tasks)
                            tm.logger.debug("Sending %d tasks to worker %d", len(tasks), source)
                            comm.send(tasks, dest=source, tag=tags.START)
                        else:
                            # Nothing more to do, so tell the worker to stop:
                            comm.send(None, dest=source, tag=tags.EXIT)

                    elif tag == tags.EXIT:
                        # The worker has exited
                        tm.logger.info("Worker %d exited.", source)
                        closed_workers += 1

                    else:  # pragma: no cover
                        # This should never happen, but just to
                        # make sure we don't run into an infinite loop:
                        raise RuntimeError(f"Master received an unknown tag: '{tag}'")

                tm.logger.info("Master finishing")

        except:  # noqa: E722, pragma: no cover
            # If something fails in the master, bring down the whole MPI job.
            # The bare except is deliberate, so nothing leaves workers hanging:
            print(traceback.format_exc().strip())
            comm.Abort(1)

    else:
        # Worker processes execute code below
        # Configure logging within starclass:
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        console = logging.StreamHandler()
        console.setFormatter(formatter)
        logger = logging.getLogger('starclass')
        logger.addHandler(console)
        logger.setLevel(logging.WARNING)

        # The classifier is created once the first tasks arrive:
        current_classifier = None
        stcl = None

        try:
            # Send signal that we are ready for task:
            comm.send(None, dest=0, tag=tags.READY)

            while True:
                # Receive a task from the master:
                tic_wait = default_timer()
                tasks = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                tag = status.Get_tag()
                toc_wait = default_timer()

                if tag == tags.START:
                    # Make sure we can loop through tasks,
                    # even in the case we have only gotten one:
                    results = []
                    if isinstance(tasks, dict):
                        tasks = [tasks]

                    # (Re)create the classifier if the tasks ask for a different one:
                    if tasks[0]['classifier'] != current_classifier or stcl is None:
                        current_classifier = tasks[0]['classifier']
                        if stcl is not None:
                            stcl.close()
                            stcl = None
                        # Keep the class under its own name, so "stcl" is only
                        # ever None or an instance that can be closed:
                        classifier_class = starclass.get_classifier(current_classifier)
                        stcl = classifier_class(
                            tset=tset,
                            features_cache=None,
                            truncate_lightcurves=args.truncate,
                            data_dir=args.datadir)

                    # Loop through the tasks given to us:
                    for task in tasks:
                        result = stcl.classify(task)

                        # Pad results with metadata and return to TaskManager to be saved:
                        result['worker_wait_time'] = toc_wait - tic_wait
                        results.append(result)

                    # Send the result back to the master:
                    comm.send(results, dest=0, tag=tags.DONE)

                    # Attempt some cleanup:
                    # TODO: Is this even needed?
                    del task, result

                elif tag == tags.EXIT:
                    # We were told to EXIT, so lets do that
                    break

                else:  # pragma: no cover
                    # This should never happen, but just to
                    # make sure we don't run into an infinite loop:
                    raise RuntimeError(f"Worker received an unknown tag: '{tag}'")

        except:  # noqa: E722, pragma: no cover
            logger.exception("Something failed in worker")

        finally:
            # Close the last classifier in use, but never let a failure in
            # doing so prevent the master from being told that we are exiting:
            try:
                if stcl is not None:
                    stcl.close()
            except Exception:  # pragma: no cover
                logger.exception("Failed to close classifier in worker")
            finally:
                comm.send(None, dest=0, tag=tags.EXIT)
def main():
    """
    Command-line entry point for training a stellar classifier.

    Parses the command line, configures logging and initializes the selected
    training set. When the selected classifier is 'meta', every classifier
    listed by the TaskManager is first trained and tested on five
    cross-validation folds and then on the full training set. Finally the
    selected classifier itself is trained on the full training set and
    tested, with results saved through the TaskManager.

    An invalid test-fraction is reported through ``parser.error``, which
    exits the program.
    """
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Utility function for training stellar classifiers.')
    parser.add_argument('-d', '--debug', action='store_true', help='Print debug messages.')
    parser.add_argument('-q', '--quiet', action='store_true', help='Only report warnings and errors.')
    parser.add_argument('-o', '--overwrite', action='store_true', help='Overwrite existing results.')
    parser.add_argument('-c', '--classifier', default='meta', choices=starclass.classifier_list,
        help='Classifier to train.')
    parser.add_argument('-l', '--level', default='L1', choices=('L1', 'L2'),
        help='Classification level')
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    parser.add_argument('-t', '--trainingset', default='keplerq9v3', choices=starclass.trainingset_list,
        help='Train classifier using this training-set.')
    parser.add_argument('--linfit', action='store_true', help='Enable linfit in training set.')
    parser.add_argument('-tf', '--testfraction', type=float, default=0.0,
        help='Holdout/test-set fraction')
    args = parser.parse_args()

    # Check args
    if args.testfraction < 0 or args.testfraction >= 1:
        parser.error('Testfraction must be between 0 and 1')

    # Set logging level, where --quiet takes precedence over --debug:
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG
    else:
        logging_level = logging.INFO

    # Setup logging, with one console handler shared by
    # the logger of this module and the starclass logger:
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger = logging.getLogger(__name__)
    for log in (logger, logging.getLogger('starclass')):
        log.addHandler(console)
        log.setLevel(logging_level)

    # Choose which classifier to use
    current_classifier = args.classifier

    # Pick the training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, tf=args.testfraction, linfit=args.linfit)
    cache = tset.features_cache

    # The Meta-classifier requires us to first train all of the other classifiers
    # using cross-validation
    if current_classifier == 'meta':
        # Loop through all the other classifiers and initialize them:
        # TODO: Run in parallel?
        # TODO: Check if results are already present
        with starclass.TaskManager(tset.todo_file, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
            for cla_key in tm.all_classifiers:
                cla = starclass.get_classifier(cla_key)

                # Split the tset object into cross-validation folds.
                # These are objects with exactly the same properties as the original one,
                # except that they will run through different subsets of the training and test sets:
                for tset_fold in tset.folds(n_splits=5, tf=0.2):
                    # Each fold gets its own data directory:
                    data_dir = tset.key + '/meta_fold{0:02d}'.format(tset_fold.fold)

                    with cla(tset=tset, features_cache=cache, data_dir=data_dir) as fold_stcl:
                        logger.info(
                            'Training %s on Fold %d/%d...',
                            fold_stcl.classifier_key,
                            tset_fold.fold,
                            tset_fold.crossval_folds)
                        fold_stcl.train(tset_fold)
                        logger.info("Classifying test-set...")
                        fold_stcl.test(tset_fold, save=tm.save_results)

                # Now train all classifiers on the full training-set (minus the holdout-set),
                # and test on the holdout set:
                with cla(tset=tset, features_cache=cache) as full_stcl:
                    logger.info('Training %s on full training-set...', full_stcl.classifier_key)
                    full_stcl.train(tset)
                    logger.info("Classifying test-set using %s...", full_stcl.classifier_key)
                    full_stcl.test(tset, save=tm.save_results)

    # Initialize the classifier:
    classifier = starclass.get_classifier(current_classifier)
    # Results are never overwritten at this point:
    with starclass.TaskManager(tset.todo_file, overwrite=False, classes=tset.StellarClasses) as tm:
        with classifier(tset=tset, features_cache=cache) as stcl:
            # Run the training of the classifier:
            logger.info("Training %s on full training-set...", current_classifier)
            stcl.train(tset)
            logger.info("Training done...")
            logger.info("Classifying test-set using %s...", current_classifier)
            stcl.test(tset, save=tm.save_results)
with starclass.MetaClassifier() as meta: if not meta.classifier.trained: raise Exception("Not trained") feature_names = [ '{0:s}_{1:s}'.format(classifier, stcl.name) for classifier, stcl in meta.features_used ] class_names = np.unique([ '{0:s}'.format(stcl.name) for classifier, stcl in meta.features_used ]) tsetclass = starclass.get_trainingset('keplerq9v2') tset = tsetclass() fitlabels = tset.labels() # Create table of features, just like it is done in the classifier: features = meta.build_features_table(tset.features(), total=len(tset)) X_train, X_test, y_train, y_test = train_test_split(features, fitlabels, test_size=0.1, random_state=42) explainer = shap.TreeExplainer(meta.classifier) shap_values = explainer.shap_values(X_test) fig = shap.summary_plot(shap_values,
def main():
    """
    Classify the stars of a TODO-file in parallel using MPI.

    Command-line arguments are parsed with argparse. The input folder is taken
    from the positional ``input_folder`` argument, falling back to the
    ``STARCLASS_INPUT`` environment variable. If the given path is a directory,
    the TODO-file is ``todo.sqlite`` inside it; otherwise the path itself is
    used as the TODO-file and its parent directory as the input folder.

    The MPI process with rank 0 acts as master: it opens the TaskManager,
    hands out tasks to the workers and saves the results they send back.
    All other ranks are workers: they receive a task, load the star, run the
    classifier named in the task and send the result back to the master.

    The messages exchanged are tagged with:

    * ``READY``: worker is idle and asks for its first task.
    * ``START``: master sends a task to a worker.
    * ``DONE``: worker returns the result of a task (and asks for the next).
    * ``EXIT``: master tells a worker to stop / worker reports it has stopped.
    """
    # Parse command line arguments:
    parser = argparse.ArgumentParser(description='Run TESS Corrections in parallel using MPI.')
    parser.add_argument('-d', '--debug', help='Print debug messages.', action='store_true')
    parser.add_argument('-q', '--quiet', help='Only report warnings and errors.', action='store_true')
    parser.add_argument('-o', '--overwrite', help='Overwrite existing results.', action='store_true')
    parser.add_argument('-c', '--classifier', help='Classifier to use.', default=None, choices=starclass.classifier_list)
    parser.add_argument('-t', '--trainingset', help='Train classifier using this training-set.', default='keplerq9v3', choices=starclass.trainingset_list)
    parser.add_argument('--linfit', help='Enable linfit in training set.', action='store_true')
    parser.add_argument('-l', '--level', help='Classification level', default='L1', choices=('L1', 'L2'))
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    # Lightcurve truncate override switch.
    # Left at None when neither flag is given, so the decision is passed on as "not specified":
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--truncate', dest='truncate', action='store_true', help='Force light curve truncation.')
    group.add_argument('--no-truncate', dest='truncate', action='store_false', help='Force no light curve truncation.')
    parser.set_defaults(truncate=None)
    # Input folder:
    parser.add_argument('input_folder', type=str, help='Input directory. This directory should contain a TODO-file and corresponding lightcurves.', nargs='?', default=None)
    args = parser.parse_args()

    # Get input folder from the command line, falling back to the environment variable:
    input_folder = args.input_folder
    if input_folder is None:
        input_folder = os.environ.get('STARCLASS_INPUT')
    if not input_folder:
        parser.error("Please specify an INPUT_FOLDER.")
    if not os.path.exists(input_folder):
        parser.error("INPUT_FOLDER does not exist")
    if os.path.isdir(input_folder):
        todo_file = os.path.join(input_folder, 'todo.sqlite')
    else:
        # A file was given: use it as the TODO-file and its directory as input folder.
        todo_file = os.path.abspath(input_folder)
        input_folder = os.path.dirname(input_folder)

    # Initialize the training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, linfit=args.linfit)

    # Define MPI message tags
    tags = enum.IntEnum('tags', ('READY', 'DONE', 'EXIT', 'START'))

    # Initializations and preliminaries
    comm = MPI.COMM_WORLD # get MPI communicator object
    size = comm.size # total number of processes
    rank = comm.rank # rank of this process
    status = MPI.Status() # get MPI status object

    if rank == 0:
        # Master process: distributes tasks and collects results.
        try:
            with starclass.TaskManager(todo_file, cleanup=True, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
                # Get list of tasks:
                #numtasks = tm.get_number_tasks()
                #tm.logger.info("%d tasks to be run", numtasks)

                # Number of available workers:
                num_workers = size - 1

                # Create a set of initial classifiers to initialize the workers as,
                # by cycling through the available classifiers until every worker has one:
                initial_classifiers = list(itertools.islice(itertools.cycle(tm.all_classifiers), num_workers))
                tm.logger.info("Initial classifiers: %s", initial_classifiers)

                # Start the master loop that will assign tasks
                # to the workers:
                closed_workers = 0
                tm.logger.info("Master starting with %d workers", num_workers)
                while closed_workers < num_workers:
                    # Ask workers for information:
                    data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                    source = status.Get_source()
                    tag = status.Get_tag()

                    if tag == tags.DONE:
                        # The worker is done with a task
                        tm.logger.info("Got data from worker %d: %s", source, data)
                        tm.save_results(data)

                    if tag in (tags.DONE, tags.READY):
                        # Worker is ready, so send it a task
                        # If provided, try to find a task that is with the same classifier
                        # (a READY message carries no data, so use the initial assignment):
                        cl = initial_classifiers[source-1] if data is None else data.get('classifier')
                        task = tm.get_task(classifier=cl, change_classifier=True)
                        if task:
                            tm.start_task(task)
                            comm.send(task, dest=source, tag=tags.START)
                            tm.logger.info("Sending task %d to worker %d", task['priority'], source)
                        else:
                            # No more tasks: tell the worker to shut down.
                            comm.send(None, dest=source, tag=tags.EXIT)

                    elif tag == tags.EXIT:
                        # The worker has exited
                        tm.logger.info("Worker %d exited.", source)
                        closed_workers += 1

                    else: # pragma: no cover
                        # This should never happen, but just to
                        # make sure we don't run into an infinite loop:
                        raise Exception("Master received an unknown tag: '{0}'".format(tag))

                tm.logger.info("Master finishing")

        except: # noqa: E722, pragma: no cover
            # If something fails in the master, abort all MPI processes,
            # since the workers would otherwise wait forever for new tasks:
            print(traceback.format_exc().strip())
            comm.Abort(1)

    else:
        # Worker processes execute code below
        # Configure logging within starclass:
        # NOTE(review): --debug/--quiet are parsed above but not applied here;
        # the worker logging level is fixed at WARNING.
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        console = logging.StreamHandler()
        console.setFormatter(formatter)
        logger = logging.getLogger('starclass')
        logger.addHandler(console)
        logger.setLevel(logging.WARNING)

        # Key and *instance* of the classifier currently held open by this worker.
        # "stcl" is only ever None or a fully constructed classifier instance:
        current_classifier = None
        stcl = None

        try:
            # Send signal that we are ready for task:
            comm.send(None, dest=0, tag=tags.READY)

            while True:
                # Receive a task from the master:
                tic_wait = default_timer()
                task = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                tag = status.Get_tag()
                toc_wait = default_timer()

                if tag == tags.START:
                    result = task.copy()

                    # Run the classification prediction:
                    try:
                        if stcl is None or task['classifier'] != current_classifier:
                            # Drop our reference before closing and re-creating, so that a
                            # failure in close() or in the constructor leaves "stcl" as None
                            # and the next task simply retries the initialization:
                            previous, stcl = stcl, None
                            if previous is not None:
                                previous.close()
                            current_classifier = task['classifier']
                            classifier_class = starclass.get_classifier(current_classifier)
                            stcl = classifier_class(tset=tset, features_cache=None, truncate_lightcurves=args.truncate)

                        fname = os.path.join(input_folder, task['lightcurve'])
                        features = stcl.load_star(task, fname)

                        # Only the prediction itself is timed:
                        tic_predict = default_timer()
                        result['starclass_results'] = stcl.classify(features)
                        toc_predict = default_timer()

                        result['elaptime'] = toc_predict - tic_predict
                        result['status'] = starclass.STATUS.OK

                    except: # noqa: E722, pragma: no cover
                        # Something went wrong.
                        # Report the error back to the master instead of crashing the worker:
                        error_msg = traceback.format_exc().strip()
                        result.update({
                            'status': starclass.STATUS.ERROR,
                            'details': {'errors': [error_msg]},
                        })

                    # Pad results with metadata and return to TaskManager to be saved:
                    result.update({
                        'tset': tset.key,
                        'worker_wait_time': toc_wait - tic_wait
                    })

                    # Send the result back to the master:
                    comm.send(result, dest=0, tag=tags.DONE)

                    # Attempt some cleanup:
                    # TODO: Is this even needed?
                    del task, result

                elif tag == tags.EXIT:
                    # We were told to EXIT, so lets do that
                    break

                else: # pragma: no cover
                    # This should never happen, but just to
                    # make sure we don't run into an infinite loop:
                    raise Exception("Worker received an unknown tag: '{0}'".format(tag))

        except: # noqa: E722, pragma: no cover
            logger.exception("Something failed in worker")

        finally:
            # Release the classifier that is still open, but make sure that the
            # master is always told that this worker has exited, otherwise the
            # master would wait forever:
            try:
                if stcl is not None:
                    stcl.close()
            except Exception: # pragma: no cover
                logger.exception("Could not close classifier in worker")
            finally:
                comm.send(None, dest=0, tag=tags.EXIT)
def main():
    """
    Command-line utility for training stellar classifiers.

    Parses the command-line arguments, configures logging to the console
    (and optionally to a log-file), loads the requested training set and
    trains and tests the requested classifier on it.

    When the selected classifier is ``meta``, all classifiers listed by the
    TaskManager are first trained and tested, both on cross-validation folds
    of the training set and on the full training set, before the selected
    classifier itself is trained and tested.
    """
    # Human-readable lists of the valid choices, used in the help texts below:
    classifier_choices = ", ".join(starclass.classifier_list)
    trainingset_choices = ", ".join(starclass.trainingset_list)

    # Define the command-line interface:
    parser = argparse.ArgumentParser(
        description='Utility function for training stellar classifiers.')
    parser.add_argument('-d', '--debug', action='store_true',
        help='Print debug messages.')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Only report warnings and errors.')
    parser.add_argument('-o', '--overwrite', action='store_true',
        help='Overwrite existing results.')
    parser.add_argument('--log', type=str, default=None, metavar='{LOGFILE}',
        help="Log to file.")
    parser.add_argument('--log-level', type=str, default=None,
        choices=['debug', 'info', 'warning', 'error'],
        help="Logging level to use in file-logging. If not set, use the same level as the console.")
    parser.add_argument('--clear-cache', action='store_true',
        help='Clear existing features cache before running.')
    # Which classifier to train:
    parser.add_argument('-c', '--classifier', default='meta',
        choices=starclass.classifier_list, metavar='{CLASSIFIER}',
        help='Classifier to train. Choises are ' + classifier_choices + '.')
    # Which training set to train on:
    parser.add_argument('-t', '--trainingset', default='keplerq9v3',
        choices=starclass.trainingset_list, metavar='{TSET}',
        help='Train classifier using this training-set. Choises are ' + trainingset_choices + '.')
    parser.add_argument('-l', '--level', default='L1', choices=('L1', 'L2'),
        help='Classification level')
    parser.add_argument('--linfit', action='store_true',
        help='Enable linfit in training set.')
    # TODO: A "--datalevel" option ('raw' or 'corr') is currently disabled.
    #       Come up with better name than "datalevel"?
    parser.add_argument('-tf', '--testfraction', type=float, default=0.0,
        help='Holdout/test-set fraction')
    parser.add_argument('--output', type=str, default=None,
        help='Directory where trained models and diagnostics will be saved. Default is to save in the programs data directory.')
    args = parser.parse_args()

    # Validate the arguments:
    if args.testfraction >= 1 or args.testfraction < 0:
        parser.error('Testfraction must be between 0 and 1')

    # Console logging level, where --quiet takes precedence over --debug:
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG
    else:
        logging_level = logging.INFO

    # Both the logger of this script and the logger of the starclass package
    # are given the same handlers:
    logger = logging.getLogger(__name__)
    logger_parent = logging.getLogger('starclass')
    both_loggers = (logger, logger_parent)

    # Console handler:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging_level)
    for lg in both_loggers:
        lg.addHandler(console)

    # Optional file handler, if the user asked for a log-file:
    if args.log is not None:
        if args.log_level is None:
            file_level = logging_level
        else:
            file_level = args.log_level.upper()
        filehandler = logging.FileHandler(args.log, mode='w', encoding='utf8')
        filehandler.setFormatter(formatter)
        filehandler.setLevel(file_level)
        logging_level = min(logging_level, filehandler.level)
        for lg in both_loggers:
            lg.addHandler(filehandler)

    # The loggers themselves must let through the lowest level that
    # any of their handlers is set to:
    for lg in both_loggers:
        lg.setLevel(logging_level)

    # Load the selected training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, tf=args.testfraction, linfit=args.linfit)

    # Start from an empty features cache if requested:
    if args.clear_cache:
        tset.clear_cache()

    # Before the Meta-classifier can be trained, all the other classifiers
    # have to be trained using cross-validation:
    if args.classifier == 'meta':
        # TODO: Run in parallel?
        # TODO: Check if results are already present
        with starclass.TaskManager(tset.todo_file, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
            # Loop through all classifiers, excluding the MetaClassifier:
            for cla_key in tm.all_classifiers:
                cla = starclass.get_classifier(cla_key)

                # Train and test on each cross-validation fold. The folds are
                # provided by the training set and are passed on in place of it:
                for tset_fold in tset.folds(n_splits=5, tf=0.2):
                    with cla(tset=tset_fold, features_cache=tset.features_cache, data_dir=args.output) as stcl:
                        logger.info('Training %s on Fold %d/%d...', stcl.classifier_key, tset_fold.fold, tset_fold.crossval_folds)
                        stcl.train(tset_fold)
                        logger.info("Training done.")
                        logger.info("Classifying test-set using %s...", stcl.classifier_key)
                        stcl.test(tset_fold, save=tm.save_results)

                # Then train on the full training-set (minus the holdout-set),
                # and test on the holdout-set:
                with cla(tset=tset, features_cache=tset.features_cache, data_dir=args.output) as stcl:
                    logger.info('Training %s on full training-set...', stcl.classifier_key)
                    stcl.train(tset)
                    logger.info("Training done.")
                    logger.info("Classifying holdout-set using %s...", stcl.classifier_key)
                    stcl.test(tset, save=tm.save_results, feature_importance=True)

    # Finally train and test the selected classifier itself.
    # Existing results are not overwritten here:
    classifier = starclass.get_classifier(args.classifier)
    with starclass.TaskManager(tset.todo_file, overwrite=False, classes=tset.StellarClasses) as tm, \
        classifier(tset=tset, features_cache=tset.features_cache, data_dir=args.output) as stcl:
        logger.info("Training %s on full training-set...", args.classifier)
        stcl.train(tset)
        logger.info("Training done.")
        logger.info("Classifying holdout-set using %s...", args.classifier)
        stcl.test(tset, save=tm.save_results, feature_importance=True)