Example #1
def test_trainingset(tsetkey, linfit):

	# Get training set class using conv. function:
	tsetclass = get_trainingset(tsetkey)

	for testfraction in (0, 0.2):
		tset = tsetclass(tf=testfraction, linfit=linfit)
		print(tset)

		if linfit:
			assert tset.key == tsetkey + '-linfit'
		else:
			assert tset.key == tsetkey
		assert tset.level == 'L1'
		assert tset.datalevel == 'corr'
		assert tset.testfraction == testfraction
		assert len(tset) > 0

	# Invalid level should give ValueError:
	with pytest.raises(ValueError):
		tsetclass(level='nonsense')

	# Test-fractions which should all result in a ValueError:
	with pytest.raises(ValueError):
		tset = tsetclass(tf=1.2)
	with pytest.raises(ValueError):
		tset = tsetclass(tf=1.0)
	with pytest.raises(ValueError):
		tset = tsetclass(tf=-0.2)

	# Calling with invalid datalevel should throw an error as well:
	with pytest.raises(ValueError):
		tset = tsetclass(datalevel='nonsense')

	tset = tsetclass(tf=0, linfit=linfit)
	print(tset)
	lbls = tset.labels()
	lbls_test = tset.labels_test()
	print(tset.nobjects)
	print(len(lbls), len(lbls_test))

	assert len(lbls) == tset.nobjects
	assert len(lbls_test) == 0

	tset = tsetclass(tf=0.2, linfit=linfit)
	print(tset)
	lbls = tset.labels()
	lbls_test = tset.labels_test()
	print(tset.nobjects)
	print(len(lbls), len(lbls_test))

	assert len(lbls) + len(lbls_test) == tset.nobjects
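
These tests take tsetkey and linfit as arguments, so they are presumably driven by pytest fixtures or parametrization defined elsewhere. A minimal sketch of how such fixtures could be wired up, assuming starclass.trainingset_list is the source of the keys (the fixture definitions here are an assumption, not taken from the project):

import pytest
import starclass

# Hypothetical fixtures: run each test once per training set,
# both with and without linfit detrending enabled.
@pytest.fixture(params=starclass.trainingset_list)
def tsetkey(request):
	return request.param

@pytest.fixture(params=[False, True])
def linfit(request):
	return request.param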
Example #2
def test_trainingset_generate_todolist(monkeypatch, tsetkey, linfit):
	# Get training set class using conv. function:
	tsetclass = get_trainingset(tsetkey)
	tset = tsetclass(linfit=linfit)
	input_folder = tset.input_folder
	print("Training Set input folder: %s" % input_folder)

	with tempfile.TemporaryDirectory(prefix='pytest-private-tsets-') as tmpdir:
		# Create a copy of the root files of the training set (ignoring the actual data)
		# in the temp. directory:
		tsetdir = os.path.join(tmpdir, os.path.basename(input_folder))
		print("New dummy input folder: %s" % tsetdir)
		os.makedirs(tsetdir)
		for f in os.listdir(input_folder):
			fpath = os.path.join(input_folder, f)
			if os.path.isfile(fpath) and not f.endswith(('.sqlite', '.sqlite-journal')):
				shutil.copy(fpath, tsetdir)
			elif os.path.isdir(fpath) and not f.startswith('features_cache'):
				# NOTE: We are cheating, and creating empty files with
				# the correct names, since the files are not actually
				# needed for building the todolist; they only need to exist.
				os.makedirs(os.path.join(tsetdir, f))
				for subf in os.listdir(fpath):
					open(os.path.join(tsetdir, f, subf), 'w').close()

		# Create a fake features_cache directory, which just contains one dummy file:
		new_featdir = os.path.join(tsetdir, os.path.basename(tset.features_cache))
		os.makedirs(new_featdir)
		open(os.path.join(new_featdir, 'dummy.txt'), 'w').close()

		# Change the environment variable to the temp. dir:
		monkeypatch.setenv("STARCLASS_TSETS", tmpdir)
		print(os.environ['STARCLASS_TSETS'])

		# When we now initialize the training set, it should run generate_todo automatically:
		tset = tsetclass(linfit=linfit)

		# Check that the todo-file was indeed created:
		assert tset.input_folder == tsetdir
		assert os.path.isfile(os.path.join(tsetdir, tset._todo_name + '.sqlite'))

		# Make sure that the dummy features_cache dir was created and picked up:
		assert os.path.isdir(tset.features_cache)
		assert os.listdir(tset.features_cache) == ['dummy.txt']

		# Let's clear the features cache:
		tset.clear_cache()

		# Now the features_cache directory should be gone:
		assert not os.path.exists(tset.features_cache), "features_cache still exists"
Example #3
def test_trainingset_features(tsetkey, linfit):

	# Get training set class using conv. function:
	tsetclass = get_trainingset(tsetkey)
	tset = tsetclass(tf=0.2, linfit=linfit)

	features = tset.features()
	assert isinstance(features, types.GeneratorType)

	features_test = tset.features_test()
	assert isinstance(features_test, types.GeneratorType)

	for tries in range(2):
		feat = next(features)
		print(feat)
		assert isinstance(feat, dict)
		assert 'lightcurve' in feat
		assert 'powerspectrum' in feat
		assert 'frequencies' in feat
Example #4
def test_trainingset_folds(tsetkey, linfit):

	# Get training set class using conv. function:
	tsetclass = get_trainingset(tsetkey)
	tset = tsetclass(linfit=linfit)

	for k, fold in enumerate(tset.folds(n_splits=5, tf=0.2)):
		assert isinstance(fold, tsetclass)
		assert fold.key == tset.key
		assert fold.crossval_folds == 5
		assert fold.fold == k + 1
		assert fold.testfraction == 0.2
		assert fold.level == tset.level
		assert fold.random_seed == tset.random_seed
		assert len(fold.train_idx) > 0
		assert len(fold.test_idx) > 0
		assert len(fold.train_idx) > len(fold.test_idx)
		assert len(fold.train_idx) < len(tset.train_idx)

	assert k == 4, "Not the correct number of folds"
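
The folds() method yields training-set objects that can be passed straight into a classifier, which is exactly how the training scripts further down use them. A minimal sketch of that pattern, assuming the 'keplerq9v3' training set and the 'rfgc' classifier key seen elsewhere in these examples:

import starclass

tsetclass = starclass.get_trainingset('keplerq9v3')
tset = tsetclass(tf=0.2)
cla = starclass.get_classifier('rfgc')

with starclass.TaskManager(tset.todo_file, classes=tset.StellarClasses) as tm:
	# Train and evaluate the classifier on each cross-validation fold:
	for tset_fold in tset.folds(n_splits=5, tf=0.2):
		with cla(tset=tset_fold, features_cache=tset.features_cache) as stcl:
			stcl.train(tset_fold)
			stcl.test(tset_fold, save=tm.save_results)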
Example #5
def main():
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Command-line interface for running stellar classifiers.')
    parser.add_argument('-d',
                        '--debug',
                        help='Print debug messages.',
                        action='store_true')
    parser.add_argument('-q',
                        '--quiet',
                        help='Only report warnings and errors.',
                        action='store_true')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Overwrite existing results.',
                        action='store_true')
    parser.add_argument(
        '--clear-cache',
        help=
        'Clear existing features cache tables before running. Can only be used together with --overwrite.',
        action='store_true')
    # Option to select which classifier to run:
    parser.add_argument(
        '-c',
        '--classifier',
        default=None,
        choices=starclass.classifier_list,
        metavar='{CLASSIFIER}',
        help=
        'Classifier to run. Default is to run all classifiers. Choices are ' +
        ", ".join(starclass.classifier_list) + '.')
    # Option to select training set:
    parser.add_argument(
        '-t',
        '--trainingset',
        default='keplerq9v3',
        choices=starclass.trainingset_list,
        metavar='{TSET}',
        help='Train classifier using this training-set. Choices are ' +
        ", ".join(starclass.trainingset_list) + '.')

    parser.add_argument('-l',
                        '--level',
                        help='Classification level.',
                        default='L1',
                        choices=('L1', 'L2'))
    parser.add_argument('--linfit',
                        help='Enable linfit in training set.',
                        action='store_true')
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    #parser.add_argument('--starid', type=int, help='TIC identifier of target.', nargs='?', default=None)
    # Lightcurve truncate override switch:
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--truncate',
                       dest='truncate',
                       action='store_true',
                       help='Force light curve truncation.')
    group.add_argument('--no-truncate',
                       dest='truncate',
                       action='store_false',
                       help='Force no light curve truncation.')
    parser.set_defaults(truncate=None)
    # Data directory:
    parser.add_argument(
        '--datadir',
        type=str,
        default=None,
        help=
        'Directory where trained models and diagnostics will be loaded. Default is to load from the program\'s data directory.'
    )
    # Input todo-file/directory:
    parser.add_argument('input_folder',
                        type=str,
                        nargs='?',
                        default=None,
                        help='Input directory to run classification on.')
    args = parser.parse_args()

    # Cache tables (MOAT) should not be cleared unless results tables are also cleared.
    # Otherwise we could end up with non-complete MOAT tables.
    if args.clear_cache and not args.overwrite:
        parser.error("--clear-cache can not be used without --overwrite")

    # Set logging level:
    logging_level = logging.INFO
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG

    # Setup logging:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger = logging.getLogger(__name__)
    logger.addHandler(console)
    logger.setLevel(logging_level)
    logger_parent = logging.getLogger('starclass')
    logger_parent.addHandler(console)
    logger_parent.setLevel(logging_level)

    # Get input and output folder from environment variables:
    input_folder = args.input_folder
    if input_folder is None:
        input_folder = os.environ.get('STARCLASS_INPUT')
    if input_folder is None:
        parser.error("No input folder specified")
    if not os.path.exists(input_folder):
        parser.error("INPUT_FOLDER does not exist")
    if os.path.isdir(input_folder):
        todo_file = os.path.join(input_folder, 'todo.sqlite')
    else:
        todo_file = os.path.abspath(input_folder)
        input_folder = os.path.dirname(input_folder)

    # Choose which classifier to use:
    # If nothing was specified, run all classifiers, and automatically switch between them:
    if args.classifier is None:
        current_classifier = starclass.classifier_list[0]
        change_classifier = True
    else:
        current_classifier = args.classifier
        change_classifier = False

    # Initialize training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, linfit=args.linfit)

    # Running:
    # When simply running the classifier on new stars:
    stcl = None
    with starclass.TaskManager(todo_file,
                               overwrite=args.overwrite,
                               classes=tset.StellarClasses) as tm:
        # If we were asked to do so, start by clearing the existing MOAT tables:
        if args.overwrite and args.clear_cache:
            tm.moat_clear()

        while True:
            tasks = tm.get_task(classifier=current_classifier,
                                change_classifier=change_classifier)
            if tasks is None:
                break
            tm.start_task(tasks)

            # ----------------- This code would run on each worker ------------------------

            # Make sure we can loop through tasks,
            # even in the case we have only gotten one:
            results = []
            if isinstance(tasks, dict):
                tasks = [tasks]

            if tasks[0]['classifier'] != current_classifier or stcl is None:
                current_classifier = tasks[0]['classifier']
                if stcl:
                    stcl.close()
                stcl = starclass.get_classifier(current_classifier)
                stcl = stcl(tset=tset,
                            features_cache=None,
                            truncate_lightcurves=args.truncate,
                            data_dir=args.datadir)

            for task in tasks:
                res = stcl.classify(task)
                results.append(res)

            # ----------------- This code would run on each worker ------------------------

            # Return to TaskManager to be saved:
            tm.save_results(results)
Example #6
def main():
	# Parse command line arguments:
	parser = argparse.ArgumentParser(description='Utility function for running stellar classifiers.')
	parser.add_argument('-d', '--debug', help='Print debug messages.', action='store_true')
	parser.add_argument('-q', '--quiet', help='Only report warnings and errors.', action='store_true')
	parser.add_argument('-o', '--overwrite', help='Overwrite existing results.', action='store_true')
	parser.add_argument('-c', '--classifier', help='Classifier to use.', default='rfgc', choices=starclass.classifier_list)
	parser.add_argument('-t', '--trainingset', help='Train classifier using this training-set.', default='keplerq9v3', choices=starclass.trainingset_list)
	parser.add_argument('--linfit', help='Enable linfit in training set.', action='store_true')
	parser.add_argument('-l', '--level', help='Classification level', default='L1', choices=('L1', 'L2'))
	#parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
	#parser.add_argument('--starid', type=int, help='TIC identifier of target.', nargs='?', default=None)
	# Lightcurve truncate override switch:
	group = parser.add_mutually_exclusive_group(required=False)
	group.add_argument('--truncate', dest='truncate', action='store_true', help='Force light curve truncation.')
	group.add_argument('--no-truncate', dest='truncate', action='store_false', help='Force no light curve truncation.')
	parser.set_defaults(truncate=None)
	parser.add_argument('input_folder', type=str, help='Input directory to run classification on.', nargs='?', default=None)
	args = parser.parse_args()

	# Set logging level:
	logging_level = logging.INFO
	if args.quiet:
		logging_level = logging.WARNING
	elif args.debug:
		logging_level = logging.DEBUG

	# Setup logging:
	formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
	console = logging.StreamHandler()
	console.setFormatter(formatter)
	logger = logging.getLogger(__name__)
	logger.addHandler(console)
	logger.setLevel(logging_level)
	logger_parent = logging.getLogger('starclass')
	logger_parent.addHandler(console)
	logger_parent.setLevel(logging_level)

	# Get input and output folder from environment variables:
	input_folder = args.input_folder
	if input_folder is None:
		input_folder = os.environ.get('STARCLASS_INPUT')
	if input_folder is None:
		parser.error("No input folder specified")
	if not os.path.exists(input_folder):
		parser.error("INPUT_FOLDER does not exist")
	if os.path.isdir(input_folder):
		todo_file = os.path.join(input_folder, 'todo.sqlite')
	else:
		todo_file = os.path.abspath(input_folder)
		input_folder = os.path.dirname(input_folder)

	# Choose which classifier to use
	# For now, there is only one...
	current_classifier = args.classifier

	# Initialize training set:
	tsetclass = starclass.get_trainingset(args.trainingset)
	tset = tsetclass(level=args.level, linfit=args.linfit)

	# Running:
	# When simply running the classifier on new stars:
	stcl = None
	with starclass.TaskManager(todo_file, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
		while True:
			task = tm.get_task(classifier=current_classifier)
			if task is None:
				break
			tm.start_task(task)

			if task['classifier'] != current_classifier or stcl is None:
				current_classifier = task['classifier']
				if stcl:
					stcl.close()
				stcl = starclass.get_classifier(current_classifier)
				stcl = stcl(tset=tset, features_cache=None, truncate_lightcurves=args.truncate)

			# ----------------- This code would run on each worker ------------------------

			fname = os.path.join(input_folder, task['lightcurve'])
			features = stcl.load_star(task, fname)

			print(features)
			lc = features['lightcurve']
			lc.show_properties()

			plt.close('all')
			lc.plot()

			res = task.copy()

			tic_predict = default_timer()
			res['starclass_results'] = stcl.classify(features)
			toc_predict = default_timer()

			# ----------------- This code would run on each worker ------------------------

			# Pad results with metadata and return to TaskManager to be saved:
			res.update({
				'tset': tset.key,
				'status': starclass.STATUS.OK,
				'elaptime': toc_predict - tic_predict
			})
			tm.save_results(res)
Example #7
def test_baseclassifier_load_star(PRIVATE_INPUT_DIR, linfit):

	# Use the following training set as input:
	tsetclass = get_trainingset()
	tset = tsetclass(linfit=linfit)

	# Set a dummy features cache inside the private input dir:
	features_cache_name = 'features_cache'
	if linfit:
		features_cache_name += '_linfit'
	features_cache = os.path.join(PRIVATE_INPUT_DIR, features_cache_name)
	os.makedirs(features_cache, exist_ok=True)

	# The features cache should be empty to begin with:
	assert len(os.listdir(features_cache)) == 0

	with TaskManager(PRIVATE_INPUT_DIR) as tm:
		for k in range(2): # Try loading twice - second time we should load from cache
			with BaseClassifier(tset=tset, features_cache=features_cache) as cl:
				# Check that the second time there is something in the features cache:
				if k > 0:
					assert os.listdir(features_cache) == ['features-17.pickle']

				task = tm.get_task(priority=17)
				print(task)

				feat = cl.load_star(task)
				print(feat)

				# Check the complex objects:
				assert isinstance(feat['lightcurve'], TessLightCurve)
				assert isinstance(feat['powerspectrum'], powerspectrum)
				assert isinstance(feat['frequencies'], Table)

				# Check "transfered" features:
				assert feat['priority'] == 17
				assert feat['priority'] == task['priority']
				assert feat['starid'] == task['starid']
				assert feat['tmag'] == task['tmag']
				assert feat['variance'] == task['variance']
				assert feat['rms_hour'] == task['rms_hour']
				assert feat['ptp'] == task['ptp']

				# Check FliPer:
				assert np.isfinite(feat['Fp07'])
				assert np.isfinite(feat['Fp7'])
				assert np.isfinite(feat['Fp20'])
				assert np.isfinite(feat['Fp50'])
				assert np.isfinite(feat['FpWhite'])
				assert np.isfinite(feat['Fphi'])
				assert np.isfinite(feat['Fplo'])

				# Check frequencies:
				freqtab = feat['frequencies']
				for num in np.unique(freqtab['num']):
					assert np.isfinite(feat['freq%d' % num]) or np.isnan(feat['freq%d' % num]), "Invalid frequency"
					assert np.isfinite(feat['amp%d' % num]) or np.isnan(feat['amp%d' % num]), "Invalid amplitude"
					assert np.isfinite(feat['phase%d' % num]) or np.isnan(feat['phase%d' % num]), "Invalid phase"

					peak = freqtab[(freqtab['num'] == num) & (freqtab['harmonic'] == 0)]
					np.testing.assert_allclose(feat['freq%d' % num], peak['frequency'])
					np.testing.assert_allclose(feat['amp%d' % num], peak['amplitude'])
					np.testing.assert_allclose(feat['phase%d' % num], peak['phase'])

				# Check details about lightkurve object:
				lc = feat['lightcurve']
				lc.show_properties()
				assert lc.targetid == feat['starid']
				assert lc.label == 'TIC %d' % feat['starid']
				assert lc.mission == 'TESS'
				assert lc.time_format == 'btjd'
				assert lc.camera == 1
				assert lc.ccd == 4
				assert lc.sector == 1

				# When running with linfit enabled, the features should contain
				# an extra set of coefficients from the detrending:
				if linfit:
					assert 'detrend_coeff' in feat
					assert len(feat['detrend_coeff']) == 2
					assert np.all(np.isfinite(feat['detrend_coeff']))
				else:
					assert 'detrend_coeff' not in feat
Example #8
def test_linfit(PRIVATE_INPUT_DIR):

	fname = os.path.join(PRIVATE_INPUT_DIR, 'tess00029281992-s01-c1800-dr01-v04-tasoc-cbv_lc.fits.gz')

	# Use the following training set as input:
	tsetclass = get_trainingset()
	tset = tsetclass(linfit=True)

	with BaseClassifier(tset=tset) as cl:
		# This is only used to make it easier to load the original lightcurve:
		task = {'priority': 1, 'starid': 29281992, 'tmag': None, 'variance': None, 'rms_hour': None, 'ptp': None, 'other_classifiers': None, 'lightcurve': fname}
		feat = cl.load_star(task)
		lc = feat['lightcurve']
		p_rem = feat['detrend_coeff']

		# Remove any trend from the lightcurve:
		indx = np.isfinite(lc.time) & np.isfinite(lc.flux) & np.isfinite(lc.flux_err)
		mintime = np.nanmin(lc.time[indx])
		lc -= np.polyval(p_rem, lc.time - mintime)

		# Insert a new known trend in the lightcurve:
		p_ins = [500, 1234]
		time_orig = lc.time
		lintrend_input = np.polyval(p_ins, lc.time - mintime)
		lc.flux += lintrend_input

		# Save the modified lightcurve to a file:
		fname_modified = fname.replace('.fits.gz', '.txt')
		with open(fname_modified, 'wt') as fid:
			for k in range(len(lc)):
				fid.write("{0:.12f}  {1:.18e}  {2:.18e}\n".format(lc.time[k], lc.flux[k], lc.flux_err[k]))

		# Now load the modified lightcurve:
		task['lightcurve'] = fname_modified
		feat = cl.load_star(task)
		lc = feat['lightcurve']
		psd2 = feat['powerspectrum']
		p = feat['detrend_coeff']
		print(p)

		lintrend_recovered = np.polyval(p, lc.time - mintime)

		psd = powerspectrum(lc)

		# Create debugging figure:
		fig, (ax1, ax2) = plt.subplots(2, figsize=(12,12))
		ax1.plot(lc.time, lc.flux, lw=0.5, label='Original')
		ax1.plot(time_orig, lintrend_input, lw=0.5, label='Input')
		ax1.plot(lc.time, lintrend_recovered, lw=0.5, label='Recovered')
		ax1.legend()
		ax2.plot(psd.standard[0], psd.standard[1], lw=0.5, label='Original')
		ax2.plot(psd2.standard[0], psd2.standard[1], lw=0.5, label='Detrended')
		ax2.set_yscale('log')
		ax2.legend()

		# Make sure we recover the trend that we put in:
		np.testing.assert_allclose(p, p_ins)

		# Compare the power spectra:
		np.testing.assert_allclose(psd.standard[0], psd2.standard[0])
		assert np.all(psd2.standard[1][0:2] < psd.standard[1][0:2])
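
A note on the coefficient convention used above: numpy's polyval evaluates coefficients from highest to lowest order, so the two-element detrend_coeff corresponds to a straight line slope*t + intercept. A quick self-contained check of that convention:

import numpy as np

# p_ins = [500, 1234] above therefore means: slope 500, intercept 1234.
t = np.linspace(0.0, 10.0, 5)
trend = np.polyval([500.0, 1234.0], t)
assert np.allclose(trend, 500.0*t + 1234.0)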
Example #9
def main():
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Run TESS Corrections in parallel using MPI.')
    parser.add_argument('-d',
                        '--debug',
                        help='Print debug messages.',
                        action='store_true')
    parser.add_argument('-q',
                        '--quiet',
                        help='Only report warnings and errors.',
                        action='store_true')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Overwrite existing results.',
                        action='store_true')
    parser.add_argument('--chunks',
                        type=int,
                        default=10,
                        help="Number of tasks sent to each worker at a time.")
    parser.add_argument(
        '--clear-cache',
        help=
        'Clear existing features cache tables before running. Can only be used together with --overwrite.',
        action='store_true')
    # Option to select which classifier to run:
    parser.add_argument(
        '-c',
        '--classifier',
        default=None,
        choices=starclass.classifier_list,
        metavar='{CLASSIFIER}',
        help=
        'Classifier to run. Default is to run all classifiers. Choices are ' +
        ", ".join(starclass.classifier_list) + '.')
    # Option to select training set:
    parser.add_argument(
        '-t',
        '--trainingset',
        default='keplerq9v3',
        choices=starclass.trainingset_list,
        metavar='{TSET}',
        help='Train classifier using this training-set. Choices are ' +
        ", ".join(starclass.trainingset_list) + '.')

    parser.add_argument('-l',
                        '--level',
                        help='Classification level',
                        default='L1',
                        choices=('L1', 'L2'))
    parser.add_argument('--linfit',
                        help='Enable linfit in training set.',
                        action='store_true')
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    # Lightcurve truncate override switch:
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--truncate',
                       dest='truncate',
                       action='store_true',
                       help='Force light curve truncation.')
    group.add_argument('--no-truncate',
                       dest='truncate',
                       action='store_false',
                       help='Force no light curve truncation.')
    parser.set_defaults(truncate=None)
    # Data directory:
    parser.add_argument(
        '--datadir',
        type=str,
        default=None,
        help=
        'Directory where trained models and diagnostics will be loaded. Default is to load from the program\'s data directory.'
    )
    # Input folder:
    parser.add_argument(
        'input_folder',
        type=str,
        help=
        'Input directory. This directory should contain a TODO-file and corresponding lightcurves.',
        nargs='?',
        default=None)
    args = parser.parse_args()

    # Cache tables (MOAT) should not be cleared unless results tables are also cleared.
    # Otherwise we could end up with non-complete MOAT tables.
    if args.clear_cache and not args.overwrite:
        parser.error("--clear-cache can not be used without --overwrite")
    # Make sure chunks are sensible:
    if args.chunks < 1:
        parser.error("--chunks should be an integer larger than 0.")

    # Get input and output folder from environment variables:
    input_folder = args.input_folder
    if input_folder is None:
        input_folder = os.environ.get('STARCLASS_INPUT')
    if not input_folder:
        parser.error("Please specify an INPUT_FOLDER.")
    if not os.path.exists(input_folder):
        parser.error("INPUT_FOLDER does not exist")
    if os.path.isdir(input_folder):
        todo_file = os.path.join(input_folder, 'todo.sqlite')
    else:
        todo_file = os.path.abspath(input_folder)
        input_folder = os.path.dirname(input_folder)

    # Initialize the training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level, linfit=args.linfit)

    # Define MPI message tags
    tags = enum.IntEnum('tags', ('READY', 'DONE', 'EXIT', 'START'))

    # Initializations and preliminaries
    comm = MPI.COMM_WORLD  # get MPI communicator object
    size = comm.size  # total number of processes
    rank = comm.rank  # rank of this process
    status = MPI.Status()  # get MPI status object

    if rank == 0:
        try:
            with starclass.TaskManager(todo_file,
                                       cleanup=True,
                                       overwrite=args.overwrite,
                                       classes=tset.StellarClasses) as tm:
                # If we were asked to do so, start by clearing the existing MOAT tables:
                if args.overwrite and args.clear_cache:
                    tm.moat_clear()

                # Get list of tasks:
                #numtasks = tm.get_number_tasks()
                #tm.logger.info("%d tasks to be run", numtasks)

                # Number of available workers:
                num_workers = size - 1

                # Create a set of initial classifiers to initialize the workers as:
                # If nothing was specified run all classifiers, and automatically switch between them:
                if args.classifier is None:
                    change_classifier = True
                    initial_classifiers = []
                    for k, c in enumerate(itertools.cycle(tm.all_classifiers)):
                        if k >= num_workers:
                            break
                        initial_classifiers.append(c)
                else:
                    initial_classifiers = [args.classifier] * num_workers
                    change_classifier = False

                tm.logger.info("Initial classifiers: %s", initial_classifiers)

                # Start the master loop that will assign tasks
                # to the workers:
                closed_workers = 0
                tm.logger.info("Master starting with %d workers", num_workers)
                while closed_workers < num_workers:
                    # Ask workers for information:
                    data = comm.recv(source=MPI.ANY_SOURCE,
                                     tag=MPI.ANY_TAG,
                                     status=status)
                    source = status.Get_source()
                    tag = status.Get_tag()

                    if tag == tags.DONE:
                        # The worker is done with a task
                        tm.logger.debug("Got data from worker %d: %s", source,
                                        data)
                        tm.save_results(data)

                    if tag in (tags.DONE, tags.READY):
                        # Worker is ready, so send it a task
                        # If provided, try to find a task that is with the same classifier
                        cl = initial_classifiers[
                            source -
                            1] if data is None else data[0].get('classifier')
                        tasks = tm.get_task(
                            classifier=cl,
                            change_classifier=change_classifier,
                            chunk=args.chunks)
                        if tasks:
                            tm.start_task(tasks)
                            tm.logger.debug("Sending %d tasks to worker %d",
                                            len(tasks), source)
                            comm.send(tasks, dest=source, tag=tags.START)
                        else:
                            comm.send(None, dest=source, tag=tags.EXIT)

                    elif tag == tags.EXIT:
                        # The worker has exited
                        tm.logger.info("Worker %d exited.", source)
                        closed_workers += 1

                    else:  # pragma: no cover
                        # This should never happen, but just to
                        # make sure we don't run into an infinite loop:
                        raise RuntimeError(
                            f"Master received an unknown tag: '{tag}'")

                tm.logger.info("Master finishing")

        except:  # noqa: E722, pragma: no cover
            # If something fails in the master
            print(traceback.format_exc().strip())
            comm.Abort(1)

    else:
        # Worker processes execute code below
        # Configure logging within starclass:
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        console = logging.StreamHandler()
        console.setFormatter(formatter)
        logger = logging.getLogger('starclass')
        logger.addHandler(console)
        logger.setLevel(logging.WARNING)

        # Get the class for the selected method:
        current_classifier = None
        stcl = None

        try:
            # Send signal that we are ready for task:
            comm.send(None, dest=0, tag=tags.READY)

            while True:
                # Receive a task from the master:
                tic_wait = default_timer()
                tasks = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                tag = status.Get_tag()
                toc_wait = default_timer()

                if tag == tags.START:
                    # Make sure we can loop through tasks,
                    # even in the case we have only gotten one:
                    results = []
                    if isinstance(tasks, dict):
                        tasks = [tasks]

                    # Run the classification prediction:
                    if tasks[0][
                            'classifier'] != current_classifier or stcl is None:
                        current_classifier = tasks[0]['classifier']
                        if stcl:
                            stcl.close()
                        stcl = starclass.get_classifier(current_classifier)
                        stcl = stcl(tset=tset,
                                    features_cache=None,
                                    truncate_lightcurves=args.truncate,
                                    data_dir=args.datadir)

                    # Loop through the tasks given to us:
                    for task in tasks:
                        result = stcl.classify(task)

                        # Pad results with metadata and return to TaskManager to be saved:
                        result['worker_wait_time'] = toc_wait - tic_wait
                        results.append(result)

                    # Send the result back to the master:
                    comm.send(results, dest=0, tag=tags.DONE)

                    # Attempt some cleanup:
                    # TODO: Is this even needed?
                    del task, result

                elif tag == tags.EXIT:
                    # We were told to EXIT, so lets do that
                    break

                else:  # pragma: no cover
                    # This should never happen, but just to
                    # make sure we don't run into an infinite loop:
                    raise RuntimeError(
                        f"Worker received an unknown tag: '{tag}'")

        except:  # noqa: E722, pragma: no cover
            logger.exception("Something failed in worker")

        finally:
            comm.send(None, dest=0, tag=tags.EXIT)
Example #10
def main():
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Utility function for training stellar classifiers.')
    parser.add_argument('-d',
                        '--debug',
                        help='Print debug messages.',
                        action='store_true')
    parser.add_argument('-q',
                        '--quiet',
                        help='Only report warnings and errors.',
                        action='store_true')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Overwrite existing results.',
                        action='store_true')
    parser.add_argument('-c',
                        '--classifier',
                        help='Classifier to train.',
                        default='meta',
                        choices=starclass.classifier_list)
    parser.add_argument('-l',
                        '--level',
                        help='Classification level',
                        default='L1',
                        choices=('L1', 'L2'))
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    parser.add_argument('-t',
                        '--trainingset',
                        help='Train classifier using this training-set.',
                        default='keplerq9v3',
                        choices=starclass.trainingset_list)
    parser.add_argument('--linfit',
                        help='Enable linfit in training set.',
                        action='store_true')
    parser.add_argument('-tf',
                        '--testfraction',
                        help='Holdout/test-set fraction',
                        type=float,
                        default=0.0)
    args = parser.parse_args()

    # Check args
    if args.testfraction < 0 or args.testfraction >= 1:
        parser.error('Testfraction must be between 0 and 1')

    # Set logging level:
    logging_level = logging.INFO
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG

    # Setup logging:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger = logging.getLogger(__name__)
    logger.addHandler(console)
    logger.setLevel(logging_level)
    logger_parent = logging.getLogger('starclass')
    logger_parent.addHandler(console)
    logger_parent.setLevel(logging_level)

    # Choose which classifier to use
    current_classifier = args.classifier

    # Pick the training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level,
                     tf=args.testfraction,
                     linfit=args.linfit)

    # The Meta-classifier requires us to first train all of the other classifiers
    # using cross-validation
    if current_classifier == 'meta':
        # Loop through all the other classifiers and initialize them:
        # TODO: Run in parallel?
        # TODO: Check if results are already present
        with starclass.TaskManager(tset.todo_file,
                                   overwrite=args.overwrite,
                                   classes=tset.StellarClasses) as tm:
            for cla_key in tm.all_classifiers:
                # Split the tset object into cross-validation folds.
                # These are objects with exactly the same properties as the original one,
                # except that they will run through different subsets of the training and test sets:
                cla = starclass.get_classifier(cla_key)
                for tset_fold in tset.folds(n_splits=5, tf=0.2):
                    data_dir = tset.key + '/meta_fold{0:02d}'.format(
                        tset_fold.fold)
                    with cla(tset=tset,
                             features_cache=tset.features_cache,
                             data_dir=data_dir) as stcl:
                        logger.info('Training %s on Fold %d/%d...',
                                    stcl.classifier_key, tset_fold.fold,
                                    tset_fold.crossval_folds)
                        stcl.train(tset_fold)
                        logger.info("Classifying test-set...")
                        stcl.test(tset_fold, save=tm.save_results)

                # Now train all classifiers on the full training-set (minus the holdout-set),
                # and test on the holdout set:
                with cla(tset=tset,
                         features_cache=tset.features_cache) as stcl:
                    logger.info('Training %s on full training-set...',
                                stcl.classifier_key)
                    stcl.train(tset)
                    logger.info("Classifying test-set using %s...",
                                stcl.classifier_key)
                    stcl.test(tset, save=tm.save_results)

    # Initialize the classifier:
    classifier = starclass.get_classifier(current_classifier)
    with starclass.TaskManager(tset.todo_file,
                               overwrite=False,
                               classes=tset.StellarClasses) as tm:
        with classifier(tset=tset, features_cache=tset.features_cache) as stcl:
            # Run the training of the classifier:
            logger.info("Training %s on full training-set...",
                        current_classifier)
            stcl.train(tset)
            logger.info("Training done...")
            logger.info("Classifying test-set using %s...", current_classifier)
            stcl.test(tset, save=tm.save_results)
Example #11
    with starclass.MetaClassifier() as meta:

        if not meta.classifier.trained:
            raise Exception("Not trained")

        feature_names = [
            '{0:s}_{1:s}'.format(classifier, stcl.name)
            for classifier, stcl in meta.features_used
        ]
        class_names = np.unique([
            '{0:s}'.format(stcl.name)
            for classifier, stcl in meta.features_used
        ])

        tsetclass = starclass.get_trainingset('keplerq9v2')
        tset = tsetclass()
        fitlabels = tset.labels()

        # Create table of features, just like it is done in the classifier:
        features = meta.build_features_table(tset.features(), total=len(tset))

        X_train, X_test, y_train, y_test = train_test_split(features,
                                                            fitlabels,
                                                            test_size=0.1,
                                                            random_state=42)

        explainer = shap.TreeExplainer(meta.classifier)
        shap_values = explainer.shap_values(X_test)

        # NOTE: The arguments after shap_values are an assumed completion,
        # built from the variables defined above:
        fig = shap.summary_plot(shap_values,
                                X_test,
                                feature_names=feature_names,
                                class_names=class_names)
Example #12
def main():
	# Parse command line arguments:
	parser = argparse.ArgumentParser(description='Run TESS Corrections in parallel using MPI.')
	parser.add_argument('-d', '--debug', help='Print debug messages.', action='store_true')
	parser.add_argument('-q', '--quiet', help='Only report warnings and errors.', action='store_true')
	parser.add_argument('-o', '--overwrite', help='Overwrite existing results.', action='store_true')
	parser.add_argument('-c', '--classifier', help='Classifier to use.', default=None, choices=starclass.classifier_list)
	parser.add_argument('-t', '--trainingset', help='Train classifier using this training-set.', default='keplerq9v3', choices=starclass.trainingset_list)
	parser.add_argument('--linfit', help='Enable linfit in training set.', action='store_true')
	parser.add_argument('-l', '--level', help='Classification level', default='L1', choices=('L1', 'L2'))
	#parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
	# Lightcurve truncate override switch:
	group = parser.add_mutually_exclusive_group(required=False)
	group.add_argument('--truncate', dest='truncate', action='store_true', help='Force light curve truncation.')
	group.add_argument('--no-truncate', dest='truncate', action='store_false', help='Force no light curve truncation.')
	parser.set_defaults(truncate=None)
	# Input folder:
	parser.add_argument('input_folder', type=str, help='Input directory. This directory should contain a TODO-file and corresponding lightcurves.', nargs='?', default=None)
	args = parser.parse_args()

	# Get input and output folder from environment variables:
	input_folder = args.input_folder
	if input_folder is None:
		input_folder = os.environ.get('STARCLASS_INPUT')
	if not input_folder:
		parser.error("Please specify an INPUT_FOLDER.")
	if not os.path.exists(input_folder):
		parser.error("INPUT_FOLDER does not exist")
	if os.path.isdir(input_folder):
		todo_file = os.path.join(input_folder, 'todo.sqlite')
	else:
		todo_file = os.path.abspath(input_folder)
		input_folder = os.path.dirname(input_folder)

	# Initialize the training set:
	tsetclass = starclass.get_trainingset(args.trainingset)
	tset = tsetclass(level=args.level, linfit=args.linfit)

	# Define MPI message tags
	tags = enum.IntEnum('tags', ('READY', 'DONE', 'EXIT', 'START'))

	# Initializations and preliminaries
	comm = MPI.COMM_WORLD   # get MPI communicator object
	size = comm.size        # total number of processes
	rank = comm.rank        # rank of this process
	status = MPI.Status()   # get MPI status object

	if rank == 0:
		try:
			with starclass.TaskManager(todo_file, cleanup=True, overwrite=args.overwrite, classes=tset.StellarClasses) as tm:
				# Get list of tasks:
				#numtasks = tm.get_number_tasks()
				#tm.logger.info("%d tasks to be run", numtasks)

				# Number of available workers:
				num_workers = size - 1

				# Create a set of initial classifiers to initialize the workers as:
				initial_classifiers = []
				for k, c in enumerate(itertools.cycle(tm.all_classifiers)):
					if k >= num_workers:
						break
					initial_classifiers.append(c)

				tm.logger.info("Initial classifiers: %s", initial_classifiers)

				# Start the master loop that will assign tasks
				# to the workers:
				closed_workers = 0
				tm.logger.info("Master starting with %d workers", num_workers)
				while closed_workers < num_workers:
					# Ask workers for information:
					data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
					source = status.Get_source()
					tag = status.Get_tag()

					if tag == tags.DONE:
						# The worker is done with a task
						tm.logger.info("Got data from worker %d: %s", source, data)
						tm.save_results(data)

					if tag in (tags.DONE, tags.READY):
						# Worker is ready, so send it a task
						# If provided, try to find a task that is with the same classifier
						cl = initial_classifiers[source-1] if data is None else data.get('classifier')
						task = tm.get_task(classifier=cl, change_classifier=True)
						if task:
							tm.start_task(task)
							comm.send(task, dest=source, tag=tags.START)
							tm.logger.info("Sending task %d to worker %d", task['priority'], source)
						else:
							comm.send(None, dest=source, tag=tags.EXIT)

					elif tag == tags.EXIT:
						# The worker has exited
						tm.logger.info("Worker %d exited.", source)
						closed_workers += 1

					else: # pragma: no cover
						# This should never happen, but just to
						# make sure we don't run into an infinite loop:
						raise Exception("Master received an unknown tag: '{0}'".format(tag))

				tm.logger.info("Master finishing")

		except: # noqa: E722, pragma: no cover
			# If something fails in the master
			print(traceback.format_exc().strip())
			comm.Abort(1)

	else:
		# Worker processes execute code below
		# Configure logging within starclass:
		formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
		console = logging.StreamHandler()
		console.setFormatter(formatter)
		logger = logging.getLogger('starclass')
		logger.addHandler(console)
		logger.setLevel(logging.WARNING)

		# Get the class for the selected method:
		current_classifier = None
		stcl = None

		try:
			# Send signal that we are ready for task:
			comm.send(None, dest=0, tag=tags.READY)

			while True:
				# Receive a task from the master:
				tic_wait = default_timer()
				task = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
				tag = status.Get_tag()
				toc_wait = default_timer()

				if tag == tags.START:
					result = task.copy()

					# Run the classification prediction:
					try:
						if task['classifier'] != current_classifier or stcl is None:
							current_classifier = task['classifier']
							if stcl:
								stcl.close()
							stcl = starclass.get_classifier(current_classifier)
							stcl = stcl(tset=tset, features_cache=None, truncate_lightcurves=args.truncate)

						fname = os.path.join(input_folder, task['lightcurve'])
						features = stcl.load_star(task, fname)

						tic_predict = default_timer()
						result['starclass_results'] = stcl.classify(features)
						toc_predict = default_timer()

						result['elaptime'] = toc_predict - tic_predict
						result['status'] = starclass.STATUS.OK
					except: # noqa: E722, pragma: no cover
						# Something went wrong
						error_msg = traceback.format_exc().strip()
						result.update({
							'status': starclass.STATUS.ERROR,
							'details': {'errors': [error_msg]},
						})

					# Pad results with metadata and return to TaskManager to be saved:
					result.update({
						'tset': tset.key,
						'worker_wait_time': toc_wait - tic_wait
					})

					# Send the result back to the master:
					comm.send(result, dest=0, tag=tags.DONE)

					# Attempt some cleanup:
					# TODO: Is this even needed?
					del task, result

				elif tag == tags.EXIT:
					# We were told to EXIT, so lets do that
					break

				else: # pragma: no cover
					# This should never happen, but just to
					# make sure we don't run into an infinite loop:
					raise Exception("Worker received an unknown tag: '{0}'".format(tag))

		except: # noqa: E722, pragma: no cover
			logger.exception("Something failed in worker")

		finally:
			comm.send(None, dest=0, tag=tags.EXIT)
Example #13
def main():
    # Parse command line arguments:
    parser = argparse.ArgumentParser(
        description='Utility function for training stellar classifiers.')
    parser.add_argument('-d',
                        '--debug',
                        help='Print debug messages.',
                        action='store_true')
    parser.add_argument('-q',
                        '--quiet',
                        help='Only report warnings and errors.',
                        action='store_true')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Overwrite existing results.',
                        action='store_true')
    parser.add_argument('--log',
                        type=str,
                        default=None,
                        metavar='{LOGFILE}',
                        help="Log to file.")
    parser.add_argument(
        '--log-level',
        type=str,
        default=None,
        choices=['debug', 'info', 'warning', 'error'],
        help=
        "Logging level to use in file-logging. If not set, use the same level as the console."
    )
    parser.add_argument('--clear-cache',
                        help='Clear existing features cache before running.',
                        action='store_true')
    # Option to select which classifier to train:
    parser.add_argument('-c',
                        '--classifier',
                        default='meta',
                        choices=starclass.classifier_list,
                        metavar='{CLASSIFIER}',
                        help='Classifier to train. Choices are ' +
                        ", ".join(starclass.classifier_list) + '.')
    # Option to select training set:
    parser.add_argument(
        '-t',
        '--trainingset',
        default='keplerq9v3',
        choices=starclass.trainingset_list,
        metavar='{TSET}',
        help='Train classifier using this training-set. Choices are ' +
        ", ".join(starclass.trainingset_list) + '.')

    parser.add_argument('-l',
                        '--level',
                        help='Classification level',
                        default='L1',
                        choices=('L1', 'L2'))
    parser.add_argument('--linfit',
                        help='Enable linfit in training set.',
                        action='store_true')
    #parser.add_argument('--datalevel', help="", default='corr', choices=('raw', 'corr')) # TODO: Come up with better name than "datalevel"?
    parser.add_argument('-tf',
                        '--testfraction',
                        type=float,
                        default=0.0,
                        help='Holdout/test-set fraction')
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help=
        'Directory where trained models and diagnostics will be saved. Default is to save in the program\'s data directory.'
    )
    args = parser.parse_args()

    # Check args
    if args.testfraction < 0 or args.testfraction >= 1:
        parser.error('Testfraction must be between 0 and 1')

    # Set logging level:
    logging_level = logging.INFO
    if args.quiet:
        logging_level = logging.WARNING
    elif args.debug:
        logging_level = logging.DEBUG

    # Setup logging:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging_level)
    logger = logging.getLogger(__name__)
    logger.addHandler(console)
    logger_parent = logging.getLogger('starclass')
    logger_parent.addHandler(console)
    # Add log-file if the user asked for it:
    if args.log is not None:
        filehandler = logging.FileHandler(args.log, mode='w', encoding='utf8')
        filehandler.setFormatter(formatter)
        filehandler.setLevel(logging_level if args.log_level is None else args.
                             log_level.upper())
        logging_level = min(logging_level, filehandler.level)
        logger.addHandler(filehandler)
        logger_parent.addHandler(filehandler)
    # The logging level of the logger objects needs to be the smallest
    # logging level enabled in either of the handlers:
    logger.setLevel(logging_level)
    logger_parent.setLevel(logging_level)

    # Pick the training set:
    tsetclass = starclass.get_trainingset(args.trainingset)
    tset = tsetclass(level=args.level,
                     tf=args.testfraction,
                     linfit=args.linfit)

    # If we were asked to do so, clear the cache before proceding:
    if args.clear_cache:
        tset.clear_cache()

    # The Meta-classifier requires us to first train all of the other classifiers
    # using cross-validation
    if args.classifier == 'meta':
        # Loop through all the other classifiers and initialize them:
        # TODO: Run in parallel?
        # TODO: Check if results are already present
        with starclass.TaskManager(tset.todo_file,
                                   overwrite=args.overwrite,
                                   classes=tset.StellarClasses) as tm:
            # Loop through all classifiers, excluding the MetaClassifier:
            for cla_key in tm.all_classifiers:
                # Split the tset object into cross-validation folds.
                # These are objects with exactly the same properties as the original one,
                # except that they will run through different subsets of the training and test sets:
                cla = starclass.get_classifier(cla_key)
                for tset_fold in tset.folds(n_splits=5, tf=0.2):
                    with cla(tset=tset_fold,
                             features_cache=tset.features_cache,
                             data_dir=args.output) as stcl:
                        logger.info('Training %s on Fold %d/%d...',
                                    stcl.classifier_key, tset_fold.fold,
                                    tset_fold.crossval_folds)
                        stcl.train(tset_fold)
                        logger.info("Training done.")
                        logger.info("Classifying test-set using %s...",
                                    stcl.classifier_key)
                        stcl.test(tset_fold, save=tm.save_results)

                # Now train all classifiers on the full training-set (minus the holdout-set),
                # and test on the holdout set:
                with cla(tset=tset,
                         features_cache=tset.features_cache,
                         data_dir=args.output) as stcl:
                    logger.info('Training %s on full training-set...',
                                stcl.classifier_key)
                    stcl.train(tset)
                    logger.info("Training done.")
                    logger.info("Classifying holdout-set using %s...",
                                stcl.classifier_key)
                    stcl.test(tset,
                              save=tm.save_results,
                              feature_importance=True)

    # Initialize the classifier:
    classifier = starclass.get_classifier(args.classifier)
    with starclass.TaskManager(tset.todo_file,
                               overwrite=False,
                               classes=tset.StellarClasses) as tm:
        with classifier(tset=tset,
                        features_cache=tset.features_cache,
                        data_dir=args.output) as stcl:
            # Run the training of the classifier:
            logger.info("Training %s on full training-set...", args.classifier)
            stcl.train(tset)
            logger.info("Training done.")
            logger.info("Classifying holdout-set using %s...", args.classifier)
            stcl.test(tset, save=tm.save_results, feature_importance=True)