def create_source_table(database_model_name, output_path, format=None, overwrite=False, filter_by_kwargs=None, limit=None):
    """
    Write a table of sources and their results from the database.

    If there are multiple results per source (e.g., from individual visits),
    then only the first result (spectrum index 0) is requested.

    Parameters
    ----------
    database_model_name : str
        Name of the database model to query.
    output_path : str
        Path to the output file.
    format : str, optional
        Format of the output file, passed through to the table writer.
    overwrite : bool, optional
        Passed through to the table writer (default: False).
    filter_by_kwargs : dict, optional
        Keyword arguments to pass to the database query.
    limit : int, optional
        Limit the number of results.
    """
    query_kwds = dict(spectrum_index=0, filter_by_kwargs=filter_by_kwargs, limit=limit)
    source_table = Table(rows=_get_results(database_model_name, **query_kwds))
    source_table.write(output_path, format=format, overwrite=overwrite)

    num_rows = len(source_table)
    log.info(f"Wrote source table with {num_rows} rows to {output_path}")
    return None
def create_visit_table(database_model_name, output_path, format=None, overwrite=False, filter_by_kwargs=None, limit=None):
    """
    Write a table of visits and their results from the database.

    No spectrum index is given to the query (`spectrum_index=None`).

    Parameters
    ----------
    database_model_name : str
        Name of the database model to query.
    output_path : str
        Path to the output file.
    format : str, optional
        Format of the output file, passed through to the table writer.
    overwrite : bool, optional
        Passed through to the table writer (default: False).
    filter_by_kwargs : dict, optional
        Keyword arguments to pass to the database query.
    limit : int, optional
        Limit the number of results.
    """
    query_kwds = dict(spectrum_index=None, filter_by_kwargs=filter_by_kwargs, limit=limit)
    visit_table = Table(rows=_get_results(database_model_name, **query_kwds))
    visit_table.write(output_path, format=format, overwrite=overwrite)

    num_rows = len(visit_table)
    log.info(f"Wrote visit table with {num_rows} rows to {output_path}")
    return None
def _unlink_primary_key_path(self):
    """
    Delete the temporary primary key file recorded on this instance, if any.

    Nothing happens when no `_primary_key_path` attribute has been set.
    """
    # A private sentinel distinguishes "attribute missing" from any stored value.
    not_set = object()
    temporary_path = getattr(self, "_primary_key_path", not_set)
    if temporary_path is not_set:
        return None

    log.info(f"Removing temporary file at {temporary_path}")
    os.unlink(temporary_path)
    return None
def _execute_ferre_by_slurm(directory, total, offile, interval=60, **kwargs):
    """
    Submit FERRE as a Slurm job, wait for it to complete, and return its output streams.

    :param directory:
        The directory FERRE is executed in. The Slurm stdout and stderr files
        are read from this directory.

    :param total:
        The number of spectra expected; used as the total of the progress bar.

    :param offile:
        The name of the FERRE output flux file, relative to `directory`.

    :param interval: [optional]
        The number of seconds to sleep between checks on the job (default: 60).

    :param kwargs:
        Keyword arguments given to the `create()` method of the Slurm queue.

    :returns:
        A two-length tuple with the contents of the Slurm stdout and stderr files.
    """
    from slurm import queue as SlurmQueue

    label = "ferre"
    queue = SlurmQueue(verbose=True)
    queue.create(label=label, **kwargs)
    queue.append(_ferre_executable, dir=directory)
    queue.commit(hard=True, submit=True)

    log.info(f"Slurm job submitted with {queue.key} and keywords {kwargs} to run {_ferre_executable} in {directory}")
    log.info(f"\tJob directory: {queue.job_dir}")

    stdout_path = os.path.join(directory, f"{label}_01.o")
    stderr_path = os.path.join(directory, f"{label}_01.e")
    output_flux_path = os.path.join(directory, offile)

    # Now we wait until the Slurm job is complete.
    t_init = time()
    while 100 > queue.get_percent_complete():
        sleep(interval)
        t = time() - t_init

        if not os.path.exists(stderr_path) and not os.path.exists(stdout_path):
            log.info(f"Waiting on job {queue.key} to start (elapsed: {t / 60:.0f} min)")
            continue

        log.info(f"Job in {queue.key} has started")
        total_done = 0
        with tqdm(total=total, desc="FERRE", unit="spectra") as pb:
            # Also stop when the job itself reports completion: if FERRE ends
            # before writing `total` spectra (e.g., it failed part way), then
            # waiting on the spectrum count alone would never terminate.
            while total_done < total and 100 > queue.get_percent_complete():
                n_done = _check_ferre_progress(output_flux_path)
                pb.update(n_done - total_done)
                total_done = n_done
                pb.refresh()
                sleep(interval)

    log.info("Finishing up.")

    with open(stdout_path, "r") as fp:
        stdout = fp.read()
    with open(stderr_path, "r") as fp:
        stderr = fp.read()

    return (stdout, stderr)
def infer_releases(self, context):
    """
    Infer the SDSS release(s) to use based on the execution context.

    :param context:
        The Airflow context dictionary. The "ds" and "next_ds" entries are used.

    :returns:
        Whatever `infer_releases()` returns for those two entries.
    """
    interval_start, interval_end = (context["ds"], context["next_ds"])
    relevant_releases = infer_releases(interval_start, interval_end)
    log.info(
        f"Between {interval_start} and {interval_end} the relevant SDSS releases are {relevant_releases}"
    )
    return relevant_releases
def validate_slug(slug):
    r"""
    Normalise a GitHub repository slug.

    The input is formatted as a string, stripped of surrounding whitespace, and lower-cased.

    :param slug:
        The given slug string, which should be in the form '{OWNER}/{REPO}'. If no '{OWNER}'
        is given, it will be assumed to be owned by the SDSS organization.

    :returns:
        The normalised slug, prefixed with 'sdss/' if it contained no '/'.
    """
    normalised = f"{slug}".strip().lower()
    if "/" in normalised:
        return normalised

    log.info(f"Assuming GitHub repository '{normalised}' is owned by SDSS (sdss/{normalised})")
    return f"sdss/{normalised}"
def add_meta_to_task_instances_without_meta():
    """
    Add meta to task instances without meta.

    A failure for an individual task instance is logged and counted, and does
    not stop the remaining task instances from being processed. The number of
    task instances that were successfully updated is logged at the end.
    """
    failed, total = (0, count_task_instances_without_meta())
    for pk in tqdm(yield_task_instance_pks_without_meta(), total=total, desc="Adding metadata to task instances"):
        try:
            add_meta_to_task_instance(pk)
        except Exception:
            # Count the failure so the summary below only reports successes.
            log.exception(f"Unable to add meta to task instance with pk {pk}")
            failed += 1

    log.info(f"Added meta to {total - failed} task instances")
    return None
def run(self):
    """
    Train a Cannon model on the input training set and write it to the output path.

    When `self.plot` is true, figures of the model coefficients, the scatter,
    and a one-to-one comparison on the training set are saved with the
    `self.output_prefix` prefix.
    """
    # Load training set labels and spectra.
    training_set = read_training_set(self.input().path, self.default_inverse_variance)
    labels, dispersion, training_set_flux, training_set_ivar = training_set

    # Set the vectorizer.
    # We sort the label names so that luigi doesn't re-train models if we alter the order.
    ordered_label_names = sorted(self.label_names)
    vectorizer = tc.vectorizer.PolynomialVectorizer(ordered_label_names, self.order)

    # Initiate model.
    model = tc.model.CannonModel(
        labels,
        training_set_flux,
        training_set_ivar,
        vectorizer=vectorizer,
        dispersion=dispersion,
        regularization=self.regularization,
    )

    log.info(f"Training The Cannon model {model}")
    model.train(threads=self.threads)

    output_path = self.output().path
    log.info(f"Writing The Cannon model {model} to disk {output_path}")
    model.write(output_path)

    if not self.plot:
        return None

    from astra_thecannon import plot

    # Plot zeroth and first order coefficients.
    num_label_names = len(model.vectorizer.label_names)
    theta_figure = plot.theta(model, indices=np.arange(1 + num_label_names), normalize=False)
    theta_figure.savefig(f"{self.output_prefix}-theta.png")

    # Plot scatter.
    scatter_figure = plot.scatter(model)
    scatter_figure.savefig(f"{self.output_prefix}-scatter.png")

    # Plot one-to-one.
    test_labels, test_cov, test_meta = model.test(
        training_set_flux,
        training_set_ivar,
        initial_labels=model.training_set_labels,
    )
    one_to_one_figure = plot.one_to_one(model, test_labels, cov=test_cov)
    one_to_one_figure.savefig(f"{self.output_prefix}-one-to-one.png")
def classify(pks, **kwargs):
    """
    Classify sources given the primary keys of task instances.

    Networks are loaded once per distinct `model_path` parameter and re-used.
    The network factory is taken from the second-last underscore-separated
    part of the model path.

    :param pks:
        the primary keys of the task instances in the database that need classification

    :param kwargs:
        Accepted but not used.
    """
    models = {}
    results = {}
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]
        try:
            model, factory = models[model_path]
        except KeyError:
            network_factory = model_path.split("_")[-2]
            factory = getattr(networks, network_factory)

            log.info(f"Loading model from {model_path} using {factory}")
            model = utils.read_network(factory, model_path)
            model.eval()

            models[model_path] = (model, factory)

        flux = torch.from_numpy(spectrum.flux.value.astype(np.float32))
        with torch.no_grad():
            prediction = model.forward(flux)
        log_probs = prediction.cpu().numpy().flatten()

        # Keep the factory alongside the prediction: the class names must come
        # from the network that made this prediction, not from whichever
        # network happened to be used last.
        results[instance.pk] = (factory, log_probs)

    for pk, (factory, log_probs) in tqdm(results.items(), desc="Writing results"):
        result = _prepare_log_prob_result(factory.class_names, log_probs)

        # Write the output to the database.
        create_task_output(pk, astradb.Classification, **result)
def _init_progressbar(self, N, message=None):
    """
    Initialise a progressbar.

    Nothing besides `self.N` is set when `N` is negative. When `N` is zero the
    width, start time and message are recorded, but the message is not logged
    and the shared counter is not reset.

    :param N:
        The number of items that will be iterated over.

    :param message: [optional]
        An information message to log before showing the progressbar.
    """
    self.N = int(N)
    if self.N < 0:
        return None

    try:
        # Use the pipe as a context manager so it is closed after reading.
        with os.popen('stty size', 'r') as pipe:
            rows, columns = pipe.read().split()
    except Exception:
        # Not a bare `except:`, so KeyboardInterrupt/SystemExit still propagate.
        log.debug("Couldn't get screen size. Progressbar may look odd.")
        self.W = 100
    else:
        self.W = min(100, int(columns) - (12 + 21 + 2 * len(str(self.N))))

    self.t_init = time()
    self.message = message
    if 0 >= self.N:
        return None

    if message is not None:
        log.info(message.rstrip())

    sys.stdout.flush()
    with _counter_lock:
        _counter.value = 0
def execute(self, context):
    """
    Execute the operator.

    Without Slurm keyword arguments the Python callable is called directly with
    the primary keys. Otherwise an "astra execute" command is run through Slurm;
    when there is more than one primary key they are first serialized to a file
    whose path is kept on `self._primary_key_path`.

    :param context:
        The Airflow DAG execution context.

    :returns:
        The primary keys (`self.pks`).
    """
    if not self.slurm_kwargs:
        # This is essentially what "astra execute [PK]" does.
        function = string_to_callable(self.python_callable)
        result = function(self.pks, **(self.op_kwargs or dict()))
        log.info(
            f"Result from {function} with op kwargs {self.op_kwargs} was: {result}"
        )
        return self.pks

    if len(self.pks) > 1:
        # Serialize the primary keys.
        pk_path = serialize_pks_to_path(self.pks, dir=get_scratch_dir())
        log.info(
            f"Serialized {len(self.pks)} primary keys to {pk_path}. First 10 primary keys are {self.pks[:10]}"
        )

        # Store the primary key path, because we will clean up later.
        self._primary_key_path = pk_path
        bash_command = f"astra execute {pk_path}"
    else:
        bash_command = f"astra execute {self.pks[0]}"

    self.execute_by_slurm(context, bash_command)
    return self.pks
def _estimate_stellar_labels(pk):
    """
    Estimate stellar labels with ThePayne-Che for every spectrum of one task instance.

    The network path, constraints, Chebyshev order, spectral resolution and
    pre-search settings are read from the task instance parameters. Only labels
    whose first two grid entries differ in the network are fitted.

    :param pk:
        The primary key of the task instance.

    :returns:
        `None`. The results are written with `create_task_output`; nothing is
        written if no spectrum could be loaded.
    """
    # TODO: It would be great if these were stored with the network,
    #       instead of being hard-coded.
    label_names = ["teff", "logg", "vsini", "v_micro", "m_h"]

    # Translate:
    _t = {
        "teff": "T_eff",
        "logg": "log(g)",
        "m_h": "[M/H]",
        "vsini": "v*sin(i)",
    }

    # TODO: This implicitly assumes that the same constraints and network path are used by all the
    #       primary keys given. This is the usual case, but we should check this, and code around it.

    # TODO: This implementation requires knowing the observed spectrum before loading data.
    #       This is fine for ApStar objects since they all have the same dispersion sampling,
    #       but will not be fine for dispersion sampling that differs in each observation.

    # Let's peek ahead at the first valid spectrum we can find.
    instance, _, spectrum = next(prepare_data([pk]))
    if spectrum is None:
        # No valid spectrum.
        log.warning(
            f"Cannot build LSF for fitter because no spectrum found for primary key {pk}"
        )
        return None

    network = Network()
    network.read_in(instance.parameters["network_path"])

    constraints = json.loads(instance.parameters.get("constraints", "{}"))
    fitted_label_names = [
        ln for ln in label_names
        if network.grid[_t.get(ln, ln)][0] != network.grid[_t.get(ln, ln)][1]
    ]
    L = len(fitted_label_names)

    bounds_unscaled = np.zeros((2, L))
    for i, ln in enumerate(fitted_label_names):
        bounds_unscaled[:, i] = constraints.get(ln, network.grid[_t.get(ln, ln)][:2])

    fit = Fit(network, int(instance.parameters["N_chebyshev"]))
    fit.bounds_unscaled = bounds_unscaled

    spectral_resolution = int(instance.parameters["spectral_resolution"])
    fit.lsf = LSF_Fixed_R(spectral_resolution, spectrum.wavelength.value, network.wave)

    # Note the Stramut code uses inconsistent naming for "presearch", but in the operator interface we use
    # 'pre_search' in all situations. That's why there is some funny naming translation here.
    fit.N_presearch_iter = int(instance.parameters["N_pre_search_iter"])
    fit.N_pre_search = int(instance.parameters["N_pre_search"])

    fitter = UncertFit(fit, spectral_resolution)
    N, _ = spectrum.flux.shape

    keys = []
    keys.extend(fitted_label_names)
    keys.extend([f"u_{ln}" for ln in fitted_label_names])
    keys.extend(["v_rad", "u_v_rad", "chi2", "theta"])

    result = {key: [] for key in keys}
    result["snr"] = spectrum.meta["snr"]

    # Collected for the AstraSource TODO at the end; not used yet.
    model_fluxes = []

    log.info(f"Running ThePayne-Che on {N} spectra for {instance}")

    for i in range(N):
        # NOTE(review): `flux` looks like a view into the spectrum, so the
        # assignment to bad pixels below would modify it in place — confirm.
        flux = spectrum.flux.value[i]
        # Use the uncertainty of the i-th spectrum (not always the first one).
        error = spectrum.uncertainty.array[i]**-0.5

        # TODO: No NaNs/infs are allowed, but it doesn't seem like that was an issue for Stramut's code.
        #       Possibly due to different versions of scipy. In any case, raise this as a potential bug,
        #       since the errors do not always seem to be believed by ThePayne-Che.
        # A zero or negative inverse variance gives an infinite or NaN error,
        # so non-finite errors are flagged as bad pixels too.
        bad = (~np.isfinite(flux)) | (~np.isfinite(error)) | (error <= 0)
        flux[bad] = 0
        error[bad] = 1e10

        fit_result = fitter.run(
            spectrum.wavelength.value,
            flux,
            error,
        )

        # The `popt` attribute is length: len(label_names) + 1 (for radial velocity) + N_chebyshev

        # Relevant attributes are:
        # - fit_result.popt
        # - fit_result.uncert
        # - fit_result.RV_uncert
        # - fit_result.model
        for j, label_name in enumerate(fitted_label_names):
            result[label_name].append(fit_result.popt[j])
            result[f"u_{label_name}"].append(fit_result.uncert[j])

        result["theta"].append(fit_result.popt[L + 1:].tolist())
        result["chi2"].append(fit_result.chi2_func(fit_result.popt))
        result["v_rad"].append(fit_result.popt[L])
        result["u_v_rad"].append(fit_result.RV_uncert)

        model_fluxes.append(fit_result.model)

    # Write database result.
    create_task_output(instance, astradb.ThePayneChe, **result)

    # TODO: Write AstraSource object here.
    return None
def estimate_stellar_labels(pks, default_num_uncertainty_draws=100, default_large_error=1e10):
    """
    Estimate the stellar parameters for APOGEE ApStar observations, where task instances
    have been created with the given primary keys (`pks`).

    The per-instance parameters "num_uncertainty_draws" and "large_error", when
    present, take precedence over the defaults given here. One result per task
    instance is written with `create_task_output`.

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.

    :param default_num_uncertainty_draws: [optional]
        The number of random draws to make of the flux uncertainties, which will be
        propagated into the estimate of the stellar parameter uncertainties (default: 100).

    :param default_large_error: [optional]
        An arbitrarily large error value to assign to bad pixels (default: 1e10).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running APOGEENet on device {device} with:")
    log.info(f"\tpks: {pks}")
    log.debug(f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")
    log.debug(f"Using torch version {torch.__version__} in {torch.__path__}")

    models = {}

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]

        # Load the model (once per model path).
        try:
            model = models[model_path]
        except KeyError:
            models[model_path] = model = Model(model_path, device)
            log.info(f"Loaded model from {model_path}")

        N, P = spectrum.flux.shape

        # Build metadata array.
        metadata_keys, metadata, metadata_norm = get_metadata(spectrum)

        flux = np.nan_to_num(spectrum.flux.value).astype(np.float32).reshape((N, 1, P))
        meta = np.tile(metadata_norm, N).reshape((N, -1))

        flux = torch.from_numpy(flux).to(device)
        meta = torch.from_numpy(meta).to(device)

        with torch.set_grad_enabled(False):
            predictions = model.predict_spectra(flux, meta)

        # Always convert to a numpy array: the numpy operations below need one
        # on every device, and `.cpu()` costs nothing for a CPU tensor. (The
        # previous guard compared a `torch.device` with the string "cpu".)
        predictions = predictions.cpu().data.numpy()

        # Replace non-finite values with NaN.
        predictions[~np.isfinite(predictions)] = np.nan

        # Create results array.
        log_g, log_teff, fe_h = predictions.T
        teff = 10**log_teff
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            logg=log_g.tolist(),
            fe_h=fe_h.tolist(),
        )

        num_uncertainty_draws = int(
            instance.parameters.get("num_uncertainty_draws", default_num_uncertainty_draws)
        )

        if num_uncertainty_draws > 0:
            large_error = float(instance.parameters.get("large_error", default_large_error))

            flux_error = np.nan_to_num(
                spectrum.uncertainty.array**-0.5
            ).astype(np.float32).reshape((N, 1, P))

            # Cap errors at five times the median error of each spectrum.
            median_error = 5 * np.median(flux_error, axis=(1, 2))
            for j, value in enumerate(median_error):
                bad_pixel = (flux_error[j] == large_error) | (flux_error[j] >= value)
                flux_error[j][bad_pixel] = value

            flux_error = torch.from_numpy(flux_error).to(device)

            inputs = torch.randn((num_uncertainty_draws, N, 1, P), device=device) * flux_error + flux
            inputs = inputs.reshape((num_uncertainty_draws * N, 1, P))

            meta_error = meta.repeat(num_uncertainty_draws, 1)
            with torch.set_grad_enabled(False):
                draws = model.predict_spectra(inputs, meta_error)

            # As above: always convert, regardless of device.
            draws = draws.cpu().data.numpy()
            draws = draws.reshape((num_uncertainty_draws, N, -1))

            # Need to put the log(teffs) to teffs before calculating std_dev
            draws[:, :, 1] = 10**draws[:, :, 1]

            median_draw_predictions = np.nanmedian(draws, axis=0)
            std_draw_predictions = np.nanstd(draws, axis=0)

            log_g_median, teff_median, fe_h_median = median_draw_predictions.T
            log_g_std, teff_std, fe_h_std = std_draw_predictions.T

            result.update(
                _teff_median=teff_median.tolist(),
                _logg_median=log_g_median.tolist(),
                _fe_h_median=fe_h_median.tolist(),
                u_teff=teff_std.tolist(),
                u_logg=log_g_std.tolist(),
                u_fe_h=fe_h_std.tolist(),
            )
        else:
            median_draw_predictions, std_draw_predictions = (None, None)

        # Add the bitmask flag.
        bitmask_flag = create_bitmask(
            predictions,
            median_draw_predictions=median_draw_predictions,
            std_draw_predictions=std_draw_predictions,
        )
        result.update(bitmask_flag=bitmask_flag.tolist())

        # Write the result to database.
        create_task_output(instance, astradb.ApogeeNet, **result)

    log.info(f"Completed processing of {total} primary keys")
def execute_by_slurm(self, context, bash_command, directory=None, poke_interval=60):
    """
    Submit a bash command as a Slurm job and block until the job is complete.

    The job label is built from the DAG identifier, the task identifier, the
    execution date and a short random identifier, truncated to 64 characters,
    and is stored on `self._slurm_label`.

    :param context:
        The Airflow context dictionary.

    :param bash_command:
        The command to run in the Slurm job.

    :param directory: [optional]
        The directory given to the Slurm queue. When not given, the job
        directory of the queue is used to locate the stdout and stderr files.

    :param poke_interval: [optional]
        The number of seconds to sleep between checks on the job (default: 60).

    :raises RuntimeError:
        If the last line of the job stdout or stderr contains "Error".
    """
    uid = str(uuid.uuid4())[:8]
    label_parts = [
        context["dag"].dag_id,
        context["task"].task_id,
        # run_id is None if triggered by command line
        context["execution_date"].strftime('%Y-%m-%d'),
        uid,
    ]
    label = ".".join(label_parts)
    if len(label) > 64:
        truncated_label = label[:64]
        log.warning(
            f"Truncating Slurm label ({label}) to 64 characters: {truncated_label}"
        )
        label = truncated_label

    self._slurm_label = label

    # It's bad practice to import here, but the slurm package is
    # not easily installable outside of Utah, and is not a "must-have"
    # requirement.
    from slurm import queue

    # TODO: HACK to be able to use local astra installation while in development
    if bash_command.startswith("astra "):
        bash_command = f"/uufs/chpc.utah.edu/common/home/u6020307/.local/bin/astra {bash_command[6:]}"

    slurm_kwargs = (self.slurm_kwargs or dict())

    log.info(
        f"Submitting Slurm job {label} with command:\n\t{bash_command}\nAnd Slurm keyword arguments: {slurm_kwargs}"
    )

    q = queue(verbose=True)
    q.create(label=label, dir=directory, **slurm_kwargs)
    q.append(bash_command)
    try:
        q.commit(hard=True, submit=True)
    except CalledProcessError as e:
        log.exception(
            f"Exception occurred when committing Slurm job with output:\n{e.output}"
        )
        raise

    log.info(f"Slurm job submitted with {q.key} and keywords {slurm_kwargs}")

    output_directory = directory or q.job_dir
    log.info(f"\tJob directory: {output_directory}")

    stdout_path = os.path.join(output_directory, f"{label}_01.o")
    stderr_path = os.path.join(output_directory, f"{label}_01.e")

    # Now we wait until the Slurm job is complete.
    t_submitted, t_started = (time(), None)
    while 100 > q.get_percent_complete():
        sleep(poke_interval)

        t = time() - t_submitted

        # The job is taken to have started once either output file exists.
        if not (os.path.exists(stderr_path) or os.path.exists(stdout_path)):
            log.info(
                f"Waiting on job {q.key} to start (elapsed: {t / 60:.0f} min)"
            )
            continue

        # Check if this is the first time it has started.
        if t_started is None:
            t_started = time()
            log.debug(
                f"Recording job {q.key} as starting at {t_started} (took {t / 60:.0f} min to start)"
            )

        log.info(
            f"Waiting on job {q.key} to finish (elapsed: {t / 60:.0f} min)"
        )
        # Open last line of stdout path?

        # If this has been going much longer than the walltime, then something went wrong.
        # TODO: Check on the status of the job from Slurm.

    log.info(
        f"Job {q.key} in {q.job_dir} is complete after {(time() - t_submitted)/60:.0f} minutes."
    )

    with open(stderr_path, "r", newline="\n") as fp:
        stderr = fp.read()
    log.info(f"Contents of {stderr_path}:\n{stderr}")

    with open(stdout_path, "r", newline="\n") as fp:
        stdout = fp.read()
    log.info(f"Contents of {stdout_path}:\n{stdout}")

    # TODO: Better parsing for critical errors.
    last_lines = (stream.rstrip().split("\n")[-1] for stream in (stdout, stderr))
    if any("Error" in last_line for last_line in last_lines):
        raise RuntimeError("detected exception at task end-point")

    # TODO: Get exit codes from squeue
    return None
def get_best_result(task, ti, **kwargs):
    """
    When there are numerous FERRE tasks that are upstream, this function will
    return the primary keys of the task instances that gave the best result on
    a per-observation basis.

    The best result is the one with the lowest `log_chisq_fit`, after it has
    been penalised for cool GK-grid results and for grid-edge flags on the last
    two entries of the bitmask flag.

    :param task:
        The task whose upstream tasks supply the primary keys. The second
        dot-separated part of its `task_id` is compared with the telescope of
        each task instance.

    :param ti:
        The task instance used to pull the upstream primary keys through XCom.

    :returns:
        A list of the best primary keys, one per observation that had a usable result.

    :raises AirflowSkipException:
        If no task outputs were found for any of the primary keys.
    """
    # Get the PKs from upstream.
    pks = []
    log.debug(f"Upstream tasks: {task.upstream_list}")
    for upstream_task in task.upstream_list:
        pks.append(ti.xcom_pull(task_ids=upstream_task.task_id))

    pks = flatten(pks)
    log.debug(f"Getting best initial guess among primary keys {pks}")

    # Need to uniquely identify observations.
    param_bit_mask = bitmask.ParamBitMask()
    bad_grid_edge = (param_bit_mask.get_value("GRIDEDGE_WARN") | param_bit_mask.get_value("GRIDEDGE_BAD"))

    trees = {}
    best_tasks = {}
    for pk in pks:
        q = session.query(astradb.TaskInstance).filter(astradb.TaskInstance.pk==pk)
        instance = q.one_or_none()

        # `one_or_none()` returns None when there is no matching row.
        if instance is None:
            log.warning(f"No task instance found with primary key {pk}")
            continue

        if instance.output is None:
            log.warning(f"No output found for task instance {instance}")
            continue

        p = instance.parameters

        # Check that the telescope is the same as what we expect from this task ID.
        # This is a bit of a hack. Let us explain.

        # The "BA" grid does not have a telescope/fiber model, so you can run LCO and APO
        # data through the initial-BA grid. And those outputs go to the "get_best_results"
        # for each of the APO and LCO tasks (e.g., this function).
        # If there is only APO data, then the LCO "get_best_result" will only have one
        # input: the BA results. Then it will erroneously think that's the best result
        # for that source.

        # It's hacky to put this logic in here. It should be in the DAG instead. Same
        # thing for parsing 'telescope' name in the DAG (eg 'APO') from 'apo25m'.
        this_telescope_short_name = p["telescope"][:3].upper()
        expected_telescope_short_name = task.task_id.split(".")[1]
        log.info(f"For instance {instance} we have {this_telescope_short_name} and {expected_telescope_short_name}")
        if this_telescope_short_name != expected_telescope_short_name:
            continue

        try:
            tree = trees[p["release"]]
        except KeyError:
            tree = trees[p["release"]] = SDSSPath(release=p["release"])

        key = "_".join([
            p['release'],
            p['filetype'],
            *[p[k] for k in tree.lookup_keys(p['filetype'])]
        ])

        best_tasks.setdefault(key, (np.inf, None))

        # TODO: Confirm that this is base10 log. This should also be 'log_reduced_chisq_fit',
        #       according to the documentation.
        log_chisq_fit, *_ = instance.output.log_chisq_fit
        previous_teff, *_ = instance.output.teff
        bitmask_flag, *_ = instance.output.bitmask_flag

        log.debug(f"Result {instance} {instance.output} with log_chisq_fit = {log_chisq_fit} and {previous_teff} and {bitmask_flag}")

        # Note: If FERRE totally fails then it will assign -999 values to the log_chisq_fit. So we have to
        #       check that the log_chisq_fit is actually sensible!
        #       (Or we should only query task instances where the output is sensible!)
        if log_chisq_fit < 0:  # TODO: This is a hack.
            log.debug(f"Skipping result for {instance} {instance.output} as log_chisq_fit = {log_chisq_fit}")
            continue

        parsed_header = utils.parse_header_path(p["header_path"])

        # Penalise chi-sq in the same way they did for DR17.
        # See github.com/sdss/apogee/python/apogee/aspcap/aspcap.py#L658
        # (The backslashes in the messages below are escaped so that the text
        #  is unchanged but the literals have no invalid escape sequences.)
        if parsed_header["spectral_type"] == "GK" and previous_teff < 3900:
            log.debug("Increasing \\chisq because spectral type GK")
            log_chisq_fit += np.log10(10)

        bitmask_flag_logg, bitmask_flag_teff = bitmask_flag[-2:]
        if bitmask_flag_logg & bad_grid_edge:
            log.debug("Increasing \\chisq because logg flag is bad edge")
            log_chisq_fit += np.log10(5)

        if bitmask_flag_teff & bad_grid_edge:
            log.debug("Increasing \\chisq because teff flag is bad edge")
            log_chisq_fit += np.log10(5)

        # Is this the best so far?
        if log_chisq_fit < best_tasks[key][0]:
            log.debug(f"Assigning this output to best task as {log_chisq_fit} < {best_tasks[key][0]}: {pk}")
            best_tasks[key] = (log_chisq_fit, pk)

    for key, (log_chisq_fit, pk) in best_tasks.items():
        if pk is None:
            log.warning(f"No good task found for key {key}: ({log_chisq_fit}, {pk})")
        else:
            log.info(f"Best task for key {key} with \\chi^2 of {log_chisq_fit:.2f} is primary key {pk}")

    if best_tasks:
        return [pk for (log_chisq_fit, pk) in best_tasks.values() if pk is not None]
    else:
        raise AirflowSkipException(f"no task outputs found from {len(pks)} primary keys")
def estimate_radial_velocity(pks, verbose=True, mcmc=False, figfile=None, cornername=None, retpmodels=False, plot=False, tweak=True, usepeak=False, maxvel=[-1000, 1000]):
    """
    Estimate radial velocities for the sources that are identified by the task instances
    of the given primary keys.

    A failure on one task instance is logged and does not stop the remaining
    task instances from being processed. If there were any failures, an
    exception is raised after all task instances have been attempted.

    :param pks:
        The primary keys of task instances to estimate radial velocities for, which
        includes parameters to identify the source SDSS data model product.

    See `doppler.rv.fit` for more information on other keyword arguments.

    :raises RuntimeError:
        If Doppler failed on any task instance. The last exception encountered
        is chained as the cause.
    """
    # NOTE(review): `maxvel` has a mutable (list) default that is passed straight
    # to `doppler.rv.fit`. It is left unchanged to keep the interface stable.

    # TODO: Move this to astra/contrib
    import doppler

    log.info(f"Estimating radial velocities for {len(pks)} task instances")

    failures = []
    last_exception = None
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        log.debug(f"Running Doppler on {instance} from {path}")

        try:
            spectrum = doppler.read(path)
            summary, model_spectrum, modified_input_spectrum = doppler.rv.fit(
                spectrum,
                verbose=verbose,
                mcmc=mcmc,
                figfile=figfile,
                cornername=cornername,
                retpmodels=retpmodels,
                plot=plot,
                tweak=tweak,
                usepeak=usepeak,
                maxvel=maxvel,
            )
        except Exception as exception:
            log.exception(
                f"Exception occurred on Doppler on {path} with task instance {instance}"
            )
            failures.append(instance.pk)
            # Remember the exception: it is no longer the active exception once
            # this handler has finished, so a bare `raise` later cannot use it.
            last_exception = exception
            continue
        else:
            # Write the output to the database.
            results = prepare_results(summary)
            create_task_output(instance, astradb.Doppler, **results)

    if len(failures) > 0:
        log.warning(
            f"There were {len(failures)} Doppler failures out of a total {len(pks)} executions."
        )
        log.warning(f"Failed primary keys include: {failures}")
        log.warning(f"Raising last exception to indicate failure in pipeline.")
        # A bare `raise` here raised RuntimeError("No active exception to
        # reraise"). Keep the RuntimeError type, but carry the real cause.
        raise RuntimeError(
            f"Doppler failed on {len(failures)} task instances (primary keys {failures})"
        ) from last_exception
def train_polynomial_model(labels, data, order=2, regularization=0, threads=1):
    """
    Train a polynomial Cannon model and write it to disk.

    :param labels:
        A dictionary of training set labels (label names as keys), or a string
        holding that dictionary in JSON format (single quotes are tolerated).

    :param data:
        A dictionary-like object with the keys "normalized_flux",
        "normalized_ivar", either "dispersion" or "wavelength", and optionally
        "num_spectra"; or the path of a pickle file containing such an object.
        If "num_spectra" is present, only objects where it equals 1 are kept.

    :param order: [optional]
        The order of the polynomial vectorizer (default: 2).

    :param regularization: [optional]
        The regularization given to the model (default: 0).

    :param threads: [optional]
        The number of threads to use in training (default: 1).

    :returns:
        The path that the model was written to.

    :raises ValueError:
        If neither "dispersion" nor "wavelength" is in `data`.
    """
    log.debug(f'Inputs are: ({type(labels)}) {labels}')
    log.debug(f'Data are: {data}')

    # labels could be in JSON format.
    if isinstance(labels, str):
        labels = json.loads(labels.replace("'", '"'))
        # TODO: use a general deserializer that fixes the single quote issues with json loading

    if isinstance(data, str) and os.path.exists(data):
        # NOTE: unpickling can execute arbitrary code. Only give paths to
        #       files that come from a trusted source.
        with open(data, "rb") as fp:
            data = pickle.load(fp)

    dispersion_keys = ("dispersion", "wavelength")
    for key in dispersion_keys:
        try:
            dispersion = data[key]
        except KeyError:
            continue
        else:
            break
    else:
        # Name every key that was tried, not just the last one.
        raise ValueError(f"unable to find any of {dispersion_keys} in data")

    training_set_flux = data["normalized_flux"]
    training_set_ivar = data["normalized_ivar"]

    try:
        num_spectra = data["num_spectra"]
    except KeyError:
        log.debug(
            f"Keeping all items in training set; not checking for missing spectra."
        )
    else:
        # `np.asarray` so that the comparison is element-wise for lists too.
        keep = (np.asarray(num_spectra) == 1)
        if not np.all(keep):
            log.warning(
                f"Excluding {np.sum(~keep)} objects from the training set that had missing spectra"
            )
            labels = {k: np.array(v)[keep] for k, v in labels.items()}
            training_set_flux = training_set_flux[keep]
            training_set_ivar = training_set_ivar[keep]

    # Set the vectorizer.
    vectorizer = tc.vectorizer.PolynomialVectorizer(
        labels.keys(),
        order=order,
    )

    # Initiate model.
    model = tc.model.CannonModel(
        labels,
        training_set_flux,
        training_set_ivar,
        vectorizer=vectorizer,
        dispersion=dispersion,
        regularization=regularization,
    )

    model.train(threads=threads)

    output_path = os.path.join(get_base_output_path(), "thecannon", "model.pkl")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    log.info(f"Writing The Cannon model {model} to disk {output_path}")
    model.write(output_path, include_training_set_spectra=True, overwrite=True)
    return output_path
def parse_ferre_outputs(directory, header_path, wavelength, flux, sigma, mask, names, initial_parameters, kwds, meta, clean_up_on_exit=False, raise_exception_on_bad_outputs=False):
    """
    Parse the outputs that FERRE wrote to a directory.

    The output parameter file (`kwds["opfile"]`) and output flux file
    (`kwds["offile"]`) are read. If a continuum option is set in `kwds`, the
    continuum is inferred from the normalized flux file (`kwds["sffile"]`).
    Parameters are flagged when they are close to the grid edge, or when FERRE
    reported failure values. `meta` is updated in place.

    :param directory:
        The directory that FERRE was executed in.

    :param header_path:
        The path of the FERRE header file, used for the grid limits and steps.

    :param clean_up_on_exit: [optional]
        Remove `directory` and its contents before returning (default: False).

    :param raise_exception_on_bad_outputs: [optional]
        Raise a `ValueError` if any output is flagged as a FERRE failure (default: False).

    :returns:
        A three-length tuple of the parameters, the parameter errors, and `meta`.
    """
    # Get processing times.
    #processing_times = utils.get_processing_times(stdout, kwds["nthreads"])

    # Parse parameter outputs and uncertainties.
    try:
        parsed = utils.read_output_parameter_file(
            os.path.join(directory, kwds["opfile"]),
            n_dimensions=kwds["ndim"],
            full_covariance=kwds["covprint"]
        )
        output_names, param, param_err, output_meta = parsed
    except BaseException:
        # Log where the failure happened, then let the exception propagate.
        log.exception(f"Failed to load FERRE output parameter file at {os.path.join(directory, kwds['opfile'])}")
        raise

    # Parse flux outputs.
    try:
        model_flux = np.nan * np.ones_like(flux)
        model_flux[:, mask] = np.loadtxt(os.path.join(directory, kwds["offile"]))
    except BaseException:
        log.exception(f"Failed to load model flux from {os.path.join(directory, kwds['offile'])}:")
        raise

    if kwds.get("cont") is not None:
        # Infer continuum.
        normalized_flux = np.nan * np.ones_like(flux)
        normalized_flux[:, mask] = np.loadtxt(os.path.join(directory, kwds["sffile"]))
        continuum = flux / normalized_flux
    else:
        continuum = np.ones_like(model_flux)

    meta.update(
        mask=mask,
        wavelength=wavelength,
        flux=flux,
        sigma=sigma,
        normalized_model_flux=model_flux,
        continuum=continuum
    )

    # Flag things.
    num_spectra, num_dimensions = param.shape

    param_bitmask = bitmask.ParamBitMask()
    bitmask_flag = np.zeros((num_spectra, num_dimensions), dtype=np.int64)

    grid_headers, *segment_headers = utils.read_ferre_headers(utils.expand_path(header_path))

    # Within an eighth of a grid step of either limit is "bad".
    bad_lower = (grid_headers["LLIMITS"] + grid_headers["STEPS"]/8)
    bad_upper = (grid_headers["ULIMITS"] - grid_headers["STEPS"]/8)
    is_edge_bad = (param < bad_lower) | (param > bad_upper)
    bitmask_flag[is_edge_bad] |= param_bitmask.get_value("GRIDEDGE_BAD")

    # Within one grid step of either limit is a "warning".
    warn_lower = (grid_headers["LLIMITS"] + grid_headers["STEPS"])
    warn_upper = (grid_headers["ULIMITS"] - grid_headers["STEPS"])
    is_edge_warn = (param < warn_lower) | (param > warn_upper)
    bitmask_flag[is_edge_warn] |= param_bitmask.get_value("GRIDEDGE_WARN")

    ferre_fail = param_bitmask.get_value("FERRE_FAIL")
    bitmask_flag[(param == -999) | (param_err < -0.01)] |= ferre_fail

    # Check for any erroneous outputs
    if raise_exception_on_bad_outputs:
        v = bitmask_flag & ferre_fail
        if np.any(v):
            idx = np.where(np.any(v, axis=1))
            raise ValueError(f"FERRE returned all erroneous values for an entry: {idx} {v}")

    # Include processing times and bitmask etc.
    meta.update(
        bitmask_flag=bitmask_flag.tolist(), # .tolist() for postgresql encoding.
        #processing_times=processing_times,
        **output_meta
    )

    # need parameter names
    print(f"input names {names}")
    print(f"output_names: {output_names}")
    print(f"param: {param}")
    print(f"param_err: {param_err}")
    print(f"meta: {meta}")
    print(f"bitmask_flag: {bitmask_flag}")

    # Parse elapsed time.
    #print(f"times {processing_times}")
    if clean_up_on_exit:
        log.info(f"Removing directory {directory} and its contents")
        rmtree(directory)
    else:
        log.info(f"Leaving directory {directory} and its contents as clean_up_on_exit = {clean_up_on_exit}")

    return (param, param_err, meta)
def ferre(
        wavelength,
        flux,
        sigma,
        header_path,
        names=None,
        initial_parameters=None,
        frozen_parameters=None,
        interpolation_order=3,
        input_weights_path=None,
        input_lsf_shape_path=None,
        lsf_shape_flag=0,
        error_algorithm_flag=1,
        wavelength_interpolation_flag=0,
        optimization_algorithm_flag=3,
        continuum_flag=1,
        continuum_order=4,
        continuum_segment=None,
        continuum_reject=0.3,
        continuum_observations_flag=1,
        full_covariance=False,
        pca_project=False,
        pca_chi=False,
        n_threads=32,
        f_access=None,
        f_format=1,
        ferre_kwargs=None,
        directory=None,
        clean_up_on_exit=False,
        raise_exception_on_bad_outputs=False,
        **kwargs
    ):
    r"""
    Run FERRE on the given observations and return the parsed outputs.

    :param wavelength:
        An array of wavelength values for the observations. This should be one of:

        - a 1D array of shape `P` where P is the number of pixels, if all spectra are on the
          same wavelength grid
        - an array of shape `(N, P)` where `N` is the number of observations and `P` is the
          number of pixels, if all spectra have the same number of pixels
        - a list of `N` arrays, where each array contains the number of pixels in that
          observation

    :param flux:
        The observed flux values. This should be one of:

        - an array of shape `(N, P)` where `N` is the number of observations and `P` is the
          number of pixels, if all spectra have the same number of pixels
        - a list of `N` arrays, where each array has a size of the number of pixels in that
          observation.

    :param sigma:
        The uncertainty in the observed flux values. This should be one of:

        - an array of shape `(N, P)` where `N` is the number of observations and `P` is the
          number of pixels, if all spectra have the same number of pixels
        - a list of `N` arrays, where each array has a size of the number of pixels in that
          observation

    :param header_path:
        The path of the FERRE header file.

    :param names: [optional]
        Names for the observations. These are forwarded to `prepare_ferre`.

    :param initial_parameters: [optional]
        The initial parameters to start from. If `None` is given then this will revert to the
        mid-point of the grid for all observations. This should be an array of shape `(N, L)`
        where `N` is the number of observations and `L` is the number of dimensions in the
        FERRE grid supplied.

    :param frozen_parameters: [optional]
        A dictionary with parameter names (as per the header file) as keys, and either a
        boolean flag or a float as value. If boolean `True` is given for a parameter, then the
        value will be fixed at the initial value per spectrum. If a float is given then this
        value will supersede all initial values given, fixing the dimension for all input
        spectra regardless of the initial value.

    :param interpolation_order: [optional]
        Order of interpolation to use (default: 3).
        This corresponds to the FERRE keyword `inter`.

        0. nearest neighbour
        1. linear
        2. quadratic Bezier
        3. cubic Bezier
        4. cubic splines

    :param input_weights_path: [optional]
        The location of a weight (or mask) file to apply to the pixels. This corresponds to
        the FERRE keyword `filterfile`.

    :param input_lsf_shape_path: [optional]
        The location of a file describing the line spread function to apply to the
        observations. This keyword is only relevant when `lsf_shape_flag` is not 0.
        This corresponds to the FERRE keyword `lsffile`.

    :param lsf_shape_flag: [optional]
        A flag indicating what line spread convolution to perform. This should be one of:

        0. no LSF convolution (default)
        1. 1D (independent of wavelength), one and the same for all spectra
        2. 2D (a function of wavelength), one and the same for all
        3. 1D and Gaussian (i.e. described by a single parameter, its width), one for all
           objects
        4. 2D and Gaussian, one for all objects
        11. 1D and particular for each spectrum
        12. 2D and particular for each spectrum
        13. 1D Gaussian, but particular for each spectrum
        14. 2D Gaussian and particular for each object.

        If `lsf_shape_flag` is anything but 0, then an `input_lsf_shape_path` keyword argument
        will also be required, pointing to the location of the LSF file.

    :param error_algorithm_flag: [optional]
        Choice of algorithm to compute error bars (default: 1, as per FERRE).
        This corresponds to the FERRE keyword `errbar`.

        0. To adopt the distance from the solution at which $\chi^2$ = min($\chi^2$) + 1
        1. To invert the curvature matrix
        2. Perform numerical experiments injecting noise into the data

    :param wavelength_interpolation_flag: [optional]
        Flag to indicate what to do about wavelength interpolation (default: 0).
        This is not usually needed as the FERRE grids are computed on the resampled APOGEE
        grid. This corresponds to the FERRE keyword `winter`.

        0. No interpolation.
        1. Interpolate observations.
        2. The FERRE documentation says 'Interpolate fluxes', but it is not clear to the
           writer how that is any different from Option 1.

    :param optimization_algorithm_flag: [optional]
        Integer flag to indicate which optimization algorithm to use (default: 3):

        1. Nelder-Mead
        2. Boender-Timmer-Rinnoy Kan
        3. Powell's truncated Newton method
        4. Nash's truncated Newton method

    :param continuum_flag: [optional]
        Choice of algorithm to use for continuum fitting (default: 1).
        This corresponds to the FERRE keyword `cont`, and is related to the FERRE keywords
        `ncont` and `rejectcont`.

        If `None` is supplied then no continuum keywords will be given to FERRE.

        1. Polynomial fitting using an iterative sigma clipping algorithm (set by
           `continuum_order` and `continuum_reject` keywords).
        2. Segmented normalization, where the data are split into `continuum_segment`
           segments, and the values in each are divided by their mean values.
        3. The input data are divided by a running mean computed with a window of
           `continuum_segment` pixels.

    :param continuum_order: [optional]
        The order of polynomial fitting to use, if `continuum_flag` is 1.
        This corresponds to the FERRE keyword `ncont`, if `continuum_flag` is 1. If
        `continuum_flag` is not 1, this keyword argument is ignored.

    :param continuum_segment: [optional]
        Either the number of segments to split the data into for performing normalization,
        (e.g., when `continuum_flag` = 2), or the window size to use when `continuum_flag`
        = 3. This corresponds to the FERRE keyword `ncont` if `continuum_flag` is 2 or 3.
        If `continuum_flag` is not 2 or 3, this keyword argument is ignored.

    :param continuum_reject: [optional]
        When using polynomial fitting with an iterative sigma clipping algorithm
        (`continuum_flag` = 1), this sets the relative error where data points will be
        excluded. Any data points with relative errors larger than `continuum_reject` will
        be excluded. This corresponds to the FERRE keyword `rejectcont`. If
        `continuum_flag` is not 1, this keyword argument is ignored.

    :param continuum_observations_flag: [optional]
        This corresponds to the FERRE keyword `obscont`. Nothing is written down in the
        FERRE documentation about this keyword.

    :param full_covariance: [optional]
        Return the full covariance matrix from FERRE (default: False).
        This corresponds to the FERRE keyword `covprint`.

    :param pca_project: [optional]
        Use Principal Component Analysis to compress the spectra (default: False).
        This corresponds to the FERRE keyword `pcaproject`.

    :param pca_chi: [optional]
        Use Principal Component Analysis to compress the spectra when calculating the
        $\chi^2$ statistic. This corresponds to the FERRE keyword `pcachi`.

    :param n_threads: [optional]
        The number of threads to use for FERRE (default: 32).
        This corresponds to the FERRE keyword `nthreads`.

    :param f_access: [optional]
        If `False`, load the entire grid into memory. If `True`, run the interpolation
        without loading the entire grid into memory -- this is useful for small numbers of
        interpolation. If `None` (default), automatically determine which is faster.
        This corresponds to the FERRE keyword `f_access`.

    :param f_format: [optional]
        File format of the FERRE grid: 0 (ASCII) or 1 (UNF format, default).
        This corresponds to the FERRE keyword `f_format`.

    :param ferre_kwargs: [optional]
        A dictionary of options to apply directly to FERRE, which will over-ride other
        settings supplied here, so use with caution.

    :param directory: [optional]
        The working directory to execute FERRE in. If `None` is given then a temporary
        directory is created with `mkdtemp`. The directory is created if it does not exist.

    :param clean_up_on_exit: [optional]
        Passed to `parse_ferre_outputs`, which removes `directory` and its contents after
        the outputs are parsed when this is true (default: False).

    :param raise_exception_on_bad_outputs: [optional]
        Passed to `parse_ferre_outputs`, which raises a `ValueError` when FERRE returns
        erroneous outputs and this is true (default: False).

    :param kwargs: [optional]
        Two keys are recognised: `directory_kwds`, a dictionary of keywords for `mkdtemp`
        (only used when `directory` is `None`); and `slurm_kwds`, a dictionary of keywords
        for `_execute_ferre_by_slurm`. If `slurm_kwds` is missing or empty then FERRE is
        executed in a subprocess instead.

    :returns:
        The result of `parse_ferre_outputs` for this execution.
    """

    # Create the temporary directory, if necessary.
    if directory is None:
        directory = mkdtemp(**kwargs.get("directory_kwds", {}))
        log.info(f"Created temporary directory {directory}")

    os.makedirs(directory, exist_ok=True)

    # Create a dictionary of all input keywords. This must come after `directory` is resolved
    # so that the working directory (and not `None`) is recorded.
    input_kwds = {}
    for arg in getfullargspec(ferre).args:
        input_kwds[arg] = locals()[arg]

    wavelength, flux, sigma, mask, names, initial_parameters, kwds, meta = prepare_ferre(directory, input_kwds)

    # `slurm_kwds` is not a named argument, so it can only arrive through `**kwargs`.
    slurm_kwds = kwargs.get("slurm_kwds", None)

    execute_args = (directory, len(flux), kwds["offile"])
    if slurm_kwds:
        stdout, stderr = _execute_ferre_by_slurm(*execute_args, **slurm_kwds)
    else:
        stdout, stderr = _execute_ferre_by_subprocess(*execute_args)

    # The captured stdout/stderr are not used: outputs are read from the files in `directory`.
    return parse_ferre_outputs(
        directory,
        header_path,
        wavelength,
        flux,
        sigma,
        mask,
        names,
        initial_parameters,
        kwds,
        meta,
        clean_up_on_exit=clean_up_on_exit,
        raise_exception_on_bad_outputs=raise_exception_on_bad_outputs
    )
def _select_training_set_data_from_database(label_columns, filter_args=None, filter_func=None, limit=None, **kwargs):
    """
    Query the database for training set labels and the data model keywords of their spectra.

    :param label_columns:
        An iterable of database columns to use as labels. Each column must have a `key`
        attribute (used as the label name), and at least one must have a `class_` attribute
        so that the parent table can be identified.

    :param filter_args: [optional]
        Arguments to supply to the `filter()` method of the query.

    :param filter_func: [optional]
        A callable that is given the values of each row as positional arguments. Rows where
        this returns a false value are skipped. By default all rows are kept.

    :param limit: [optional]
        Limit the number of rows returned by the query.

    :returns:
        A two-length tuple containing a dictionary of label values (label name as key, list
        of values per kept row) and a list of data model keyword dictionaries, one per kept
        row.

    :raises ValueError:
        If none of the label columns has a `class_` attribute.

    :raises NotImplementedError:
        If the parent table is anything other than `catalogdb.SDSSApogeeAllStarMergeR13`.
    """
    label_columns = list(label_columns)
    label_names = [column.key for column in label_columns]
    L = len(label_names)

    if filter_func is None:
        filter_func = lambda *_, **__: True

    # Get the label names.
    log.info(f"Querying for label names {label_names} from {label_columns}")

    # Figure out what other columns we will need to identify the input file.
    for column in label_columns:
        try:
            primary_parent = column.class_
        except AttributeError:
            continue
        else:
            break
    else:
        raise ValueError("Can't get primary parent. are you labelling every column?")

    log.debug(f"Identified primary parent table as {primary_parent}")

    if primary_parent == catalogdb.SDSSApogeeAllStarMergeR13:
        log.debug(f"Adding columns and setting data_model_func for {primary_parent}")

        additional_columns = [
            catalogdb.SDSSDR16ApogeeStar.apstar_version.label("apstar"),
            catalogdb.SDSSDR16ApogeeStar.field,
            catalogdb.SDSSDR16ApogeeStar.apogee_id.label("obj"),
            catalogdb.SDSSDR16ApogeeStar.file,
            catalogdb.SDSSDR16ApogeeStar.telescope,
            # Things that we might want for filtering on.
            catalogdb.SDSSDR16ApogeeStar.snr
        ]

        columns = label_columns + additional_columns

        q = session.query(*columns).join(
            catalogdb.SDSSApogeeAllStarMergeR13,
            func.trim(catalogdb.SDSSApogeeAllStarMergeR13.apstar_ids) == catalogdb.SDSSDR16ApogeeStar.apstar_id
        )

        def data_model_func(apstar, field, obj, filename, telescope, *_):
            # The prefix and reduction version are parsed from the file name. Any trailing
            # columns (e.g., those only selected for filtering) are ignored.
            return {
                "release": "DR16",
                "filetype": "apStar",
                "apstar": apstar,
                "field": field,
                "obj": obj,
                "prefix": filename[:2],
                "telescope": telescope,
                "apred": filename.split("-")[1]
            }

    else:
        raise NotImplementedError(
            f"Cannot intelligently figure out what data model keywords will be necessary."
        )

    if filter_args is not None:
        q = q.filter(*filter_args)

    if limit is not None:
        q = q.limit(limit)

    log.debug(f"Querying {q}")

    data_model_identifiers = []
    labels = {label_name: [] for label_name in label_names}
    for i, row in enumerate(tqdm(q.yield_per(1), total=q.count())):
        if not filter_func(*row):
            continue

        for label_name, value in zip(label_names, row[:L]):
            # Check for None first: np.isfinite(None) raises a TypeError.
            if value is None or not np.isfinite(value):
                log.warning(f"Label {label_name} in {i} row is not finite: {value}!")
            labels[label_name].append(value)

        data_model_identifiers.append(data_model_func(*row[L:]))

    return (labels, data_model_identifiers)
def estimate_stellar_labels(pks, **kwargs):
    """
    Estimate stellar labels given a single-layer neural network.

    Task instances where no spectrum could be prepared are skipped. Results for all task
    instances are collected first, and then written to the database.

    :param pks:
        The primary keys of task instances to estimate stellar labels for. The task instances
        include information to identify the source SDSS data product, and a `model_path`
        parameter giving the location of the model to use.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    log.info(f"Running ThePayne on device {device} with:")
    log.info(f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")
    log.info(f"Using torch version {torch.__version__} in {torch.__path__}")

    # Cache of loaded model states, keyed by model path.
    states = {}

    log.info(f"Estimating stellar labels for task instances")

    results = {}
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]
        try:
            state = states[model_path]
        except KeyError:
            log.info(f"Loading model from {model_path}")
            state = states[model_path] = test.load_state(model_path)
            log.info(f"Estimating these {len(state['label_names'])} label names: {state['label_names']}")

        # Always take the label names from the state in use for *this* instance, so that a
        # cached model never inherits the label names of the model loaded before it.
        label_names = state["label_names"]

        # Run optimization.
        t_init = time()
        p_opt, p_cov, model_flux, meta = test.test(
            spectrum.wavelength.value,
            spectrum.flux.value,
            spectrum.uncertainty.array,
            **state
        )
        t_opt = time() - t_init

        # Prepare outputs.
        result = dict(zip(label_names, p_opt.T))
        result.update(snr=spectrum.meta["snr"])

        # Include uncertainties, taken from the diagonal of each covariance matrix.
        diagonal = np.arange(p_opt.shape[1])
        result.update(dict(zip(
            (f"u_{ln}" for ln in label_names),
            np.sqrt(p_cov[:, diagonal, diagonal].T)
        )))

        results[instance.pk] = result
        log.info(f"Result for {instance} took {t_opt} seconds")

    # Write database outputs.
    for pk, result in tqdm(results.items(), desc="Writing database outputs"):
        create_task_output(pk, astradb.ThePayne, **result)

    return None
def estimate_stellar_labels(pks,
                            model_path,
                            dwave_slam=10.,
                            p_slam=(1E-8, 1E-7),
                            q_slam=0.7,
                            ivar_block_slam=None,
                            eps_slam=1E-19,
                            rsv_frac_slam=2.,
                            n_jobs_slam=1,
                            verbose_slam=5):
    """
    Estimate the stellar parameters for APOGEE ApStar observations, where task instances
    have been created with the given primary keys (`pks`).

    For each task instance the spectra are resampled onto the wavelength grid of the model,
    normalized, and then labels are estimated in two steps: a quick initial estimate,
    followed by an optimization starting from that estimate. One result is written to the
    database per task instance.

    :param pks:
        The primary keys of task instances that include information about what ApStar
        observation to load.

    :param model_path:
        The disk path of the pre-trained model.

    :param dwave_slam: float
        binning width

    :param p_slam: tuple of 2 ps [optional]
        smoothing parameter between 0 and 1: (default: 1E-8, 1E-7)
        0 -> LS-straight line
        1 -> cubic spline interpolant

    :param q_slam: float [optional]
        percentile used during normalization (default: 0.7).
        NOTE(review): earlier notes gave the valid range both as [0, 100] and as "between 0
        and 1" -- confirm against `normalize_spectra_block`.

    :param ivar_block_slam: ndarray (n_pix, ) | None [optional]
        ivar array (default: None)

    :param eps_slam: float [optional]
        the ivar threshold (default: 1E-19)

    :param rsv_frac_slam: float [optional]
        the fraction of pixels reserved in terms of std (default: 2.)

    :param n_jobs_slam: int [optional]
        number of processes launched by joblib (default: 1)

    :param verbose_slam: int / bool [optional]
        verbose level (default: 5)
    """

    # Load the model. The spectra are resampled onto the wavelength grid of the model.
    model = Slam.load_dump(model_path)
    wave_interp = model.wave
    log.info(f"Loaded model from {model_path}")

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None:
            continue

        # Unpacking also checks that the flux array is two-dimensional.
        N, _ = spectrum.flux.shape

        # NOTE(review): these are used as-is (no `.value` / `.array`), and are indexed by
        # spectrum below -- confirm that this is the format that `resample` expects.
        wave = spectrum.spectral_axis
        fluxes = spectrum.flux
        invars = spectrum.uncertainty

        fluxes_resamp, invars_resamp = [], []
        for i in tqdm(range(N)):
            fluxes_temp, invars_temp = resample(wave[i], fluxes[i], invars[i], wave_interp)
            fluxes_resamp.append(fluxes_temp)
            invars_resamp.append(invars_temp)

        fluxes_resamp = np.array(fluxes_resamp)
        invars_resamp = np.array(invars_resamp)

        # Normalization of each spectrum.
        fluxes_norm, fluxes_cont = normalize_spectra_block(
            wave_interp,
            fluxes_resamp,
            (6147., 8910.),
            dwave_slam,
            p=p_slam,
            q=q_slam,
            ivar_block=ivar_block_slam,
            eps=eps_slam,
            rsv_frac=rsv_frac_slam,
            n_jobs=n_jobs_slam,
            verbose=verbose_slam
        )

        # Scale the inverse variances to match the continuum-normalized fluxes.
        invars_norm = fluxes_cont**2 * invars_resamp

        # Initial estimation: get initial estimate of parameters by chi2 best match.
        label_init = model.predict_labels_quick(fluxes_norm, invars_norm, n_jobs=1)

        # SLAM prediction: optimize parameters.
        results_pred = model.predict_labels_multi(label_init, fluxes_norm, invars_norm)
        label_pred = np.array([label['x'] for label in results_pred])
        std_pred = np.array([label['pstd'] for label in results_pred])

        # Create results array. Columns are read in the order: teff, m_h, log_g, alpha_m.
        teff = label_pred[:, 0]
        m_h = label_pred[:, 1]
        log_g = label_pred[:, 2]
        alpha_m = label_pred[:, 3]
        u_teff = std_pred[:, 0]
        u_m_h = std_pred[:, 1]
        u_log_g = std_pred[:, 2]
        u_alpha_m = std_pred[:, 3]

        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            m_h=m_h.tolist(),
            logg=log_g.tolist(),
            alpha_m=alpha_m.tolist(),
            u_teff=u_teff.tolist(),
            u_m_h=u_m_h.tolist(),
            u_logg=u_log_g.tolist(),
            u_alpha_m=u_alpha_m.tolist(),
        )

        # Write the result to database.
        create_task_output(instance, astradb.SLAM, **result)

    log.info(f"Completed processing of {total} primary keys")
training_labels, training_spectra, \ validation_labels, validation_spectra = training.load_training_data(training_set_path) state, model, optimizer = training.train(training_spectra, training_labels, validation_spectra, validation_labels, label_names, num_neurons=int(num_neurons), num_epochs=num_epochs, learning_rate=learning_rate, weight_decay=weight_decay) # Ensure that the output folder exists. os.makedirs(os.path.dirname(output_model_path), exist_ok=True) log.info(f"Writing model to {output_model_path}") with open(output_model_path, "wb") as fp: pickle.dump( dict( state=state, wavelength=wavelength, label_names=label_names, ), fp) # Try to send xcom result of the output path. try: ti = kwargs["ti"] ti.xcom_push("model_path", output_model_path) except: log.exception("Unable to send `model_path` as xcom variable") else:
def execute(self, context):
    """
    Execute the operator.

    Spectra are loaded for all task instances and stacked, FERRE is prepared and executed
    through Slurm, and the parsed results are written to the database and to disk for each
    task instance.

    :param context:
        The Airflow DAG context.

    :returns:
        The primary keys (`self.pks`) that were worked on.
    """

    # Load spectra, skipping any task instance where no spectrum is available.
    instances = []
    Ns = []
    wavelength_blocks, flux_blocks, sigma_blocks = [], [], []
    spectrum_meta = []

    for instance, path, spectrum in self.prepare_data():
        if spectrum is None:
            continue

        N, P = spectrum.flux.shape

        # Repeat the wavelength array so that there is one row per spectrum.
        wavelength_blocks.append(np.tile(spectrum.wavelength.value, N).reshape((N, -1)))
        flux_blocks.append(spectrum.flux.value)
        # presumably the uncertainty array holds inverse variances, hence the -0.5 power
        # to get sigma -- TODO confirm
        sigma_blocks.append(spectrum.uncertainty.array**-0.5)
        spectrum_meta.append(dict(snr=spectrum.meta["snr"]))

        Ns.append(N)
        instances.append(instance)

    Ns = np.array(Ns, dtype=int)
    wavelength = np.vstack(wavelength_blocks)
    flux = np.vstack(flux_blocks)
    sigma = np.vstack(sigma_blocks)

    # Create names for easy debugging in FERRE outputs.
    names = create_names(instances, Ns, "{star_index}_{telescope}_{obj}_{spectrum_index}")

    # Load initial parameters.
    initial_parameters = create_initial_parameters(instances, Ns)

    # The working directory is unique to this execution of this task.
    task_folder = f"{context['ds']}-{context['dag'].dag_id}-{context['task'].task_id}-{context['run_id']}"
    directory = os.path.join(get_base_output_path(), "ferre", "tasks", task_folder)
    os.makedirs(directory, exist_ok=True)
    log.info(f"Working directory for task is {directory}")

    # Prepare FERRE.
    ferre_inputs = dict(
        wavelength=wavelength,
        flux=flux,
        sigma=sigma,
        header_path=self.header_path,
        names=names,
        initial_parameters=initial_parameters,
        frozen_parameters=self.frozen_parameters,
        interpolation_order=self.interpolation_order,
        input_weights_path=self.input_weights_path,
        input_lsf_shape_path=self.input_lsf_shape_path,
        lsf_shape_flag=self.lsf_shape_flag,
        error_algorithm_flag=self.error_algorithm_flag,
        wavelength_interpolation_flag=self.wavelength_interpolation_flag,
        optimization_algorithm_flag=self.optimization_algorithm_flag,
        continuum_flag=self.continuum_flag,
        continuum_order=self.continuum_order,
        continuum_segment=self.continuum_segment,
        continuum_reject=self.continuum_reject,
        continuum_observations_flag=self.continuum_observations_flag,
        full_covariance=self.full_covariance,
        pca_project=self.pca_project,
        pca_chi=self.pca_chi,
        n_threads=self.n_threads,
        f_access=self.f_access,
        f_format=self.f_format,
        ferre_kwargs=self.ferre_kwargs
    )
    args = prepare_ferre(directory, ferre_inputs)

    # Execute through Slurm, which is the only execution mode supported here.
    log.debug(f"FERRE ready to roll in {directory}")
    assert self.slurm_kwargs
    self.execute_by_slurm(
        context,
        bash_command="/uufs/chpc.utah.edu/common/home/sdss09/software/apogee/Linux/apogee/trunk/bin/ferre.x",
        directory=directory,
    )
    # Unbelievably, FERRE sends a '1' exit code every time it is executed. Even if it succeeds.
    # TODO: Ask Carlos or Jon to remove this insanity.

    # Parse outputs.
    # TODO: clean up this function
    param, param_err, output_meta = parse_ferre_outputs(directory, self.header_path, *args)

    grouped = group_results_by_instance(param, param_err, output_meta, spectrum_meta, Ns)

    for instance, (result, data) in zip(instances, grouped):
        if result is None:
            continue

        create_task_output(instance, astradb.Ferre, **result)

        log.debug(f"{instance}")
        log.debug(f"{result}")
        log.debug(f"{data}")

        # TODO: Write a data model product for this intermediate output!
        output_path = utils.output_data_product_path(instance.pk)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "wb") as fp:
            pickle.dump((result, data), fp)

        log.info(f"Wrote outputs of task instance {instance} to {output_path}")

    # Always return the primary keys that were worked on!
    return self.pks
def _parse_names_and_initial_and_frozen_parameters(
        names,
        initial_parameters,
        frozen_parameters,
        headers,
        flux,
        clip_initial_parameters_to_boundary_edges=True,
        clip_epsilon_percent=1,
        **kwargs):
    """
    Parse the names, initial parameters, and frozen parameters supplied for a FERRE run.

    :param names:
        The names of the spectra. If `None`, the (zero-indexed) spectrum index is used.

    :param initial_parameters:
        A dictionary of initial values, with parameter names as keys and the values for all
        spectra as values, or `None`. Anything that is not given, or is not finite, defaults
        to the mid-point of the grid. Parameter names not in the grid are ignored with a
        warning.

    :param frozen_parameters:
        Either a list-like of parameter names to freeze at their initial values, or a
        dictionary with parameter names as keys and a boolean or a value to freeze the
        parameter at. Entries with a boolean `False` value are not frozen.

    :param headers:
        The FERRE grid header, which gives the parameter names (`LABEL`) and the number of
        dimensions (`N_OF_DIM`).

    :param flux:
        The flux array. Only used to get the number of spectra.

    :param clip_initial_parameters_to_boundary_edges: [optional]
        If the initial parameters fail the grid boundary check, clip them to lie inside the
        grid instead of raising an exception (default: True).

    :param clip_epsilon_percent: [optional]
        The margin to leave from the grid edge when clipping, as a percentage of the grid
        extent in each dimension (default: 1).

    :returns:
        A four-length tuple containing: a dictionary of FERRE keywords (`ndim`, `nov`,
        `indv`, `init`, `indini`); the names; an array of initial parameters with one row
        per spectrum; and a dictionary stating whether each parameter is frozen.

    :raises TypeError:
        If `frozen_parameters` is not list-like or dict-like.

    :raises ValueError:
        If unknown parameters are frozen, if all parameters are frozen, if the number of
        names does not match the number of spectra, or if the initial parameters are outside
        the grid and `clip_initial_parameters_to_boundary_edges` is false.
    """

    # Read the labels from the first header path
    parameter_names = headers["LABEL"]

    # Need the number of spectra, which we will take from the flux array.
    N = len(flux)
    mid_point = _grid_mid_point(headers)
    parsed_initial_parameters = np.tile(mid_point, N).reshape((N, -1))

    log.debug(f"parsed initial parameters before {parsed_initial_parameters}")

    compare_parameter_names = list(map(sanitise_parameter_name, parameter_names))

    log.debug(f"Initial parameters passed for parsing {initial_parameters}")

    if initial_parameters is not None:
        log.debug(f"Comparison names {compare_parameter_names}")
        for parameter_name, values in initial_parameters.items():
            spn = sanitise_parameter_name(parameter_name)
            log.debug(f"{parameter_name} {values} {spn}")

            try:
                index = compare_parameter_names.index(spn)
            except ValueError:
                log.warning(
                    f"Ignoring initial parameters for {parameter_name} as they are not in {parameter_names}"
                )
                log.debug(
                    f"Nothing matched for {spn} {parameter_name} {compare_parameter_names}"
                )
            else:
                log.debug(f"Matched to index {index}")
                # Replace non-finite values with the mid point.
                finite = np.isfinite(values)
                if not np.all(finite):
                    log.warning(
                        f"Missing or non-finite initial values given for {parameter_name}. Defaulting to the grid mid-point."
                    )
                    # np.array() copies, so the values supplied by the caller are not altered.
                    values = np.array(values)
                    values[~finite] = mid_point[index]

                log.debug(f"values are {values} {type(values[0])} {finite}")
                parsed_initial_parameters[:, index] = values

    log.debug(f"parsed initial parameters after {parsed_initial_parameters}")

    kwds = dict()

    # The truth value of a multi-element array is ambiguous (it raises a ValueError), so
    # convert arrays to lists before any truthiness test.
    if isinstance(frozen_parameters, np.ndarray):
        frozen_parameters = frozen_parameters.tolist()
    frozen_parameters = (frozen_parameters or dict())

    if frozen_parameters:
        # Ensure we have a dict-like thing.
        if isinstance(frozen_parameters, (list, tuple)):
            frozen_parameters = {
                sanitise_parameter_name(k): True for k in frozen_parameters
            }
        elif isinstance(frozen_parameters, dict):
            # Exclude things that have boolean False.
            frozen_parameters = {
                sanitise_parameter_name(k): v for k, v in frozen_parameters.items()
                if not (isinstance(v, bool) and not v)
            }
        else:
            raise TypeError(f"frozen_parameters must be list-like or dict-like")

        unknown_parameters = set(frozen_parameters).difference(compare_parameter_names)
        if unknown_parameters:
            raise ValueError(
                f"unknown parameter(s): {unknown_parameters} (available: {parameter_names})"
            )

        # One-indexed indices of the parameters that are free to vary.
        indices = [
            i for i, pn in enumerate(compare_parameter_names, start=1)
            if pn not in frozen_parameters
        ]
        if len(indices) == 0:
            raise ValueError(f"all parameters frozen?!")

        # Over-ride initial values with the frozen ones if given.
        for parameter_name, value in frozen_parameters.items():
            if not isinstance(value, bool):
                log.debug(
                    f"Over-writing initial values for {parameter_name} with frozen value of {value}"
                )
                zero_index = compare_parameter_names.index(parameter_name)
                parsed_initial_parameters[:, zero_index] = value
    else:
        # No frozen parameters.
        indices = 1 + np.arange(len(parameter_names), dtype=int)

    # Build a frozen parameters dict for result metadata.
    parsed_frozen_parameters = {
        pn: (pn in frozen_parameters) for pn in compare_parameter_names
    }

    L = len(indices)
    kwds.update(
        ndim=headers["N_OF_DIM"],
        nov=L,
        indv=" ".join([f"{i:.0f}" for i in indices]),
        # We will always provide an initial guess, even if it is the grid mid point.
        init=0,
        indini=" ".join(["1"] * L)
    )

    # Now deal with names.
    if names is None:
        names = [f"{i:.0f}" for i in range(len(parsed_initial_parameters))]
    else:
        if len(names) != len(parsed_initial_parameters):
            raise ValueError(
                f"names and initial parameters does not match ({len(names)} != {len(parsed_initial_parameters)})"
            )

    # Let's check the initial values are all within the grid boundaries.
    lower_limit, upper_limit = _get_grid_limits(headers)
    try:
        _check_initial_parameters_within_grid_limits(
            parsed_initial_parameters, lower_limit, upper_limit, parameter_names
        )
    except ValueError as e:
        log.exception(f"Exception when checking initial parameters within grid boundaries:")
        log.critical(e, exc_info=True)

        if clip_initial_parameters_to_boundary_edges:
            log.info(
                f"Clipping initial parameters to boundary edges (use clip_initial_parameters_to_boundary_edges=False to raise exception instead)"
            )
            # Leave a margin from each edge, as a percentage of the extent in that dimension.
            clip = clip_epsilon_percent * (upper_limit - lower_limit) / 100.
            parsed_initial_parameters = np.round(
                np.clip(parsed_initial_parameters, lower_limit + clip, upper_limit - clip),
                3
            )
        else:
            raise

    return (kwds, names, parsed_initial_parameters, parsed_frozen_parameters)
def train(self, threads=None, op_method=None, op_strict=True, op_kwds=None, **kwargs):
    """
    Train the model.

    :param threads: [optional]
        The number of parallel threads to use.

    :param op_method: [optional]
        The optimization algorithm to use: l_bfgs_b (default) and powell are available.

    :param op_strict: [optional]
        Default to Powell's optimization method if BFGS fails.

    :param op_kwds:
        Keyword arguments to provide directly to the optimization function.

    :returns:
        A three-length tuple containing the spectral coefficients `theta`, the squared
        scatter term at each pixel `s2`, and metadata related to the training of each pixel.

    :raises TypeError:
        If the training set spectra were not saved with the model.
    """

    kwds = dict(op_method=op_method, op_strict=op_strict, op_kwds=op_kwds)
    kwds.update(kwargs)

    if self.training_set_flux is None or self.training_set_ivar is None:
        raise TypeError("cannot train: training set spectra not saved with the model")

    S, P = self.training_set_flux.shape
    T = self.design_matrix.shape[1]

    log.info("Training {0}-label {1} with {2} stars and {3} pixels/star".format(
        len(self.vectorizer.label_names), type(self).__name__, S, P))

    # Parallelise out.
    if threads in (1, None):
        mapper, pool = (map, None)
    else:
        pool = mp.Pool(threads)
        mapper = pool.map

    # The pool must be released even if fitting a pixel raises an exception.
    try:
        func = Wrapper(fitting.fit_pixel_fixed_scatter, None, kwds, P)

        meta = []
        theta = np.nan * np.ones((P, T))
        s2 = np.nan * np.ones(P)

        # NOTE(review): the mapper is given a single pixel at a time, so pixels are still
        # fit one after another even when a pool is in use.
        for pixel, (flux, ivar) in enumerate(zip(self.training_set_flux.T, self.training_set_ivar.T)):
            args = (
                flux,
                ivar,
                self._initial_theta(pixel),
                self._censored_design_matrix(pixel),
                self._pixel_access(self.regularization, pixel, 0.0),
                None
            )
            (pixel_theta, pixel_s2, pixel_meta), = mapper(func, [args])

            meta.append(pixel_meta)
            theta[pixel], s2[pixel] = (pixel_theta, pixel_s2)

        self._theta, self._s2 = (theta, s2)

    finally:
        if pool is not None:
            pool.close()
            pool.join()

    return (theta, s2, meta)
upstream_pk = instance.parameters.get("upstream_pk", None) if upstream_pk is None: raise ValueError(f"cannot do median filter correction because no upstream_pk parameter for {instance}") upstream_pk = literal_eval(upstream_pk) # There could be many upstream tasks listed, so we should get the matching one. q = session.query(astradb.TaskInstance)\ .filter(astradb.TaskInstance.pk.in_(upstream_pk))\ .filter(astradb.TaskInstance.task_id.like(median_filter_correction_from_task_id_like)) upstream_instance = q.one_or_none() if upstream_instance is None: raise RuntimeError(f"cannot find upstream instance in {upstream_pk} matching {median_filter_correction_from_task_id_like}") log.info(f"Applying median filtered correction\n\tto {instance}\n\tfrom {upstream_instance}") upstream_path = utils.output_data_product_path(upstream_instance.pk) with open(upstream_path, "rb") as fp: result, data = pickle.load(fp) # Need number of pixels from header n_pixels = [header["NPIX"] for header in utils.read_ferre_headers(utils.expand_path(instance.parameters["header_path"]))][1:] # Get the segment indices using the data mask and the known number of pixels. indices = 1 + np.cumsum(data["mask"]).searchsorted(np.cumsum(n_pixels)) segment_indices = np.vstack([indices - n_pixels, indices]).T cont = median_filtered_correction( wavelength=data["wavelength"], # TODO: Check this median filtered correction.
def run(self):
    """
    Execute this task.

    The model is loaded once, and then every task in the batch that is not already complete
    is analysed. Outputs are written to whichever of the `AstraSource` and `database`
    targets the task has.
    """

    # Load the model.
    log.info(f"Loading model for {self}")
    state = testing.load_state(self.input()["model"].path)

    # We can run this in batch mode.
    label_names = state["label_names"]
    batch_size = self.get_batch_size()
    timed_tasks = timer(self.get_batch_tasks())

    for init, task in tqdm(timed_tasks, total=batch_size, desc="The Payne"):
        if task.complete():
            continue

        spectrum, continuum, normalized_flux, normalized_ivar = task.prepare_observation()

        p_opt, p_cov, model_flux, meta = testing.test(
            spectrum.wavelength.value,
            normalized_flux,
            normalized_ivar,
            **state
        )

        results = dict(zip(label_names, p_opt.T))

        # Note: we count the number of label names here in case we are sometimes using
        # radial velocity determination or not, before we add in the SNR.
        n_labels = len(results)

        # Add in uncertainties on parameters, from the diagonal of each covariance matrix.
        diagonal = np.arange(n_labels)
        uncertainties = np.sqrt(p_cov[:, diagonal, diagonal].T)
        for label_name, uncertainty in zip(label_names, uncertainties):
            results[f"u_{label_name}"] = uncertainty

        # Add in SNR values for convenience.
        results["snr"] = spectrum.meta["snr"]

        # Write AstraSource object.
        if "AstraSource" in task.output():
            task.output()["AstraSource"].write(
                spectrum=spectrum,
                normalized_flux=normalized_flux,
                normalized_ivar=normalized_ivar,
                continuum=continuum,
                model_flux=model_flux,
                # TODO: Project uncertainties to flux space.
                model_ivar=None,
                results_table=Table(results)
            )

        # Write output to database.
        if "database" in task.output():
            task.output()["database"].write(results)

        # Trigger this event as complete, and record task duration.
        elapsed = time() - init
        task.trigger_event_processing_time(elapsed, cascade=True)

    return None
def export_to_table(output_path, overwrite=True):
    """
    Export the APOGEENet database results to a table.

    One row is produced per spectrum. Each row repeats the task-level
    metadata (task instance key, run identifier and the task parameters)
    and holds that spectrum's entry from each of the per-spectrum result
    arrays.

    :param output_path:
        The disk location where to write the table to. A leading `~` and
        any environment variables in the path are expanded.

    :param overwrite: [optional]
        Overwrite any existing file at `output_path` (default: True). If
        False and the path already exists, an `OSError` is raised before
        the database is queried.

    :returns:
        The ordered dictionary of column names to lists of values that was
        used to build the table.

    :raises OSError:
        If `output_path` exists and `overwrite` is False.
    """
    output_path = os.path.expandvars(os.path.expanduser(output_path))
    if not overwrite and os.path.exists(output_path):
        raise OSError(f"path '{output_path}' already exists and asked not to overwrite it")

    # Collapse the parameters of each task instance into a single JSON object
    # per result, keyed by parameter name.
    sq = session.query(
            astradb.ApogeeNet.output_pk.label("output_pk"),
            func.json_object_agg(
                astradb.Parameter.parameter_name,
                astradb.Parameter.parameter_value
            ).label("parameters")
        )\
        .filter(astradb.ApogeeNet.output_pk == astradb.TaskInstance.output_pk)\
        .filter(astradb.TaskInstance.pk == astradb.TaskInstanceParameter.ti_pk)\
        .filter(astradb.TaskInstanceParameter.parameter_pk == astradb.Parameter.pk)\
        .group_by(astradb.ApogeeNet)\
        .subquery(with_labels=True)

    # Each row: the task instance, its result, the number of spectra in the
    # result (length of the snr array), and the aggregated parameters.
    q = session.query(
            astradb.TaskInstance,
            astradb.ApogeeNet,
            func.cardinality(astradb.ApogeeNet.snr),
            sq.c.parameters
        )\
        .filter(sq.c.output_pk == astradb.ApogeeNet.output_pk)\
        .filter(sq.c.output_pk == astradb.TaskInstance.output_pk)

    # Only an estimate for the progress bar: it is summed over all results
    # without the joins above, and is None when there are no results at all.
    total, = session.query(func.sum(func.cardinality(astradb.ApogeeNet.snr))).first()

    table_columns = OrderedDict([
        ("ti_pk", []),
        ("run_id", []),
        ("release", []),
        ("apred", []),
        ("field", []),
        ("healpix", []),
        ("telescope", []),
        ("obj", []),
        ("spectrum_index", []),
    ])

    column_names = ("snr", "teff", "u_teff", "logg", "u_logg", "fe_h", "u_fe_h", "bitmask_flag")
    for cn in column_names:
        table_columns[cn] = []

    with tqdm(total=total, unit="spectra") as pb:
        for task_instance, result, N, parameters in q.yield_per(1):
            # N is None when the snr array is NULL, and 0 when it is empty;
            # in both cases there are no spectra to export for this result.
            if not N:
                continue

            # Values shared by every spectrum of this result: look them up
            # once instead of once per spectrum.
            shared = (
                ("ti_pk", result.ti_pk),
                ("run_id", task_instance.run_id),
                ("release", parameters["release"]),
                ("apred", parameters["apred"]),
                ("field", parameters.get("field", "")),
                ("healpix", parameters.get("healpix", "")),
                ("telescope", parameters["telescope"]),
                ("obj", parameters["obj"]),
            )
            for name, value in shared:
                table_columns[name].extend([value] * N)

            table_columns["spectrum_index"].extend(range(N))

            for column_name in column_names:
                values = getattr(result, column_name)
                # Index explicitly (rather than slice) so that an array
                # shorter than the snr array raises instead of silently
                # producing columns of unequal length.
                table_columns[column_name].extend(values[i] for i in range(N))

            pb.update(N)

    # Report the number of rows actually collected, not the estimate.
    n_rows = len(table_columns["spectrum_index"])
    log.info(f"Creating table with {n_rows} rows")
    table = Table(data=table_columns)
    log.info("Table created.")

    log.info(f"Writing to {output_path}")
    table.write(output_path, overwrite=overwrite)
    log.info("Done")

    return table_columns