def __query_sdss5_data_model_identifiers_from_database(self, mjd_start, mjd_end):
    """
    Yield ApStar data model identifiers from the SDSS-V APOGEE DRP star table.

    :param mjd_start:
        Inclusive lower bound applied to `Star.mjdend`.

    :param mjd_end:
        Exclusive upper bound applied to `Star.mjdend`.

    :returns:
        A generator of dictionaries: one key per selected column, plus the
        constant `release`, `filetype` and `apstar` entries.
    """
    Star = apogee_drpdb.Star
    constants = dict(release="sdss5", filetype="apStar", apstar="stars")

    columns = (
        Star.apred_vers.label("apred"),  # TODO: Raise with Nidever
        Star.healpix,
        Star.telescope,
        Star.apogee_id.label("obj"),  # TODO: Raise with Nidever
    )

    if self._skip_sources_with_more_recent_observations:
        # Find the largest `mjdend` per group of identifier columns, then join
        # back so that only the row carrying that maximum is selected.
        latest = session.query(
            *columns,
            func.max(Star.mjdend).label('max_mjdend'),
        ).group_by(*columns).subquery()
        is_latest = and_(
            Star.mjdend == latest.c.max_mjdend,
            Star.apred_vers == latest.c.apred,
            Star.healpix == latest.c.healpix,
            Star.telescope == latest.c.telescope,
            Star.apogee_id == latest.c.obj,
        )
        query = session.query(*columns).join(latest, is_latest)
    else:
        query = session.query(*columns).distinct(*columns)

    # Restrict to the half-open window mjd_start <= mjdend < mjd_end.
    query = query.filter(Star.mjdend < mjd_end)
    query = query.filter(Star.mjdend >= mjd_start)

    extra_filters = self._query_filter_by_kwargs
    if extra_filters is not None:
        query = query.filter_by(**extra_filters)

    if self._limit is not None:
        query = query.limit(self._limit)

    log.debug(f"Preparing query {query}")
    total = query.count()
    log.debug(f"Retrieved {total} rows between {mjd_start} <= MJD < {mjd_end}")

    names = [column.name for column in columns]
    for row in query.yield_per(1):
        identifiers = dict(zip(names, row))
        identifiers.update(constants)
        yield identifiers
def add_task_instance_parameter(task_instance, key, value):
    """
    Attach a key/value parameter to a task instance.

    :param task_instance:
        The task instance that should receive the parameter.

    :param key:
        The parameter name.

    :param value:
        The parameter value.

    :returns:
        The primary key of the parameter row given by
        `get_or_create_parameter_pk`.
    """
    # The second element of the pair is not needed here.
    parameter_pk, _ = get_or_create_parameter_pk(key, value)

    with session.begin():
        # TODO: Check if the task instance already has this parameter.
        link = astradb.TaskInstanceParameter(
            ti_pk=task_instance.pk,
            parameter_pk=parameter_pk,
        )
        session.add(link)

    log.debug(
        f"Added key/value pair {key}: {value} to task instance {task_instance}"
    )
    return parameter_pk
def del_task_instance_parameter(task_instance, key):
    """
    Remove the parameter named `key` from a task instance.

    If the task instance has no parameter with that name, nothing is deleted.

    :param task_instance:
        The task instance to remove the parameter from.

    :param key:
        The name of the parameter to remove.

    :returns:
        `True`.

    :raises AssertionError:
        If `key` is still present in `task_instance.parameters` afterwards.
    """
    try:
        value = task_instance.parameters[key]
    except KeyError:
        # That key isn't in there, so there is nothing to remove.
        pass
    else:
        # Resolve the primary key of the (key, value) parameter row.
        parameter_pk, _ = get_or_create_parameter_pk(key, value)

        # Find the row linking this task instance to that parameter.
        link = session.query(astradb.TaskInstanceParameter).filter(
            astradb.TaskInstanceParameter.ti_pk == task_instance.pk,
            astradb.TaskInstanceParameter.parameter_pk == parameter_pk,
        ).one_or_none()

        if link is None:
            # `one_or_none()` found no row; passing `None` on to
            # `session.delete` would fail with an unrelated-looking error.
            log.warning(
                f"No link found between task instance {task_instance} and "
                f"parameter {key}: {value} (parameter_pk={parameter_pk}); nothing deleted"
            )
        else:
            # NOTE(review): unlike `add_task_instance_parameter`, this delete is
            # not wrapped in `session.begin()` -- confirm that is intended.
            session.delete(link)
            log.debug(
                f"Removed key/value pair {key}: {value} from task instance {task_instance}"
            )

    assert key not in task_instance.parameters, (
        f"parameter {key!r} is still attached to task instance {task_instance}"
    )
    return True
def query_data_model_identifiers_from_database(self, context):
    """
    Yield ApStar data model identifiers for star rows whose `created` value
    lies between the previous and current execution dates.

    :param context:
        The execution context; its `prev_ds` and `ds` entries bound the query.

    :returns:
        A generator of dictionaries: one key per selected column, plus the
        constant `release`, `filetype` and `apstar` entries.
    """
    Star = apogee_drpdb.Star
    constants = dict(release="sdss5", filetype="apStar", apstar="stars")

    prev_ds = context["prev_ds"]
    ds = context["ds"]

    # TODO: Assuming we are only using SDSS-V data here.
    columns = (
        Star.apred_vers.label("apred"),  # TODO: Raise with Nidever
        Star.healpix,
        Star.telescope,
        Star.apogee_id.label("obj"),  # TODO: Raise with Nidever
    )

    query = session.query(*columns)
    query = query.filter(Star.created.between(prev_ds, ds))

    extra_filters = self._query_filter_by_kwargs
    if extra_filters is not None:
        query = query.filter_by(**extra_filters)

    if self._limit is not None:
        query = query.limit(self._limit)

    log.debug(f"Preparing query {query}")
    total = query.count()
    log.debug(f"Retrieved {total} rows between {prev_ds} and {ds}")

    names = [column.name for column in columns]
    for row in query.yield_per(1):
        identifiers = dict(zip(names, row))
        identifiers.update(constants)
        yield identifiers
def _init_progressbar(self, N, message=None):
    """
    Initialise a progressbar.

    :param N:
        The number of items that will be iterated over. If negative, only
        `self.N` is set and nothing else is initialised.

    :param message: [optional]
        An information message to log before showing the progressbar.
    """
    self.N = int(N)
    if self.N < 0:
        return

    try:
        # Close the pipe deterministically instead of leaving it to the
        # garbage collector.
        with os.popen('stty size', 'r') as pipe:
            _, columns = pipe.read().split()
    except Exception:
        # e.g. no attached terminal: `stty` prints nothing and the unpacking
        # fails. `Exception` (not a bare except) lets Ctrl-C propagate.
        log.debug("Couldn't get screen size. Progressbar may look odd.")
        self.W = 100
    else:
        # Cap at 100; the subtracted term scales with the digit count of N
        # (presumably room for the counters drawn beside the bar).
        self.W = min(100, int(columns) - (12 + 21 + 2 * len(str(self.N))))

    self.t_init = time()
    self.message = message

    # Negative N already returned above, so this is the N == 0 case.
    if self.N == 0:
        return None

    if message is not None:
        log.info(message.rstrip())

    sys.stdout.flush()
    with _counter_lock:
        _counter.value = 0
def query_data_model_identifiers_from_database(self, context):
    """
    Yield BOSS spectrum data model identifiers from the SDSS-V database.

    :param context:
        The Airflow DAG execution context. Its `prev_ds` and `ds` entries are
        converted with `parse_as_mjd` and used as the inclusive start and
        exclusive end of the MJD window.

    :returns:
        A generator of dictionaries holding `release`, `filetype` and one key
        per selected column.
    """
    release, filetype = ("SDSS5", "spec")
    Spall = catalogdb.SDSSVBossSpall

    mjd_start = parse_as_mjd(context["prev_ds"])
    mjd_end = parse_as_mjd(context["ds"])

    columns = (
        Spall.catalogid,
        Spall.run2d,
        Spall.plate,
        Spall.mjd,
        Spall.fiberid,
    )

    query = session.query(*columns).distinct(*columns)
    query = query.filter(Spall.mjd >= mjd_start)
    query = query.filter(Spall.mjd < mjd_end)

    extra_filters = self._query_filter_by_kwargs
    if extra_filters is not None:
        query = query.filter_by(**extra_filters)

    if self._limit is not None:
        query = query.limit(self._limit)

    log.debug(f"Found {query.count()} {release} {filetype} files between MJD {mjd_start} and {mjd_end}")

    names = [column.name for column in columns]
    for row in query.yield_per(1):
        # Constant entries first, then the per-row identifiers.
        identifiers = dict(release=release, filetype=filetype)
        identifiers.update(zip(names, row))
        yield identifiers
def query_sdss4_dr16_data_model_identifiers_from_database(self, mjd_start, mjd_end):
    """
    Yield SDSS-IV (DR16) ApStar data model identifiers from the SDSS database.

    :param mjd_start:
        Inclusive lower bound on the visit MJD (or on the maximum visit MJD
        when sources with more recent observations are being skipped).

    :param mjd_end:
        Exclusive upper bound on the same MJD quantity.

    :returns:
        A generator of dictionaries holding `release`, `filetype` and one key
        per identifier column.
    """
    release, filetype = ("DR16", "apStar")
    Star = catalogdb.SDSSDR16ApogeeStar
    Visit = catalogdb.SDSSDR16ApogeeVisit

    columns = (
        func.left(Star.file, 2).label("prefix"),
        Star.field,
        Star.apstar_version.label("apstar"),
        Star.telescope,
        Star.apogee_id.label("obj"),
        func.right(func.left(Star.file, 10), 3).label("apred"),
    )

    if self._skip_sources_with_more_recent_observations:
        # Get the max MJD of any observations.
        latest = session.query(
            *columns,
            func.max(Visit.mjd).label('max_mjd'),
        ).join(
            Visit,
            Visit.apogee_id == Star.apogee_id,
        ).group_by(*columns).subquery()

        mjd = latest.c.max_mjd
        query = session.query(*columns, mjd).join(
            latest,
            Star.apogee_id == latest.c.obj,
        )
    else:
        # The SDSSDR16ApogeeStar table does not have any MJD information,
        # so it is taken from the visit table.
        mjd = Visit.mjd
        query = session.query(*columns, mjd).distinct(*columns, mjd).join(
            Visit,
            Visit.apogee_id == Star.apogee_id,
        )

    query = query.filter(mjd < mjd_end)
    query = query.filter(mjd >= mjd_start)

    extra_filters = self._query_filter_by_kwargs
    if extra_filters is not None:
        query = query.filter_by(**extra_filters)

    if self._limit is not None:
        query = query.limit(self._limit)

    log.debug(
        f"Found {query.count()} {release} {filetype} files between MJD {mjd_start} and {mjd_end}"
    )

    names = [column.name for column in columns]
    for row in query.yield_per(1):
        identifiers = dict(release=release, filetype=filetype)
        # Each row has one more value (the MJD) than there are names; `zip`
        # stops at the shorter sequence, so the MJD is dropped.
        identifiers.update(zip(names, row))
        yield identifiers
def train_polynomial_model(labels, data, order=2, regularization=0, threads=1):
    """
    Train a polynomial Cannon model and write it to disk.

    :param labels:
        A mapping of label name to training-set label values, or a string
        representation of such a mapping.

    :param data:
        A mapping that provides `normalized_flux`, `normalized_ivar`, either
        `dispersion` or `wavelength`, and optionally `num_spectra`; or a path
        to an existing pickle file that contains such a mapping.

    :param order: [optional]
        The order passed to the polynomial vectorizer (default: 2).

    :param regularization: [optional]
        The regularization passed to the model (default: 0).

    :param threads: [optional]
        The number of threads passed to `model.train` (default: 1).

    :returns:
        The path that the trained model was written to.

    :raises ValueError:
        If `data` has neither a `dispersion` nor a `wavelength` entry.
    """
    log.debug(f'Inputs are: ({type(labels)}) {labels}')
    log.debug(f'Data are: {data}')

    # labels could be in JSON format.
    if isinstance(labels, str):
        # TODO: use a general deserializer that fixes the single quote issues with json loading
        labels = json.loads(labels.replace("'", '"'))

    if isinstance(data, str) and os.path.exists(data):
        # NOTE(review): unpickling executes arbitrary code; only point this at
        # training sets from a trusted source.
        with open(data, "rb") as fp:
            data = pickle.load(fp)

    dispersion_keys = ("dispersion", "wavelength")
    for key in dispersion_keys:
        try:
            dispersion = data[key]
        except KeyError:
            continue
        else:
            break
    else:
        # Name every key that was tried, not only the last one.
        raise ValueError(f"unable to find any of {dispersion_keys} in data")

    training_set_flux = data["normalized_flux"]
    training_set_ivar = data["normalized_ivar"]

    try:
        num_spectra = data["num_spectra"]
    except KeyError:
        # Only a missing key means "do not filter"; any other error is real.
        log.debug(
            "Keeping all items in training set; not checking for missing spectra."
        )
    else:
        # Keep only the objects that have exactly one spectrum.
        keep = (np.asarray(num_spectra) == 1)
        if not np.all(keep):
            log.warning(
                f"Excluding {np.sum(~keep)} objects from the training set that had missing spectra"
            )
            labels = {k: np.array(v)[keep] for k, v in labels.items()}
            training_set_flux = training_set_flux[keep]
            training_set_ivar = training_set_ivar[keep]

    # Set the vectorizer.
    vectorizer = tc.vectorizer.PolynomialVectorizer(
        labels.keys(),
        order=order,
    )

    # Initiate model.
    model = tc.model.CannonModel(labels,
                                 training_set_flux,
                                 training_set_ivar,
                                 vectorizer=vectorizer,
                                 dispersion=dispersion,
                                 regularization=regularization)

    model.train(threads=threads)

    output_path = os.path.join(get_base_output_path(), "thecannon", "model.pkl")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    log.info(f"Writing The Cannon model {model} to disk {output_path}")
    model.write(output_path, include_training_set_spectra=True, overwrite=True)
    return output_path
def branch(task_id_callable, task, ti, **kwargs):
    """
    Decide which downstream tasks should run next, based on the header paths
    attached to the primary keys returned by the upstream tasks.

    :param task_id_callable:
        A Python callable that takes the `header_path` (and, for grids where
        they cannot be parsed from the path, `telescope` and `lsf` keywords)
        and returns a task ID.

    :param task:
        The task being executed. This is supplied by the DAG context.

    :param ti:
        The task instance. This is supplied by the DAG context.

    :returns:
        A sorted list of unique task IDs that should execute next.
    """
    # Collect primary keys from every upstream task.
    pks = flatten([
        ti.xcom_pull(task_ids=upstream_task.task_id)
        for upstream_task in task.upstream_list
    ])

    log.debug(f"Upstream primary keys: {pks}")
    log.debug(f"Downstream task IDs: {task.downstream_list}")

    # Look up the header path parameter for each of the primary keys.
    # TODO: This query could fail if the number of primary keys provided
    #       is yuuge. May consider changing this query.
    TaskInstanceParameter = astradb.TaskInstanceParameter
    Parameter = astradb.Parameter
    query = session.query(TaskInstanceParameter.ti_pk, Parameter.parameter_value)
    query = query.join(
        TaskInstanceParameter,
        TaskInstanceParameter.parameter_pk == Parameter.pk
    )
    query = query.filter(Parameter.parameter_name == "header_path")
    query = query.filter(TaskInstanceParameter.ti_pk.in_(pks))

    log.debug(f"Found:")

    downstream_task_ids = []
    for pk, header_path in query.all():
        log.debug(f"\t{pk}: {header_path}")

        telescope, lsf, spectral_type_desc = utils.task_id_parts(header_path)

        if telescope is not None or lsf is not None:
            next_task_id = task_id_callable(header_path)
        else:
            # Special hack for BA grids, where telescope/lsf information cannot be found from header path.
            # TODO: Consider removing this hack entirely. This could be fixed by symbolicly linking the BA grids to locations
            #       for each telescope/fibre combination.
            instance = session.query(astradb.TaskInstance)\
                              .filter(astradb.TaskInstance.pk == pk).one_or_none()

            tree = SDSSPath(release=instance.parameters["release"])
            path = tree.full(**instance.parameters)
            header = getheader(path)

            # TODO: This is matching the telescope styling in utils.task_id_parts, but these should have a common place.
            telescope_name = instance.parameters["telescope"].upper()[:3]
            lsf_name = utils.get_lsf_grid_name(header["MEANFIB"])
            next_task_id = task_id_callable(
                header_path,
                telescope=telescope_name,
                lsf=lsf_name
            )

        downstream_task_ids.append(next_task_id)
        log.debug(f"\t\tadded {downstream_task_ids[-1]}")

    # De-duplicate and give a deterministic order.
    downstream_task_ids = sorted(set(downstream_task_ids))

    log.debug(f"Downstream tasks to execute:")
    for task_id in downstream_task_ids:
        log.debug(f"\t{task_id}")

    return downstream_task_ids
def execute(self, context):
    """
    Run FERRE on the spectra of this operator's task instances, store the
    results in the database, and pickle the intermediate outputs.

    :param context:
        The Airflow DAG context.

    :returns:
        The primary keys that were worked on (`self.pks`).
    """
    # Load spectra, skipping instances that have none.
    instances = []
    Ns = []
    wavelength_blocks = []
    flux_blocks = []
    sigma_blocks = []
    spectrum_meta = []

    for instance, path, spectrum in self.prepare_data():
        if spectrum is None:
            continue

        N, _ = spectrum.flux.shape
        # Repeat the wavelength array once per spectrum so that it has the
        # same number of rows as the flux array.
        wavelength_blocks.append(
            np.tile(spectrum.wavelength.value, N).reshape((N, -1)))
        flux_blocks.append(spectrum.flux.value)
        sigma_blocks.append(spectrum.uncertainty.array**-0.5)
        spectrum_meta.append(dict(snr=spectrum.meta["snr"]))

        Ns.append(N)
        instances.append(instance)

    Ns = np.array(Ns, dtype=int)
    wavelength = np.vstack(wavelength_blocks)
    flux = np.vstack(flux_blocks)
    sigma = np.vstack(sigma_blocks)

    # Create names for easy debugging in FERRE outputs.
    names = create_names(
        instances, Ns, "{star_index}_{telescope}_{obj}_{spectrum_index}")

    # Load initial parameters for every spectrum.
    initial_parameters = create_initial_parameters(instances, Ns)

    # Working directory for this task.
    directory = os.path.join(
        get_base_output_path(), "ferre", "tasks",
        f"{context['ds']}-{context['dag'].dag_id}-{context['task'].task_id}-{context['run_id']}"
    )
    os.makedirs(directory, exist_ok=True)
    log.info(f"Working directory for task is {directory}")

    # Prepare FERRE: the data arrays and names come from above, everything
    # else is copied from the same-named attribute on this operator.
    input_kwds = dict(
        wavelength=wavelength,
        flux=flux,
        sigma=sigma,
        header_path=self.header_path,
        names=names,
        initial_parameters=initial_parameters,
    )
    operator_attributes = (
        "frozen_parameters",
        "interpolation_order",
        "input_weights_path",
        "input_lsf_shape_path",
        "lsf_shape_flag",
        "error_algorithm_flag",
        "wavelength_interpolation_flag",
        "optimization_algorithm_flag",
        "continuum_flag",
        "continuum_order",
        "continuum_segment",
        "continuum_reject",
        "continuum_observations_flag",
        "full_covariance",
        "pca_project",
        "pca_chi",
        "n_threads",
        "f_access",
        "f_format",
        "ferre_kwargs",
    )
    for attribute in operator_attributes:
        input_kwds[attribute] = getattr(self, attribute)

    args = prepare_ferre(directory, input_kwds)

    # Execute, either by slurm or whatever.
    log.debug(f"FERRE ready to roll in {directory}")
    assert self.slurm_kwargs
    self.execute_by_slurm(
        context,
        bash_command="/uufs/chpc.utah.edu/common/home/sdss09/software/apogee/Linux/apogee/trunk/bin/ferre.x",
        directory=directory,
    )
    # Unbelievably, FERRE sends a '1' exit code every time it is executed. Even if it succeeds.
    # TODO: Ask Carlos or Jon to remove this insanity.

    # Parse outputs.
    # TODO: clean up this function
    param, param_err, output_meta = parse_ferre_outputs(
        directory, self.header_path, *args)

    grouped = group_results_by_instance(param, param_err, output_meta,
                                        spectrum_meta, Ns)

    for instance, (result, data) in zip(instances, grouped):
        if result is None:
            continue

        create_task_output(instance, astradb.Ferre, **result)

        log.debug(f"{instance}")
        log.debug(f"{result}")
        log.debug(f"{data}")

        # TODO: Write a data model product for this intermediate output!
        output_path = utils.output_data_product_path(instance.pk)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "wb") as fp:
            pickle.dump((result, data), fp)

        log.info(
            f"Wrote outputs of task instance {instance} to {output_path}")

    # Always return the primary keys that were worked on!
    return self.pks
def data_model_identifiers(self, context):
    """
    Yield parameter dictionaries for upstream task instances whose header
    path matches this operator's header path.

    :param context:
        The execution context; its `task` and `ti` entries are used to find
        the primary keys returned by upstream tasks.

    :returns:
        A generator of parameter dictionaries containing the data model
        identifiers, the initial values, and an `upstream_pk` list.

    :raises AirflowSkipException:
        If no upstream primary keys could be identified.
    """
    task = context["task"]
    ti = context["ti"]
    pks = []

    # Walk the upstream tasks. Whenever a branch operator is met, restart the
    # walk from that operator's own upstream tasks; stop once a full pass
    # meets no branch operator.
    jumped = True
    while jumped:
        jumped = False
        for upstream_task in task.upstream_list:
            log.debug(f"Considering {upstream_task}")
            if isinstance(upstream_task, BranchPythonOperator):
                # Jump over branch operators
                log.debug(
                    f"Jumping over BranchPythonOperator {upstream_task}")
                task = upstream_task
                jumped = True
                break

            log.debug(
                f"Using upstream results from {upstream_task} ({upstream_task.task_id}) and {ti}"
            )
            pulled = ti.xcom_pull(task_ids=upstream_task.task_id)
            if pulled is not None:
                pks.extend(pulled)

    pks = flatten(pks)
    if not pks:
        # This can happen if the BA stellar parameters is executed (because we all all branches to be skipped),
        # but everything else was skipped.
        raise AirflowSkipException(f"No upstream primary keys identified.")

    log.debug(f"From pks: {pks}")
    log.debug(f"That also match {self.header_path}")

    # Restrict to primary keys that have the same header path.
    TaskInstanceParameter = astradb.TaskInstanceParameter
    Parameter = astradb.Parameter
    matching = session.query(TaskInstanceParameter.ti_pk)
    matching = matching.distinct(TaskInstanceParameter.ti_pk)
    matching = matching.join(
        Parameter,
        TaskInstanceParameter.parameter_pk == Parameter.pk
    )
    matching = matching.filter(Parameter.parameter_name == "header_path")
    matching = matching.filter(Parameter.parameter_value == self.header_path)
    matching = matching.filter(TaskInstanceParameter.ti_pk.in_(pks))

    log.debug(f"Restricting to primary keys:")

    def first_or_none(item):
        # First element of an output column, or None if the column is None.
        if item is None:
            return None
        return item[0]

    # Output attribute names; each maps to an `initial_<name>` parameter.
    label_names = (
        "teff",
        "logg",
        "metals",
        "log10vdop",
        "o_mg_si_s_ca_ti",
        "lgvsini",
        "c",
        "n",
    )

    trees = {}

    for (pk,) in matching.all():
        instance = session.query(astradb.TaskInstance)\
                          .filter(astradb.TaskInstance.pk == pk)\
                          .one_or_none()

        log.debug(f"{instance} with {instance.output}")

        release = instance.parameters["release"]
        filetype = instance.parameters["filetype"]
        parameters = dict(release=release, filetype=filetype)

        tree = trees.get(release, None)
        if tree is None:
            tree = trees[release] = SDSSPath(release=release)

        # Copy across the identifiers needed to locate this file type.
        for key in tree.lookup_keys(filetype):
            parameters[key] = instance.parameters[key]

        # What other information should we pass on?
        if instance.output is not None:
            # There is an upstream FerreOperator.
            log.debug(
                f"Taking previous result in {pk} as initial result here")

            # Take final teff/logg/etc as the initial values for this task.
            # TODO: Query whether we should be taking first or none, because if
            #       we are running all visits we may want to use individual visit
            #       results from the previous iteration
            for label_name in label_names:
                parameters[f"initial_{label_name}"] = first_or_none(
                    getattr(instance.output, label_name))
        else:
            # Only pass on the data model identifiers, and any initial values.
            # Let everything else be specified in this operator
            for label_name in label_names:
                key = f"initial_{label_name}"
                parameters[key] = instance.parameters[key]

        # Store upstream primary key as a parameter, too.
        # We could decide not to do this, but it makes it much easier to find
        # upstream tasks.
        parameters.setdefault("upstream_pk", [])
        if "upstream_pk" in instance.parameters:
            try:
                upstream_pk = literal_eval(
                    instance.parameters["upstream_pk"])
                parameters["upstream_pk"].extend(upstream_pk)
            except:
                log.exception(
                    f"Cannot add upstream primary keys from {instance}: {instance.parameters['upstream_pk']}"
                )

        parameters["upstream_pk"].append(pk)

        yield parameters
def prepare_ferre(directory, input_kwds):
    """
    Parse the FERRE inputs and write the control file, the flux and
    uncertainty data files, and the initial-parameter file into `directory`.

    :param directory:
        The directory to write the FERRE input files to.

    :param input_kwds:
        A dictionary of keywords passed to `utils.parse_ferre_inputs`. It
        must contain `initial_parameters` and `frozen_parameters`.

    :returns:
        The parsed inputs, exactly as returned by `utils.parse_ferre_inputs`.
    """
    def as_json(item):
        return json.dumps(item, indent=2, cls=NumpyEncoder)

    def in_directory(basename):
        return os.path.join(directory, basename)

    log.debug(f"Parameters supplied to FERRE:")
    log.debug(as_json(
        (input_kwds["initial_parameters"], input_kwds["frozen_parameters"])
    ))

    # Parse and validate parameters.
    parsed_kwds = utils.parse_ferre_inputs(**input_kwds)
    (
        wavelength, flux, sigma, mask, names, initial_parameters, kwds, meta
    ) = parsed_kwds

    log.debug(f"Parameters after parsing FERRE:")
    log.debug(f"Initial parameters: {as_json(initial_parameters)}")
    log.debug(f"Keywords: {as_json(kwds)}")
    log.debug(f"Meta: {as_json(meta)}")
    log.debug(f"Names: {as_json(names)}")

    # Write control file.
    control_keywords = utils.format_ferre_control_keywords(kwds)
    with open(in_directory("input.nml"), "w") as fp:
        fp.write(control_keywords)

    # Write data arrays, keeping only the masked-in columns.
    utils.write_data_file(flux[:, mask], in_directory(kwds["ffile"]))
    utils.write_data_file(sigma[:, mask], in_directory(kwds["erfile"]))

    # Write initial values, one entry per named spectrum.
    with open(in_directory(kwds["pfile"]), "w") as fp:
        for name, point in zip(names, initial_parameters):
            entry = utils.format_ferre_input_parameters(*point, name=name)
            fp.write(entry)

    return parsed_kwds
def get_best_result(task, ti, **kwargs):
    """
    When there are numerous FERRE tasks that are upstream, this function will
    return the primary keys of the task instances that gave the best result on
    a per-observation basis.

    :param task:
        The task being executed; its upstream tasks supply the candidate
        primary keys, and the second dot-separated part of its task ID is
        taken as the expected telescope short name.

    :param ti:
        The task instance, used to pull upstream results.

    :returns:
        A list of primary keys, one per observation that had a usable result.

    :raises AirflowSkipException:
        If no candidate results were found at all.
    """
    # Get the PKs from upstream.
    log.debug(f"Upstream tasks: {task.upstream_list}")
    pks = flatten([
        ti.xcom_pull(task_ids=upstream_task.task_id)
        for upstream_task in task.upstream_list
    ])

    log.debug(f"Getting best initial guess among primary keys {pks}")

    # Need to uniquely identify observations.
    param_bit_mask = bitmask.ParamBitMask()
    bad_grid_edge = (
        param_bit_mask.get_value("GRIDEDGE_WARN")
        | param_bit_mask.get_value("GRIDEDGE_BAD")
    )

    trees = {}
    best_tasks = {}
    for pk in pks:
        instance = session.query(astradb.TaskInstance)\
                          .filter(astradb.TaskInstance.pk == pk)\
                          .one_or_none()

        if instance is None:
            # `one_or_none()` found no row for this primary key; skip it
            # rather than fail on the attribute access below.
            log.warning(f"No task instance found with primary key {pk}")
            continue

        if instance.output is None:
            log.warning(f"No output found for task instance {instance}")
            continue

        p = instance.parameters

        # Check that the telescope is the same as what we expect from this task ID.
        # This is a bit of a hack. Let us explain.

        # The "BA" grid does not have a telescope/fiber model, so you can run LCO and APO
        # data through the initial-BA grid. And those outputs go to the "get_best_results"
        # for each of the APO and LCO tasks (e.g., this function).
        # If there is only APO data, then the LCO "get_best_result" will only have one
        # input: the BA results. Then it will erroneously think that's the best result
        # for that source.

        # It's hacky to put this logic in here. It should be in the DAG instead. Same
        # thing for parsing 'telescope' name in the DAG (eg 'APO') from 'apo25m'.
        this_telescope_short_name = p["telescope"][:3].upper()
        expected_telescope_short_name = task.task_id.split(".")[1]
        log.info(f"For instance {instance} we have {this_telescope_short_name} and {expected_telescope_short_name}")
        if this_telescope_short_name != expected_telescope_short_name:
            continue

        try:
            tree = trees[p["release"]]
        except KeyError:
            tree = trees[p["release"]] = SDSSPath(release=p["release"])

        # One key per observation: release, file type and the lookup keys.
        key = "_".join([
            p['release'],
            p['filetype'],
            *[p[k] for k in tree.lookup_keys(p['filetype'])]
        ])

        best_tasks.setdefault(key, (np.inf, None))

        # TODO: Confirm that this is base10 log. This should also be 'log_reduced_chisq_fit',
        #       according to the documentation.
        log_chisq_fit, *_ = instance.output.log_chisq_fit
        previous_teff, *_ = instance.output.teff
        bitmask_flag, *_ = instance.output.bitmask_flag

        log.debug(f"Result {instance} {instance.output} with log_chisq_fit = {log_chisq_fit} and {previous_teff} and {bitmask_flag}")

        # Note: If FERRE totally fails then it will assign -999 values to the log_chisq_fit. So we have to
        #       check that the log_chisq_fit is actually sensible!
        #       (Or we should only query task instances where the output is sensible!)
        if log_chisq_fit < 0:  # TODO: This is a hack.
            log.debug(f"Skipping result for {instance} {instance.output} as log_chisq_fit = {log_chisq_fit}")
            continue

        parsed_header = utils.parse_header_path(p["header_path"])

        # Penalise chi-sq in the same way they did for DR17.
        # See github.com/sdss/apogee/python/apogee/aspcap/aspcap.py#L658
        # The backslashes below are escaped so the logged text is unchanged
        # without relying on invalid escape sequences.
        if parsed_header["spectral_type"] == "GK" and previous_teff < 3900:
            log.debug("Increasing \\chisq because spectral type GK")
            log_chisq_fit += np.log10(10)

        bitmask_flag_logg, bitmask_flag_teff = bitmask_flag[-2:]
        if bitmask_flag_logg & bad_grid_edge:
            log.debug("Increasing \\chisq because logg flag is bad edge")
            log_chisq_fit += np.log10(5)

        if bitmask_flag_teff & bad_grid_edge:
            log.debug("Increasing \\chisq because teff flag is bad edge")
            log_chisq_fit += np.log10(5)

        # Is this the best so far?
        if log_chisq_fit < best_tasks[key][0]:
            log.debug(f"Assigning this output to best task as {log_chisq_fit} < {best_tasks[key][0]}: {pk}")
            best_tasks[key] = (log_chisq_fit, pk)

    for key, (log_chisq_fit, pk) in best_tasks.items():
        if pk is None:
            log.warning(f"No good task found for key {key}: ({log_chisq_fit}, {pk})")
        else:
            log.info(f"Best task for key {key} with \\chi^2 of {log_chisq_fit:.2f} is primary key {pk}")

    if not best_tasks:
        raise AirflowSkipException(f"no task outputs found from {len(pks)} primary keys")

    return [pk for (log_chisq_fit, pk) in best_tasks.values() if pk is not None]
def write_database_outputs(
        task,
        ti,
        run_id,
        element_from_task_id_callable=None,
        **kwargs
    ):
    """
    Collate outputs from upstream FERRE executions and write them to an
    ASPCAP database table.

    :param task:
        This task, as given by the Airflow context dictionary.

    :param ti:
        This task instance, as given by the Airflow context dictionary.

    :param run_id:
        This run ID, as given by the Airflow context dictionary.

    :param element_from_task_id_callable: [optional]
        A Python callable that returns the chemical element, given a task ID.
        If not given, the last dot-separated part of the task ID is used.

    :returns:
        The primary keys of the task instances that were created.
    """
    log.debug(f"Writing ASPCAP database outputs")

    pks = [
        ti.xcom_pull(task_ids=upstream_task.task_id)
        for upstream_task in task.upstream_list
    ]
    log.debug(f"Upstream primary keys: {pks}")

    label_names = ("teff", "logg", "metals", "log10vdop", "o_mg_si_s_ca_ti", "lgvsini", "c", "n")
    # Ignore situations where lgvsini was missing from grid and it screws up the task
    ignore = ("lgvsini", )

    # Group them together by source.
    instance_pks = []
    for source_pks in zip(*pks):

        # The one with the lowest primary key will be the stellar parameters.
        sp_pk, *abundance_pks = sorted(source_pks)

        TaskInstance = astradb.TaskInstance
        sp_instance = session.query(TaskInstance)\
                             .filter(TaskInstance.pk == sp_pk)\
                             .one_or_none()
        abundance_instances = session.query(TaskInstance)\
                                     .filter(TaskInstance.pk.in_(abundance_pks))\
                                     .all()

        # Get parameters that are in common to all instances.
        keep = {}
        for key, value in sp_instance.parameters.items():
            differs = any(
                other.parameters[key] != value for other in abundance_instances
            )
            if not differs:
                keep[key] = value

        # Create a task instance.
        instance = create_task_instance(
            dag_id=task.dag_id,
            task_id=task.task_id,
            run_id=run_id,
            parameters=keep
        )

        # Create a partial results table from the stellar parameter output:
        # the S/N, then each label followed by its uncertainty.
        keys = ["snr"]
        for label_name in label_names:
            keys.append(label_name)
            keys.append(f"u_{label_name}")

        results = {key: getattr(sp_instance.output, key) for key in keys}

        # Now update with elemental abundance instances.
        for el_instance in abundance_instances:

            if element_from_task_id_callable is None:
                element = el_instance.task_id.split(".")[-1].lower()
            else:
                element = element_from_task_id_callable(el_instance.task_id).lower()

            # Check what is not frozen.
            thawed_label_names = [
                label_name for label_name in label_names
                if label_name not in ignore
                and not getattr(el_instance.output, f"frozen_{label_name}")
            ]

            if len(thawed_label_names) > 1:
                log.warning(f"Multiple thawed label names for {element} {el_instance}: {thawed_label_names}")

            values = np.hstack([
                getattr(el_instance.output, ln) for ln in thawed_label_names
            ]).tolist()
            u_values = np.hstack([
                getattr(el_instance.output, f"u_{ln}") for ln in thawed_label_names
            ]).tolist()

            results[f"{element}_h"] = values
            results[f"u_{element}_h"] = u_values

        # Include associated primary keys so we can reference back to original parameters, etc.
        results["associated_ti_pks"] = [sp_pk, *abundance_pks]

        log.debug(f"Results entry: {results}")

        # Create an entry in the output interface table.
        # TODO: Should we link back to the original FERRE primary keys?
        output = create_task_output(
            instance,
            astradb.Aspcap,
            **results
        )
        log.debug(f"Created output {output} for instance {instance}")
        instance_pks.append(instance.pk)

    return instance_pks
def execute(self, context):
    """
    Create task instances for all the data model identifiers, which could
    include multiple task instances for each data model identifier set.

    :param context:
        The Airflow context dictionary.

    :returns:
        The primary keys of the task instances that were created.

    :raises AirflowSkipException:
        If no task instances were created.
    """
    # Get header information.
    grid_info = utils.parse_grid_information(self.header_paths)

    dag_id = context["dag"].dag_id
    task_id = context["task"].task_id
    run_id = context["run_id"]

    # Get parameters from the parent class initialisation that should also be stored.
    common_task_parameters = self.common_task_parameters()

    pks = []
    trees = {}

    for data_model_identifiers in self.data_model_identifiers(context):

        parameters = dict(common_task_parameters)
        parameters.update(data_model_identifiers)

        release = parameters["release"]
        tree = trees.get(release, None)
        if tree is None:
            tree = SDSSPath(release=release)
            trees[release] = tree

        path = tree.full(**parameters)

        # Generate initial guess(es).
        initial_guesses = []

        # From headers
        try:
            header = getheader(path)

            teff = safe_read_header(header, ("RV_TEFF", "RVTEFF"))
            logg = safe_read_header(header, ("RV_LOGG", "RVLOGG"))
            fe_h = safe_read_header(header, ("RV_FEH", "RVFEH"))

            # Get information relevant for matching initial guess and grids.
            header_guess = dict(
                telescope=parameters["telescope"],  # important for LSF information
                mean_fiber=header["MEANFIB"],  # important for LSF information
                teff=teff,
                logg=logg,
                metals=fe_h,
            )
            initial_guesses.append(header_guess)

        except:
            log.exception(f"Unable to load relevant headers from path {path}")
            continue

        # Add any other initial guesses? From Gaia? etc?
        for initial_guess in initial_guesses:
            suitable_grids = utils.yield_suitable_grids(grid_info, **initial_guess)
            for header_path, _ in suitable_grids:
                parameters.update(
                    header_path=header_path,
                    initial_teff=np.round(initial_guess["teff"], 0),
                    initial_logg=np.round(initial_guess["logg"], 3),
                    initial_metals=np.round(initial_guess["metals"], 3),
                    initial_log10vdop=np.round(utils.approximate_log10_microturbulence(initial_guess["logg"]), 3),
                    initial_o_mg_si_s_ca_ti=0.0,
                    initial_lgvsini=1.0,  # :eyes:
                    initial_c=0.0,
                    initial_n=0.0,
                )
                instance = create_task_instance(dag_id, task_id, run_id, parameters)
                pks.append(instance.pk)

                log.debug(f"Created {instance} with parameters {parameters}")

    if pks:
        return pks

    raise AirflowSkipException("No data model identifiers found for this time period.")
def estimate_radial_velocity(pks,
                             verbose=True,
                             mcmc=False,
                             figfile=None,
                             cornername=None,
                             retpmodels=False,
                             plot=False,
                             tweak=True,
                             usepeak=False,
                             maxvel=[-1000, 1000]):
    """
    Estimate radial velocities for the sources that are identified by the task
    instances of the given primary keys.

    :param pks:
        The primary keys of task instances to estimate radial velocities for,
        which includes parameters to identify the source SDSS data model
        product.

    See `doppler.rv.fit` for more information on other keyword arguments.

    :raises Exception:
        If Doppler failed on any task instance, the exception from the last
        failure is re-raised after all task instances have been attempted.
    """
    # NOTE(review): `maxvel` has a mutable default. It is only passed through
    # and never mutated here; the default is kept for interface compatibility.

    # TODO: Move this to astra/contrib
    import doppler

    log.info(f"Estimating radial velocities for {len(pks)} task instances")

    failures = []
    last_exception = None
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        log.debug(f"Running Doppler on {instance} from {path}")

        try:
            spectrum = doppler.read(path)
            summary, model_spectrum, modified_input_spectrum = doppler.rv.fit(
                spectrum,
                verbose=verbose,
                mcmc=mcmc,
                figfile=figfile,
                cornername=cornername,
                retpmodels=retpmodels,
                plot=plot,
                tweak=tweak,
                usepeak=usepeak,
                maxvel=maxvel)

        except Exception as exception:
            log.exception(
                f"Exception occurred on Doppler on {path} with task instance {instance}"
            )
            failures.append(instance.pk)
            # The `as` name is unbound when the except block ends, so keep a
            # separate reference for re-raising after the loop.
            last_exception = exception
            continue

        else:
            # Write the output to the database.
            results = prepare_results(summary)
            create_task_output(instance, astradb.Doppler, **results)

    if failures:
        log.warning(
            f"There were {len(failures)} Doppler failures out of a total {len(pks)} executions."
        )
        log.warning(f"Failed primary keys include: {failures}")

        log.warning(f"Raising last exception to indicate failure in pipeline.")
        # A bare `raise` here would fail with "No active exception to
        # re-raise" because we are outside the except block.
        raise last_exception
def group_results_by_instance(param, param_err, output_meta, spectrum_meta,
                              Ns):
    """
    Split the flat FERRE outputs back into one entry per task instance.

    FERRE results arrive as arrays with one row per spectrum. Each task
    instance contributed `Ns[i]` consecutive rows, so the rows are carved up
    in order and packaged per instance.

    :param param:
        The array of output parameters from FERRE.

    :param param_err:
        The estimated errors on the output parameters from FERRE.

    :param output_meta:
        A metadata dictionary output by FERRE.

    :param spectrum_meta:
        A list of dictionaries of spectrum metadata for each instance.

    :param Ns:
        A list of integers giving the number of spectra loaded with each
        instance (e.g., `sum(Ns)` should equal `param.shape[0]`).

    :returns:
        A list with one `(result, data)` tuple per instance, where both are
        dictionaries. Instances with no spectra get `(None, None)`.
    """
    grouped = []

    # Looked up once, up front: a missing key fails before any work is done.
    frozen = output_meta["frozen_parameters"]
    names = tuple(
        utils.sanitise_parameter_name(raw_name)
        for raw_name in output_meta["parameter_names"])
    initial = None

    log.debug(f"Ns: {Ns}")

    array_keys = ("wavelength", "flux", "sigma", "normalized_model_flux",
                  "continuum")

    offset = 0
    for index, count in enumerate(Ns):
        if count == 0:
            # Nothing was loaded for this instance; keep a placeholder so the
            # output stays aligned with the input instances.
            grouped.append((None, None))
            continue

        stop = offset + count
        rows = slice(offset, stop)

        result = {
            "snr": spectrum_meta[index]["snr"],
            "log_snr_sq": output_meta["log_snr_sq"][rows],
            "log_chisq_fit": output_meta["log_chisq_fit"][rows],
            "bitmask_flag": output_meta["bitmask_flag"][rows],
        }

        data = {key: output_meta[key][rows] for key in array_keys}
        # Same for all results in this group, but we include it for convenience.
        # TODO: Consider sending back something else instead of mask array.
        data["mask"] = output_meta["mask"]

        for column, name in enumerate(names):
            result[f"{name}"] = param[rows][:, column]
            result[f"u_{name}"] = param_err[rows][:, column]
            initial = output_meta["initial_parameters"]
            result[f"initial_{name}"] = initial[rows][:, column]
            result[f"frozen_{name}"] = frozen[name]

        grouped.append((result, data))
        offset = stop

    return grouped
:param weight_decay: (optional) The weight decay to use during training (default: 0) :param learning_rate: (optional) The learning rate to use during training (default: 0.001). """ # Expand the training set path for the purposes of hashing. training_set_path = os.path.expanduser( os.path.expandvars(training_set_path)) param_dict = OrderedDict() for arg in signature(get_model_path).parameters.keys(): if arg != "kwargs": param_dict[arg] = locals()[arg] log.debug(f"Hashing {param_dict} for The Payne model path") param_hash = hashify(param_dict) basename = f"thepayne_model_{param_hash}.pkl" path = os.path.join(get_base_output_path(), "thepayne", basename) os.makedirs(os.path.dirname(path), exist_ok=True) return path def train_model(training_set_path, output_model_path=None, num_epochs=100_000, num_neurons=300, weight_decay=0.0, learning_rate=0.001, **kwargs):
def estimate_stellar_labels(pks,
                            default_num_uncertainty_draws=100,
                            default_large_error=1e10):
    """
    Estimate the stellar parameters for APOGEE ApStar observations,
    where task instances have been created with the given primary keys (`pks`).

    For each task instance the model named by its `model_path` parameter is
    loaded (once per distinct path) and evaluated on every spectrum in the
    observation. If uncertainty draws are requested, the flux is perturbed by
    random noise scaled by the flux error and the model is re-evaluated; the
    median and standard deviation of those draws are stored with the result.
    Results are written to the database as `astradb.ApogeeNet` task outputs.

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.

    :param default_num_uncertainty_draws: [optional]
        The number of random draws to make of the flux uncertainties, which will be
        propagated into the estimate of the stellar parameter uncertainties (default: 100).
        A task instance can override this with a `num_uncertainty_draws` parameter.

    :param default_large_error: [optional]
        An arbitrarily large error value to assign to bad pixels (default: 1e10).
        A task instance can override this with a `large_error` parameter.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running APOGEENet on device {device} with:")
    log.info(f"\tpks: {pks}")

    log.debug(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")

    log.debug(f"Using torch version {torch.__version__} in {torch.__path__}")

    # Cache of loaded models, keyed by path, so each is only loaded once.
    models = {}

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]

        # Load the model.
        try:
            model = models[model_path]
        except KeyError:
            models[model_path] = model = Model(model_path, device)
            # Log only after the load has succeeded.
            log.info(f"Loaded model from {model_path}")

        N, P = spectrum.flux.shape

        # Build metadata array.
        metadata_keys, metadata, metadata_norm = get_metadata(spectrum)

        flux = np.nan_to_num(spectrum.flux.value).astype(np.float32).reshape(
            (N, 1, P))
        meta = np.tile(metadata_norm, N).reshape((N, -1))

        flux = torch.from_numpy(flux).to(device)
        meta = torch.from_numpy(meta).to(device)

        with torch.set_grad_enabled(False):
            predictions = model.predict_spectra(flux, meta)

        # Always move to host memory and convert to NumPy. The previous guard
        # (`device != "cpu"`) compared a `torch.device` with a `str`, which is
        # always True, and the NumPy code below needs an array on any device.
        predictions = predictions.cpu().data.numpy()

        # Replace infinites with non-finite.
        predictions[~np.isfinite(predictions)] = np.nan

        # Create results array.
        # The model output columns are (log_g, log10(teff), fe_h).
        log_g, log_teff, fe_h = predictions.T
        teff = 10**log_teff
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            logg=log_g.tolist(),
            fe_h=fe_h.tolist(),
        )

        num_uncertainty_draws = int(
            instance.parameters.get("num_uncertainty_draws",
                                    default_num_uncertainty_draws))

        if num_uncertainty_draws > 0:
            large_error = float(
                instance.parameters.get("large_error", default_large_error))

            # NOTE(review): `** -0.5` presumably converts an inverse variance
            # to a standard deviation -- confirm against the spectrum loader.
            flux_error = np.nan_to_num(
                spectrum.uncertainty.array**-0.5).astype(np.float32).reshape(
                    (N, 1, P))
            median_error = 5 * np.median(flux_error, axis=(1, 2))

            # Cap each spectrum's flagged or very large errors at five times
            # its median error, so bad pixels do not dominate the noise draws.
            for j, value in enumerate(median_error):
                bad_pixel = (flux_error[j] == large_error) | (flux_error[j] >=
                                                              value)
                flux_error[j][bad_pixel] = value

            flux_error = torch.from_numpy(flux_error).to(device)

            # One noisy realisation of every spectrum per draw.
            inputs = torch.randn((num_uncertainty_draws, N, 1, P),
                                 device=device) * flux_error + flux
            inputs = inputs.reshape((num_uncertainty_draws * N, 1, P))
            meta_error = meta.repeat(num_uncertainty_draws, 1)

            with torch.set_grad_enabled(False):
                draws = model.predict_spectra(inputs, meta_error)

            # Unconditional for the same reason as `predictions` above.
            draws = draws.cpu().data.numpy()

            draws = draws.reshape((num_uncertainty_draws, N, -1))

            # Need to put the log(teffs) to teffs before calculating std_dev
            draws[:, :, 1] = 10**draws[:, :, 1]

            median_draw_predictions = np.nanmedian(draws, axis=0)
            std_draw_predictions = np.nanstd(draws, axis=0)

            log_g_median, teff_median, fe_h_median = median_draw_predictions.T
            log_g_std, teff_std, fe_h_std = std_draw_predictions.T

            result.update(_teff_median=teff_median.tolist(),
                          _logg_median=log_g_median.tolist(),
                          _fe_h_median=fe_h_median.tolist(),
                          u_teff=teff_std.tolist(),
                          u_logg=log_g_std.tolist(),
                          u_fe_h=fe_h_std.tolist())

        else:
            median_draw_predictions, std_draw_predictions = (None, None)

        # Add the bitmask flag.
        bitmask_flag = create_bitmask(
            predictions,
            median_draw_predictions=median_draw_predictions,
            std_draw_predictions=std_draw_predictions)

        result.update(bitmask_flag=bitmask_flag.tolist())

        # Write the result to database.
        create_task_output(instance, astradb.ApogeeNet, **result)

    log.info(f"Completed processing of {total} primary keys")
def _select_training_set_data_from_database(label_columns,
                                            filter_args=None,
                                            filter_func=None,
                                            limit=None,
                                            **kwargs):
    """
    Query the database for training set labels and the data model identifiers
    of the spectra they belong to.

    :param label_columns:
        An iterable of database columns to use as labels. At least one must be
        a mapped column (with a `class_` attribute) so the parent table can be
        identified.

    :param filter_args: [optional]
        Filter expressions that are passed to the query's `filter()`.

    :param filter_func: [optional]
        A callable that is given every value of a returned row as positional
        arguments. Rows for which it returns a false value are skipped. By
        default all rows are kept.

    :param limit: [optional]
        The maximum number of rows to request from the database.

    :returns:
        A two-length tuple of `(labels, data_model_identifiers)`, where
        `labels` is a dictionary mapping each label name to a list of values,
        and `data_model_identifiers` is a list of keyword dictionaries, one
        per kept row.

    :raises ValueError:
        If no label column exposes a parent table.

    :raises NotImplementedError:
        If the parent table is not one this function knows how to handle.
    """
    label_columns = list(label_columns)
    label_names = [column.key for column in label_columns]
    L = len(label_names)

    if filter_func is None:
        filter_func = lambda *_, **__: True

    # Get the label names.
    log.info(f"Querying for label names {label_names} from {label_columns}")

    # Figure out what other columns we will need to identify the input file.
    for column in label_columns:
        try:
            primary_parent = column.class_
        except AttributeError:
            continue
        else:
            break
    else:
        raise ValueError(
            "Can't get primary parent. are you labelling every column?")

    log.debug(f"Identified primary parent table as {primary_parent}")

    if primary_parent == catalogdb.SDSSApogeeAllStarMergeR13:

        log.debug(
            f"Adding columns and setting data_model_func for {primary_parent}")
        additional_columns = [
            catalogdb.SDSSDR16ApogeeStar.apstar_version.label("apstar"),
            catalogdb.SDSSDR16ApogeeStar.field,
            catalogdb.SDSSDR16ApogeeStar.apogee_id.label("obj"),
            catalogdb.SDSSDR16ApogeeStar.file,
            catalogdb.SDSSDR16ApogeeStar.telescope,

            # Things that we might want for filtering on.
            catalogdb.SDSSDR16ApogeeStar.snr
        ]

        columns = label_columns + additional_columns

        q = session.query(*columns).join(
            catalogdb.SDSSApogeeAllStarMergeR13,
            func.trim(catalogdb.SDSSApogeeAllStarMergeR13.apstar_ids) ==
            catalogdb.SDSSDR16ApogeeStar.apstar_id)

        def data_model_func(apstar, field, obj, filename, telescope, *_):
            # The prefix and reduction version are taken from the file name.
            # Any trailing columns (e.g. those only used for filtering) are
            # ignored.
            return {
                "release": "DR16",
                "filetype": "apStar",
                "apstar": apstar,
                "field": field,
                "obj": obj,
                "prefix": filename[:2],
                "telescope": telescope,
                "apred": filename.split("-")[1]
            }

    else:
        raise NotImplementedError(
            "Cannot intelligently figure out what data model keywords will be necessary."
        )

    if filter_args is not None:
        q = q.filter(*filter_args)

    if limit is not None:
        q = q.limit(limit)

    log.debug(f"Querying {q}")

    data_model_identifiers = []
    labels = {label_name: [] for label_name in label_names}
    for i, row in enumerate(tqdm(q.yield_per(1), total=q.count())):
        if not filter_func(*row):
            continue

        for label_name, value in zip(label_names, row[:L]):
            # Test for None first: `np.isfinite(None)` raises a TypeError, so
            # the original ordering crashed on NULL labels instead of warning.
            if value is None or not np.isfinite(value):
                log.warning(
                    f"Label {label_name} in {i} row is not finite: {value}!")
            # The value is kept even when it is flagged, so that labels stay
            # aligned with the data model identifiers.
            labels[label_name].append(value)

        data_model_identifiers.append(data_model_func(*row[L:]))

    return (labels, data_model_identifiers)
def _parse_names_and_initial_and_frozen_parameters(
        names,
        initial_parameters,
        frozen_parameters,
        headers,
        flux,
        clip_initial_parameters_to_boundary_edges=True,
        clip_epsilon_percent=1,
        **kwargs):
    """
    Parse spectrum names, initial parameters, and frozen parameters into the
    form needed to set up a run on a grid.

    :param names:
        A name for each spectrum, or `None` to use the zero-based spectrum
        index (as a string) as the name.

    :param initial_parameters:
        A dictionary mapping parameter names to initial values, or `None`.
        Parameters that are not given, and values that are not finite,
        default to the grid mid-point.

    :param frozen_parameters:
        Either a list-like of parameter names to freeze, or a dictionary
        mapping parameter names to `True` (freeze at the initial value),
        `False` (ignored; do not freeze), or a value to freeze the parameter
        at. A false-like value (e.g., `None`) means nothing is frozen.

    :param headers:
        The grid headers. The keys "LABEL" and "N_OF_DIM" are read here, and
        the headers are passed to helpers to get the grid mid-point and limits.

    :param flux:
        The flux array, used only to count the number of spectra (`len(flux)`).

    :param clip_initial_parameters_to_boundary_edges: [optional]
        If initial parameters fail the grid boundary check, clip them to be
        inside the boundaries instead of raising an exception (default: True).

    :param clip_epsilon_percent: [optional]
        How far inside the grid boundary to clip to, as a percentage of the
        grid extent in each dimension (default: 1).

    :returns:
        A four-length tuple of `(kwds, names, parsed_initial_parameters,
        parsed_frozen_parameters)`: a dictionary of keywords (`ndim`, `nov`,
        `indv`, `init`, `indini`), the spectrum names, an array of initial
        parameters with one row per spectrum, and a dictionary mapping every
        sanitised parameter name to whether it is frozen.

    :raises TypeError:
        If `frozen_parameters` is neither list-like nor dict-like.

    :raises ValueError:
        If an unknown parameter is frozen, if all parameters are frozen, if
        the number of names does not match the number of spectra, or if the
        boundary check fails and clipping is disabled.
    """
    # NOTE(review): `**kwargs` is accepted but never read in this function.

    # Read the labels from the first header path
    parameter_names = headers["LABEL"]

    # Need the number of spectra, which we will take from the flux array.
    N = len(flux)
    # Start every spectrum at the grid mid-point; given values over-write this.
    mid_point = _grid_mid_point(headers)
    parsed_initial_parameters = np.tile(mid_point, N).reshape((N, -1))

    log.debug(f"parsed initial parameters before {parsed_initial_parameters}")

    # All name matching below is done on sanitised names.
    compare_parameter_names = list(
        map(sanitise_parameter_name, parameter_names))

    log.debug(f"Initial parameters passed for parsing {initial_parameters}")

    if initial_parameters is not None:
        log.debug(f"Comparison names {compare_parameter_names}")
        for i, (parameter_name,
                values) in enumerate(initial_parameters.items()):
            spn = sanitise_parameter_name(parameter_name)
            log.debug(f"{parameter_name} {values} {spn}")

            try:
                index = compare_parameter_names.index(spn)
            except ValueError:
                # Not a dimension of this grid: warn and skip it.
                log.warning(
                    f"Ignoring initial parameters for {parameter_name} as they are not in {parameter_names}"
                )
                log.debug(
                    f"Nothing matched for {spn} {parameter_name} {compare_parameter_names}"
                )
            else:
                log.debug(f"Matched to index {index}")
                # Replace non-finite values with the mid point.
                # NOTE(review): np.isfinite presumably raises TypeError if
                # `values` contains None -- confirm how callers encode
                # "missing" values.
                finite = np.isfinite(values)
                if not np.all(finite):
                    log.warning(
                        f"Missing or non-finite initial values given for {parameter_name}. Defaulting to the grid mid-point."
                    )
                    values = np.array(values)
                    values[~finite] = mid_point[index]

                log.debug(f"values are {values} {type(values[0])} {finite}")
                parsed_initial_parameters[:, index] = values

    log.debug(f"parsed initial parameters after {parsed_initial_parameters}")

    kwds = dict()
    frozen_parameters = (frozen_parameters or dict())
    if frozen_parameters:
        # Ensure we have a dict-like thing.
        if isinstance(frozen_parameters, (list, tuple, np.ndarray)):
            frozen_parameters = {
                sanitise_parameter_name(k): True
                for k in frozen_parameters
            }
        elif isinstance(frozen_parameters, dict):
            # Exclude things that have boolean False.
            frozen_parameters = {
                sanitise_parameter_name(k): v for k, v in frozen_parameters.items() \
                if not (isinstance(v, bool) and not v)
            }
        else:
            raise TypeError(
                f"frozen_parameters must be list-like or dict-like")

        unknown_parameters = set(frozen_parameters).difference(
            compare_parameter_names)
        if unknown_parameters:
            raise ValueError(
                f"unknown parameter(s): {unknown_parameters} (available: {parameter_names})"
            )

        # One-based positions of the parameters that are free (not frozen).
        indices = [
            i for i, pn in enumerate(compare_parameter_names, start=1)
            if pn not in frozen_parameters
        ]

        if len(indices) == 0:
            raise ValueError(f"all parameters frozen?!")

        # Over-ride initial values with the frozen ones if given.
        for parameter_name, value in frozen_parameters.items():
            # A boolean True means "freeze at whatever the initial value is".
            if not isinstance(value, bool):
                log.debug(
                    f"Over-writing initial values for {parameter_name} with frozen value of {value}"
                )
                zero_index = compare_parameter_names.index(parameter_name)
                parsed_initial_parameters[:, zero_index] = value
    else:
        # No frozen parameters.
        indices = 1 + np.arange(len(parameter_names), dtype=int)

    # Build a frozen parameters dict for result metadata.
    parsed_frozen_parameters = {
        pn: (pn in frozen_parameters)
        for pn in compare_parameter_names
    }

    # L is the number of free parameters.
    L = len(indices)
    kwds.update(
        ndim=headers["N_OF_DIM"],
        nov=L,
        indv=" ".join([f"{i:.0f}" for i in indices]),
        # We will always provide an initial guess, even if it is the grid mid point.
        init=0,
        indini=" ".join(["1"] * L))

    # Now deal with names.
    if names is None:
        names = [f"{i:.0f}" for i in range(len(parsed_initial_parameters))]
    else:
        if len(names) != len(parsed_initial_parameters):
            raise ValueError(
                f"names and initial parameters does not match ({len(names)} != {len(parsed_initial_parameters)})"
            )

    # Let's check the initial values are all within the grid boundaries.
    lower_limit, upper_limit = _get_grid_limits(headers)
    try:
        _check_initial_parameters_within_grid_limits(parsed_initial_parameters,
                                                     lower_limit, upper_limit,
                                                     parameter_names)
    except ValueError as e:
        log.exception(
            f"Exception when checking initial parameters within grid boundaries:"
        )
        log.critical(e, exc_info=True)

        if clip_initial_parameters_to_boundary_edges:
            log.info(
                f"Clipping initial parameters to boundary edges (use clip_initial_parameters_to_boundary_edges=False to raise exception instead)"
            )

            # Clip to slightly inside the boundary, by a fraction of the grid
            # extent in each dimension, and round to three decimal places.
            clip = clip_epsilon_percent * (upper_limit - lower_limit) / 100.
            parsed_initial_parameters = np.round(
                np.clip(parsed_initial_parameters, lower_limit + clip,
                        upper_limit - clip), 3)
        else:
            raise

    return (kwds, names, parsed_initial_parameters, parsed_frozen_parameters)
def execute_by_slurm(self,
                     context,
                     bash_command,
                     directory=None,
                     poke_interval=60):
    """
    Submit a bash command as a Slurm job, and wait until the job is complete.

    :param context:
        The execution context. The `dag`, `task`, and `execution_date` entries
        are used to build the label of the Slurm job.

    :param bash_command:
        The command to execute in the Slurm job.

    :param directory: [optional]
        The directory to create the Slurm job in. If not given, the job
        directory reported by the Slurm queue is used.

    :param poke_interval: [optional]
        The number of seconds to sleep between checks on the job (default: 60).

    :raises RuntimeError:
        If the last line of the job's stdout or stderr contains "Error".
    """
    suffix = str(uuid.uuid4())[:8]
    dag_id = context["dag"].dag_id
    task_id = context["task"].task_id
    # run_id is None if triggered by command line, so the date is used instead.
    date = context["execution_date"].strftime('%Y-%m-%d')

    label = ".".join([dag_id, task_id, date, suffix])
    if len(label) > 64:
        shortened = label[:64]
        log.warning(
            f"Truncating Slurm label ({label}) to 64 characters: {shortened}"
        )
        label = shortened

    self._slurm_label = label

    # It's bad practice to import here, but the slurm package is
    # not easily installable outside of Utah, and is not a "must-have"
    # requirement.
    from slurm import queue

    # TODO: HACK to be able to use local astra installation while in development
    if bash_command.startswith("astra "):
        arguments = bash_command[6:]
        bash_command = f"/uufs/chpc.utah.edu/common/home/u6020307/.local/bin/astra {arguments}"

    slurm_kwargs = self.slurm_kwargs or dict()

    log.info(
        f"Submitting Slurm job {label} with command:\n\t{bash_command}\nAnd Slurm keyword arguments: {slurm_kwargs}"
    )

    q = queue(verbose=True)
    q.create(label=label, dir=directory, **slurm_kwargs)
    q.append(bash_command)
    try:
        q.commit(hard=True, submit=True)
    except CalledProcessError as e:
        log.exception(
            f"Exception occurred when committing Slurm job with output:\n{e.output}"
        )
        raise

    log.info(
        f"Slurm job submitted with {q.key} and keywords {slurm_kwargs}")

    job_directory = directory or q.job_dir
    log.info(f"\tJob directory: {job_directory}")

    stdout_path = os.path.join(job_directory, f"{label}_01.o")
    stderr_path = os.path.join(job_directory, f"{label}_01.e")

    # Now we wait until the Slurm job is complete.
    t_submitted = time()
    t_started = None
    while q.get_percent_complete() < 100:
        sleep(poke_interval)

        elapsed = time() - t_submitted

        # The job counts as started once either output file exists.
        has_output = os.path.exists(stderr_path) or os.path.exists(
            stdout_path)
        if not has_output:
            log.info(
                f"Waiting on job {q.key} to start (elapsed: {elapsed / 60:.0f} min)"
            )
            continue

        # Check if this is the first time it has started.
        if t_started is None:
            t_started = time()
            log.debug(
                f"Recording job {q.key} as starting at {t_started} (took {elapsed / 60:.0f} min to start)"
            )

        log.info(
            f"Waiting on job {q.key} to finish (elapsed: {elapsed / 60:.0f} min)"
        )
        # Open last line of stdout path?

        # If this has been going much longer than the walltime, then something went wrong.
        # TODO: Check on the status of the job from Slurm.

    log.info(
        f"Job {q.key} in {q.job_dir} is complete after {(time() - t_submitted)/60:.0f} minutes."
    )

    with open(stderr_path, "r", newline="\n") as fp:
        stderr = fp.read()
    log.info(f"Contents of {stderr_path}:\n{stderr}")

    with open(stdout_path, "r", newline="\n") as fp:
        stdout = fp.read()
    log.info(f"Contents of {stdout_path}:\n{stdout}")

    # TODO: Better parsing for critical errors.
    final_lines = (text.rstrip().split("\n")[-1] for text in (stdout, stderr))
    if any("Error" in line for line in final_lines):
        raise RuntimeError("detected exception at task end-point")

    # TODO: Get exit codes from squeue
    return None